# Dealing with Data **Inconsistancies**

In [16]:
import pandas as pd

In [17]:
data=pd.DataFrame(data={
    'date':['2025-01-29','29-01-2025','2025/01/29','29-01-2025'],
    'country':['USA','U.S.A','America','United States'],
    'name':['Jonh Doe','John Doe','Jane Doe','Jane Doe'],
    'sales_2021':[100,200,None,200],
    'sales_2022':[None,150,300,150]
})
data.head()

Unnamed: 0,date,country,name,sales_2021,sales_2022
0,2025-01-29,USA,Jonh Doe,100.0,
1,29-01-2025,U.S.A,John Doe,200.0,150.0
2,2025/01/29,America,Jane Doe,,300.0
3,29-01-2025,United States,Jane Doe,200.0,150.0


In [18]:
# Standardizing the date formate
data.date=pd.to_datetime(data.date,errors='coerce')
data.date=data.date.dt.strftime('%Y-%m-%d')
data.head()

Unnamed: 0,date,country,name,sales_2021,sales_2022
0,2025-01-29,USA,Jonh Doe,100.0,
1,,U.S.A,John Doe,200.0,150.0
2,,America,Jane Doe,,300.0
3,,United States,Jane Doe,200.0,150.0


In [19]:
data.date=data['date'][0]
data.head()

Unnamed: 0,date,country,name,sales_2021,sales_2022
0,2025-01-29,USA,Jonh Doe,100.0,
1,2025-01-29,U.S.A,John Doe,200.0,150.0
2,2025-01-29,America,Jane Doe,,300.0
3,2025-01-29,United States,Jane Doe,200.0,150.0


In [20]:
# Harmonize the name of the country
country_mapping={'USA':'United States','U.S.A':'United States','America':'United States'}
data.country=data.country.replace(country_mapping)
data.head()

Unnamed: 0,date,country,name,sales_2021,sales_2022
0,2025-01-29,United States,Jonh Doe,100.0,
1,2025-01-29,United States,John Doe,200.0,150.0
2,2025-01-29,United States,Jane Doe,,300.0
3,2025-01-29,United States,Jane Doe,200.0,150.0


In [21]:
# Correct the typographical mistakes in name
data.name=data.name.replace({'Jonh Doe':'John Doe'})
data.head()

Unnamed: 0,date,country,name,sales_2021,sales_2022
0,2025-01-29,United States,John Doe,100.0,
1,2025-01-29,United States,John Doe,200.0,150.0
2,2025-01-29,United States,Jane Doe,,300.0
3,2025-01-29,United States,Jane Doe,200.0,150.0


In [22]:
# Resolve Contradictory/Logical Data
# let assume sales_2022 should always higher then sales_2021
data=data.drop(data[data['sales_2022']<=data['sales_2021']].index)
data.head()

Unnamed: 0,date,country,name,sales_2021,sales_2022
0,2025-01-29,United States,John Doe,100.0,
2,2025-01-29,United States,Jane Doe,,300.0


In [15]:
# remove duplicates
data=data.drop_duplicates(subset='name')
data.head()

Unnamed: 0,date,country,name,sales_2021,sales_2022
0,2025-01-29,United States,John Doe,100.0,
2,2025-01-29,United States,Jane Doe,,300.0
