In [178]:
import pandas as pd 
import seaborn as sns

In [179]:
data = {
    'date': ['2020-01-01', '01-02-2023', '01/03/2023', '01/04/2024', '05-01-2024'],
    'country': ['US', 'USA', 'America', 'United States', 'U.S.'],
    'name': ['Alle', 'dae', 'clara', 'Dave', 'allen'],
    'sales_2020': [100, 200, 300, 200, None],
    'sales_2021': [None, 220, None, 440, 220]
}
# create dataframe
df = pd.DataFrame(data)

In [180]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2020-01-01,US,Alle,100.0,
1,01-02-2023,USA,dae,200.0,220.0
2,01/03/2023,America,clara,300.0,
3,01/04/2024,United States,Dave,200.0,440.0
4,05-01-2024,U.S.,allen,,220.0


In [181]:
# removing the inconsistent data form date column
df['date'] = df['date'].str.replace('2020-01-01', '01-01-2020' )
df['date'] = df['date'].str.replace('/', '-' )

In [182]:
# changing the date format
df['date'] = pd.to_datetime(df['date'] , errors='coerce' , dayfirst=True)
df['date'] = df['date'].dt.strftime('%d-%m-%Y')
df

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,01-01-2020,US,Alle,100.0,
1,01-02-2023,USA,dae,200.0,220.0
2,01-03-2023,America,clara,300.0,
3,01-04-2024,United States,Dave,200.0,440.0
4,05-01-2024,U.S.,allen,,220.0


In [183]:
# checking the data types
df.dtypes

date           object
country        object
name           object
sales_2020    float64
sales_2021    float64
dtype: object

In [184]:
# changing the date column data type to datetime
df['date'] = df['date'].astype('datetime64[ns]')

In [185]:
# changed lets check
print(df.dtypes)
df

date          datetime64[ns]
country               object
name                  object
sales_2020           float64
sales_2021           float64
dtype: object


Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2020-01-01,US,Alle,100.0,
1,2023-01-02,USA,dae,200.0,220.0
2,2023-01-03,America,clara,300.0,
3,2024-01-04,United States,Dave,200.0,440.0
4,2024-05-01,U.S.,allen,,220.0


In [186]:
# Harmonize the name of country column
country_mapping = {
    'US': 'United States',
    'USA': 'United States',
    'America': 'United States',
    'U.S.': 'United States'
}
df['country'] = df['country'].replace(country_mapping)
df


Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2020-01-01,United States,Alle,100.0,
1,2023-01-02,United States,dae,200.0,220.0
2,2023-01-03,United States,clara,300.0,
3,2024-01-04,United States,Dave,200.0,440.0
4,2024-05-01,United States,allen,,220.0


In [None]:
# Correct the typograpical error in name column
df['name'] = df['name'].replace({'dae': 'Dave', 'Alle': 'allen'})
df


Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2020-01-01,United States,allen,100.0,
1,2023-01-02,United States,Dave,200.0,220.0
2,2023-01-03,United States,clara,300.0,
3,2024-01-04,United States,Dave,200.0,440.0
4,2024-05-01,United States,allen,,220.0


In [200]:
# lets capitalize the names 
df['name'] = df['name'].str.capitalize()
df

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2020-01-01,United States,Allen,100.0,
1,2023-01-02,United States,Dave,200.0,220.0
2,2023-01-03,United States,Clara,300.0,
3,2024-01-04,United States,Dave,200.0,440.0
4,2024-05-01,United States,Allen,,220.0


In [202]:
# lets  remove duplicates
df.drop_duplicates(subset='name',inplace=True)
df

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2020-01-01,United States,Allen,100.0,
1,2023-01-02,United States,Dave,200.0,220.0
2,2023-01-03,United States,Clara,300.0,
