This notebook is an exercise in data cleaning using Python and pandas.
I’ll work with a synthetic dataset (customers.csv) that contains messy, inconsistent data.
My goal is to identify and correct null values, adjust data types where needed, standardize column names and text fields, remove duplicates, and validate the final dataset to ensure it’s ready for analysis.

In [83]:
#import libraries
import pandas as pd
import numpy as np

#bring in file
customers=pd.read_csv('customers.csv')

#print first few rows
print(customers.head())


   customer_id         name              email   age country     signup_date  \
0            1   John Smith  user1@example.com   NaN  canada      10-03-2020   
1            2  Emily Clark  user2@example.com   NaN      uk      2021-01-05   
2            3   Zoe Brooks  user3@example.com  35.0      US      2021-01-05   
3            4     Alex Kim  user4@example.com  30.0  canada      03-15-2021   
4            5     Jane Doe                NaN  35.0     CAN  March 14, 2020   

           spend  
0         $99.99  
1            NaN  
2         $99.99  
3            200  
4  Three hundred  


In [84]:
#standardize column names and remove white space
customers.columns = customers.columns.str.lower().str.strip()

#see current datatypes
print(customers.dtypes)

customer_id      int64
name            object
email           object
age            float64
country         object
signup_date     object
spend           object
dtype: object


In [85]:
#convert signup date to datetime
customers['signup_date'] = pd.to_datetime(customers['signup_date'], errors='coerce')

#convert string value to number string, convert to float
customers['spend'] = customers['spend'].replace('Three hundred', '300')
customers['spend'] = (
    customers['spend']
    .replace('[$,]', '', regex=True)   # remove dollar signs/commas
    .astype(float)
)

customers.dtypes

customer_id             int64
name                   object
email                  object
age                   float64
country                object
signup_date    datetime64[ns]
spend                 float64
dtype: object

In [86]:
#handle missing values
customers['email'] = customers['email'].fillna('unknown')
customers['age'] = customers['age'].fillna(customers['age'].median())
customers['spend'] = customers['spend'].fillna(0)
customers['signup_date'] = customers['signup_date'].fillna(pd.to_datetime('2020-01-01'))

In [87]:
#standardize text data
customers['country'] = customers['country'].str.upper().replace({'USA': 'US', 'CANADA': 'CA', 'UNITED KINGDOM': 'UK'})
customers['email'] = customers['email'].str.lower()

customers['country'].unique()

customers.head()

Unnamed: 0,customer_id,name,email,age,country,signup_date,spend
0,1,John Smith,user1@example.com,30.0,CA,2020-10-03,99.99
1,2,Emily Clark,user2@example.com,30.0,UK,2020-01-01,0.0
2,3,Zoe Brooks,user3@example.com,35.0,US,2020-01-01,99.99
3,4,Alex Kim,user4@example.com,30.0,CA,2021-03-15,200.0
4,5,Jane Doe,unknown,35.0,CAN,2020-01-01,300.0


In [88]:
#drop dupes
customers = customers.drop_duplicates()

In [89]:
#final look at data
customers.describe()

Unnamed: 0,customer_id,age,signup_date,spend
count,24.0,24.0,24,24.0
mean,12.5,31.875,2020-06-16 06:00:00,335.415417
min,1.0,25.0,2020-01-01 00:00:00,0.0
25%,6.75,30.0,2020-01-01 00:00:00,174.9975
50%,12.5,30.0,2020-01-01 00:00:00,300.0
75%,18.25,35.0,2020-11-12 18:00:00,400.0
max,24.0,40.0,2021-03-15 00:00:00,1000.0
std,7.071068,4.618606,,293.584293


In [90]:
print(customers)

    customer_id           name               email   age country signup_date  \
0             1     John Smith   user1@example.com  30.0      CA  2020-10-03   
1             2    Emily Clark   user2@example.com  30.0      UK  2020-01-01   
2             3     Zoe Brooks   user3@example.com  35.0      US  2020-01-01   
3             4       Alex Kim   user4@example.com  30.0      CA  2021-03-15   
4             5       Jane Doe             unknown  35.0     CAN  2020-01-01   
5             6     Ryan Scott   user6@example.com  25.0      US  2020-10-03   
6             7    Chris Evans   user7@example.com  30.0      UK  2020-01-01   
7             8       Mary Lee   user8@example.com  30.0      UK  2021-03-15   
8             9    Emily Clark   user9@example.com  30.0      CA  2020-01-01   
9            10   Daniel Adams             unknown  30.0     CAN  2021-03-15   
10           11     John Smith  user11@example.com  35.0      CA  2020-01-01   
11           12    Laura Green  user12@e