Data Cleaning

In [2]:
import pandas as pd
import numpy as np 

In [3]:
#create a random data set with name, age, sex, tenure, job, province
# Set the number of records
num_records = 1000

# Generate random data for each variable
names = [f"Person {i}" for i in range(1, num_records + 1)]
ages = np.random.randint(18, 65, size=num_records)
sexes = np.random.choice(["Male", "Female"], size=num_records)
tenures = np.random.randint(0, 20, size=num_records)  # Tenure in years
jobs = np.random.choice(["Engineer", "Teacher", "Doctor", "Artist", "Salesperson"], size=num_records)
provinces = np.random.choice(["Ontario", "Quebec", "British Columbia", "Alberta", "Manitoba"], size=num_records)

# Create a DataFrame calle rd
rd = pd.DataFrame({
    "Name": names,
    "Age": ages,
    "Sex": sexes,
    "Tenure": tenures,
    "Job": jobs,
    "Province": provinces
})

# Introduce missing data randomly
for col in rd.columns:
    # Randomly select indices to set as missing
    missing_indices = np.random.choice(rd.index, size=int(num_records * 0.1), replace=False)
    rd.loc[missing_indices, col] = np.nan  # Set selected values to NaN

# Display the first 5 rows of the DataFrame
print(rd.head().to_markdown(index=False, numalign="left", stralign="left"))

| Name     | Age   | Sex    | Tenure   | Job         | Province         |
|:---------|:------|:-------|:---------|:------------|:-----------------|
| nan      | 37    | Male   | 1        | Teacher     | Manitoba         |
| Person 2 | 61    | Male   | 0        | Salesperson | Manitoba         |
| Person 3 | 40    | Female | 2        | Artist      | British Columbia |
| Person 4 | 31    | Male   | 14       | Doctor      | British Columbia |
| nan      | 43    | Female | nan      | Doctor      | British Columbia |


Identification of missing values

In [4]:
#Identify missing values by column
print(rd.isnull().sum())

Name        100
Age         100
Sex         100
Tenure      100
Job         100
Province    100
dtype: int64


In [5]:
#Indentify missing values by column a percentage
print(rd.isnull().sum() / len(rd) * 100)

Name        10.0
Age         10.0
Sex         10.0
Tenure      10.0
Job         10.0
Province    10.0
dtype: float64


Dropping missing values

In [9]:
#Drop rows with missing data
#first copy the df into a new df
rd2 = rd.copy()
rd2_dropped_rows = rd2.dropna()

In [None]:
#Missing Values are now null
print(rd2_dropped_rows.isnull().sum())

Name        0
Age         0
Sex         0
Tenure      0
Job         0
Province    0
dtype: int64


In [13]:
#speciy how many columns must be null before dropping
#example 3 columns must be null
rd_dropped = rd2.dropna(thresh= 3)

#none are dropped
print(rd_dropped.isnull().sum())

Name        100
Age         100
Sex         100
Tenure      100
Job         100
Province    100
dtype: int64


In [None]:
#Dropp Null Columns 
dropped_cols = rd2.dropna(axis=1)
dropped_cols

0
1
2
3
4
...
995
996
997
998
999


Imputation (Filing Missin Values)

In [None]:
#Fill using the mean of the column
#Can also use median and mode

#determine average age
average_age = rd2['Age'].mean()

#repalce missing value with mean value
#specify inplace to update the original df
rd2['Age'].fillna(average_age,inplace = True)

#age is no longer null
print(rd2.isnull().sum())


Name        100
Age           0
Sex         100
Tenure      100
Job         100
Province    100
dtype: int64


In [17]:
#Fill using a specified value

#replace null job with unknown
rd2['Job'].fillna('Unknown',inplace=True)

#Job is no longer null
print(rd2.isnull().sum())
rd2

Name        100
Age           0
Sex         100
Tenure      100
Job           0
Province    100
dtype: int64


Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,37.00,Male,1.0,Teacher,Manitoba
1,Person 2,61.00,Male,0.0,Salesperson,Manitoba
2,Person 3,40.00,Female,2.0,Artist,British Columbia
3,Person 4,31.00,Male,14.0,Doctor,British Columbia
4,,43.00,Female,,Doctor,British Columbia
...,...,...,...,...,...,...
995,Person 996,58.00,Female,3.0,Doctor,
996,,53.00,Male,17.0,Doctor,Manitoba
997,Person 998,41.47,Male,1.0,Teacher,Manitoba
998,Person 999,58.00,Female,12.0,Unknown,Quebec


In [20]:
#Forward Fill

#Fill tenure using the value from the next row 
rd2['Tenure'].ffill(inplace=True)

#tenure is no longer null
print(rd2.isnull().sum())
rd2

Name        100
Age           0
Sex         100
Tenure        0
Job           0
Province    100
dtype: int64


Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,37.00,Male,1.0,Teacher,Manitoba
1,Person 2,61.00,Male,0.0,Salesperson,Manitoba
2,Person 3,40.00,Female,2.0,Artist,British Columbia
3,Person 4,31.00,Male,14.0,Doctor,British Columbia
4,,43.00,Female,14.0,Doctor,British Columbia
...,...,...,...,...,...,...
995,Person 996,58.00,Female,3.0,Doctor,
996,,53.00,Male,17.0,Doctor,Manitoba
997,Person 998,41.47,Male,1.0,Teacher,Manitoba
998,Person 999,58.00,Female,12.0,Unknown,Quebec


In [21]:
#Backward fill

#Fill sex using the value from the previous row 
rd2['Sex'].bfill(inplace=True)

#sex is no longer null
print(rd2.isnull().sum())
rd2


Name        100
Age           0
Sex           0
Tenure        0
Job           0
Province    100
dtype: int64


Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,37.00,Male,1.0,Teacher,Manitoba
1,Person 2,61.00,Male,0.0,Salesperson,Manitoba
2,Person 3,40.00,Female,2.0,Artist,British Columbia
3,Person 4,31.00,Male,14.0,Doctor,British Columbia
4,,43.00,Female,14.0,Doctor,British Columbia
...,...,...,...,...,...,...
995,Person 996,58.00,Female,3.0,Doctor,
996,,53.00,Male,17.0,Doctor,Manitoba
997,Person 998,41.47,Male,1.0,Teacher,Manitoba
998,Person 999,58.00,Female,12.0,Unknown,Quebec


Handling Duplicates

In [28]:
#create a new df with dupes
rd3 = rd.copy()

rd4 = pd.concat([rd,rd3],ignore_index=True)
rd4

Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,37.0,Male,1.0,Teacher,Manitoba
1,Person 2,61.0,Male,0.0,Salesperson,Manitoba
2,Person 3,40.0,Female,2.0,Artist,British Columbia
3,Person 4,31.0,Male,14.0,Doctor,British Columbia
4,,43.0,Female,,Doctor,British Columbia
...,...,...,...,...,...,...
1995,Person 996,58.0,Female,3.0,Doctor,
1996,,53.0,Male,17.0,Doctor,Manitoba
1997,Person 998,,Male,1.0,Teacher,Manitoba
1998,Person 999,58.0,Female,12.0,,Quebec


In [None]:
#Identify Dupes
#Count duplicate rows
#A dupes is identified if all columns are the same
print(rd4.duplicated().sum())  

1000


In [None]:
#Show the dupe rows
print(rd4[rd4.duplicated()])

             Name   Age     Sex  Tenure          Job          Province
1000          NaN  37.0    Male     1.0      Teacher          Manitoba
1001     Person 2  61.0    Male     0.0  Salesperson          Manitoba
1002     Person 3  40.0  Female     2.0       Artist  British Columbia
1003     Person 4  31.0    Male    14.0       Doctor  British Columbia
1004          NaN  43.0  Female     NaN       Doctor  British Columbia
...           ...   ...     ...     ...          ...               ...
1995   Person 996  58.0  Female     3.0       Doctor               NaN
1996          NaN  53.0    Male    17.0       Doctor          Manitoba
1997   Person 998   NaN    Male     1.0      Teacher          Manitoba
1998   Person 999  58.0  Female    12.0          NaN            Quebec
1999  Person 1000  64.0  Female     3.0       Doctor            Quebec

[1000 rows x 6 columns]


In [32]:
#remove duplicates
rd4_no_dupes = rd4.drop_duplicates()

print(rd4_no_dupes.duplicated().sum()) 

0


Data Type Conversion

In [35]:
#Check data types
print(rd2.dtypes)

Name         object
Age         float64
Sex          object
Tenure      float64
Job          object
Province     object
dtype: object


In [37]:
#Convert Age to Int
#will throw errors when there are nulls, so using rd2
rd2['Age'] = rd2['Age'].astype(int)

print(rd2.dtypes)


Name         object
Age           int64
Sex          object
Tenure      float64
Job          object
Province     object
dtype: object


In [None]:
#Convert Sex to Categorical
#Use when there are few categories in string data
#helps improve performance
rd2['Sex'] = rd2['Sex'].astype('category')
print(rd2.dtypes)
rd2

Name          object
Age            int64
Sex         category
Tenure       float64
Job           object
Province      object
dtype: object


Unnamed: 0,Name,Age,Sex,Tenure,Job,Province
0,,37,Male,1.0,Teacher,Manitoba
1,Person 2,61,Male,0.0,Salesperson,Manitoba
2,Person 3,40,Female,2.0,Artist,British Columbia
3,Person 4,31,Male,14.0,Doctor,British Columbia
4,,43,Female,14.0,Doctor,British Columbia
...,...,...,...,...,...,...
995,Person 996,58,Female,3.0,Doctor,
996,,53,Male,17.0,Doctor,Manitoba
997,Person 998,41,Male,1.0,Teacher,Manitoba
998,Person 999,58,Female,12.0,Unknown,Quebec
