In [2]:
# importing relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as px

In [3]:
# reading the raw demographics parquet file into a dataframe
rawdemodf = pd.read_parquet(path='../../data/raw/rawdemdata.parquet')

In [5]:
# previewing five randomly chosen rows to confirm the data loaded in the expected format
rawdemodf.sample(n=5)

Unnamed: 0_level_0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,employstatus
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
331183,B,3.0,52.0,Female,Non-Hispanic,Black or African American,English,Baptist,Married,Part Time
852,A,3.0,55.0,Female,Non-Hispanic,White or Caucasian,English,Protestant,Divorced,Full Time
416928,A,3.0,62.0,Female,Non-Hispanic,White or Caucasian,English,Protestant,Single,Disabled
324720,C,3.0,58.0,Male,Non-Hispanic,White or Caucasian,English,Catholic,Married,Self Employed
294383,B,3.0,57.0,Female,Non-Hispanic,Black or African American,English,Methodist,Married,Not Employed


Everything looks good, let's check the shape and do some cleaning!

In [56]:
# working on a deep copy so the raw dataframe stays untouched while we clean
cleandemodf = rawdemodf.copy(deep=True)

In [57]:
# reporting the dataframe dimensions (row count, then column count)
print(f'We have {len(cleandemodf)} rows and {len(cleandemodf.columns)} columns.')

We have 560486 rows and 10 columns.


In [58]:
# counting the missing entries in every column
cleandemodf.isnull().sum()

dep_name              0
esi                2457
age                  11
gender                0
ethnicity             0
race                 26
lang                  0
religion         135198
maritalstatus         0
employstatus          0
dtype: int64

In [59]:
# printing percentage of values that are null for each column containing null values
# every column uses the same formula: (number of nulls / number of rows) * 100.
# indexing `.isna().value_counts()` with `[1]` is deliberately avoided: it fails when a
# column has no nulls, and it looks up an integer key on a boolean-labelled Series,
# which only returns the null count when nulls happen to be the minority.
pctnullesi = cleandemodf['esi'].isna().sum()/cleandemodf.shape[0] * 100
pctnullage = cleandemodf['age'].isna().sum()/cleandemodf.shape[0] * 100
pctnullrace = cleandemodf['race'].isna().sum()/cleandemodf.shape[0] * 100
pctnullreligion = cleandemodf['religion'].isna().sum()/cleandemodf.shape[0] * 100
print(f'{round(pctnullesi, 3)} % of esi values are null \n {round(pctnullage, 3)} % of age values are null \n {round(pctnullrace, 3)} % of race values are null \n {round(pctnullreligion, 3)} % of religion values are null \n  ')

0.438 % of esi values are null 
 0.002 % of age values are null 
 0.005 % of race values are null 
 24.122 % of religion values are null 
  


The 'esi', 'age' and 'race' columns contain a very small percentage of null values, so we can drop the rows with null values in these columns. The 'religion' column will require more attention, as a large percentage of its values are null. 

In [60]:
# dropping rows that have a null 'esi', 'age' or 'race' value
# the result is reassigned rather than using inplace=True, which keeps the step
# chainable and follows the recommended pandas style
cleandemodf = cleandemodf.dropna(axis=0, subset=['esi', 'age', 'race'])

In [61]:
# re-counting the missing entries per column to confirm the rows were dropped
cleandemodf.isnull().sum()

dep_name              0
esi                   0
age                   0
gender                0
ethnicity             0
race                  0
lang                  0
religion         134468
maritalstatus         0
employstatus          0
dtype: int64

It has worked! Now let's look into the religion column. 

In [62]:
# listing how often each non-null 'religion' value occurs (nulls are excluded from the tally)
cleandemodf['religion'].value_counts(dropna=True)

religion
Catholic             188200
Christian             75026
Baptist               40378
Other                 28811
Pentecostal           25503
Protestant            23250
Jewish                12139
Muslim                 8378
Episcopal              7974
Methodist              4799
Unknown                4605
Jehovah's Witness      4461
Name: count, dtype: int64

Great! There is already an 'Unknown' option, so we will fill null religion values with 'Unknown'. 

In [63]:
# replacing missing 'religion' entries with the existing 'Unknown' category
cleandemodf['religion'] = cleandemodf['religion'].mask(cleandemodf['religion'].isna(), 'Unknown')

In [65]:
# checking the data type of the columns
# .info() prints the non-null count and dtype of every column in one summary
cleandemodf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 557992 entries, 1 to 560486
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   dep_name       557992 non-null  object 
 1   esi            557992 non-null  float64
 2   age            557992 non-null  float64
 3   gender         557992 non-null  object 
 4   ethnicity      557992 non-null  object 
 5   race           557992 non-null  object 
 6   lang           557992 non-null  object 
 7   religion       557992 non-null  object 
 8   maritalstatus  557992 non-null  object 
 9   employstatus   557992 non-null  object 
dtypes: float64(2), object(8)
memory usage: 46.8+ MB


These all look good, except that the 'esi' score should be stored as a string, since it is a categorical variable. Let's fix this.

In [70]:
# changing 'esi' data type from float to string, since it is a categorical label
# NOTE(review): casting float64 values with astype(str) presumably keeps the decimal
# part (e.g. 3.0 -> '3.0'); confirm downstream code expects that format
cleandemodf['esi'] = cleandemodf['esi'].astype(str)

In [71]:
# checking to see that it has worked: 'esi' should now be listed with dtype object
cleandemodf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 557992 entries, 1 to 560486
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   dep_name       557992 non-null  object 
 1   esi            557992 non-null  object 
 2   age            557992 non-null  float64
 3   gender         557992 non-null  object 
 4   ethnicity      557992 non-null  object 
 5   race           557992 non-null  object 
 6   lang           557992 non-null  object 
 7   religion       557992 non-null  object 
 8   maritalstatus  557992 non-null  object 
 9   employstatus   557992 non-null  object 
dtypes: float64(1), object(9)
memory usage: 46.8+ MB


This all looks good, let's save the clean data as a parquet file and we are ready to do some EDA.

In [72]:
# writing the cleaned dataframe out to the cleaned-data folder in parquet format
cleandemodf.to_parquet(path='../../data/cleaned/demodata.parquet')