In [1]:
import pandas as pd
import numpy as np

import warnings
# Suppress every warning category globally. This also hides pandas
# FutureWarning / DeprecationWarning messages raised by later cells.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the learning set (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='learningSet.txt')
# Cell output: pandas' truncated preview of the loaded frame.
data

Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,RECINHSE,...,TARGET_D,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,CLUSTER2,GEOCODE2
0,8901,GRI,0,IL,61081,,,3712,0,,...,0.0,0,L,4,E,X,X,X,39.0,C
1,9401,BOA,1,CA,91326,,,5202,0,,...,0.0,0,L,2,G,X,X,X,1.0,A
2,9001,AMH,1,NC,27017,,,0,0,,...,0.0,1,L,4,E,X,X,X,60.0,C
3,8701,BRY,0,CA,95953,,,2801,0,,...,0.0,1,L,4,E,X,X,X,41.0,C
4,8601,,0,FL,33176,,,2001,0,X,...,0.0,1,L,2,F,X,X,X,26.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,ASE,1,AK,99504,,,0,0,,...,0.0,0,L,1,G,X,X,X,12.0,C
95408,9601,DCD,1,TX,77379,,,5001,0,,...,0.0,1,L,1,F,X,X,X,2.0,A
95409,9501,MBC,1,MI,48910,,,3801,0,,...,0.0,1,L,3,E,X,X,X,34.0,B
95410,8601,PRV,0,CA,91320,,,4005,0,X,...,18.0,1,L,4,F,X,X,X,11.0,A


In [3]:
# Sparsity
# Goal: one row per column of `data`, holding the column name and the share of
# rows in which that column is missing.
#
# The share is computed in a single pass: NaN count per column / number of rows.
# NOTE: despite its name, 'nulls_percentage' is a fraction in [0, 1]
# (0.25 means 25 %), so any threshold compared against it must use that scale.
# NOTE(review): isna() only counts real NaN values. GENDER later turns out to
# contain blank ' ' strings, so blanks encoded that way are not counted as
# missing here -- confirm whether that is intended.
nulls_percent_df = (
    (data.isna().sum() / len(data))
    .rename_axis('column_name')                 # former index = column names of `data`
    .reset_index(name='nulls_percentage')       # plain 0..n-1 index, two named columns
)

# Display only (the sorted frame is not stored): sparsest columns first.
nulls_percent_df.sort_values(by='nulls_percentage', ascending=False)



Unnamed: 0,column_name,nulls_percentage
414,RDATE_5,0.999906
436,RAMNT_5,0.999906
412,RDATE_3,0.997464
434,RAMNT_3,0.997464
413,RDATE_4,0.997055
...,...,...
168,ETHC3,0.000000
167,ETHC2,0.000000
166,ETHC1,0.000000
165,HHD12,0.000000


In [4]:
# Question to students: what are our options when values are missing?

# Option used here: discard the columns whose share of missing rows exceeds a cut-off.
threshold = 0.25  # same 0-1 scale as the 'nulls_percentage' column

# Boolean mask over the rows of the null-share table, then the matching rows.
condition = nulls_percent_df['nulls_percentage'].gt(threshold)
columns_above_threshold = nulls_percent_df.loc[condition]

# Names of the sparse columns as a plain list, followed by the two columns
# ('OSOURCE', 'ZIP') that the exercise asks us to delete as well.
drop_columns_list = columns_above_threshold['column_name'].tolist() + ['OSOURCE', 'ZIP']

In [5]:
# Show the frame again before any columns are dropped (compare with the next cell's output).
data

Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,RECINHSE,...,TARGET_D,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,CLUSTER2,GEOCODE2
0,8901,GRI,0,IL,61081,,,3712,0,,...,0.0,0,L,4,E,X,X,X,39.0,C
1,9401,BOA,1,CA,91326,,,5202,0,,...,0.0,0,L,2,G,X,X,X,1.0,A
2,9001,AMH,1,NC,27017,,,0,0,,...,0.0,1,L,4,E,X,X,X,60.0,C
3,8701,BRY,0,CA,95953,,,2801,0,,...,0.0,1,L,4,E,X,X,X,41.0,C
4,8601,,0,FL,33176,,,2001,0,X,...,0.0,1,L,2,F,X,X,X,26.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,ASE,1,AK,99504,,,0,0,,...,0.0,0,L,1,G,X,X,X,12.0,C
95408,9601,DCD,1,TX,77379,,,5001,0,,...,0.0,1,L,1,F,X,X,X,2.0,A
95409,9501,MBC,1,MI,48910,,,3801,0,,...,0.0,1,L,3,E,X,X,X,34.0,B
95410,8601,PRV,0,CA,91320,,,4005,0,X,...,18.0,1,L,4,F,X,X,X,11.0,A


In [6]:
# Remove the sparse columns (and the two exercise columns) collected in drop_columns_list.
# `data` is rebound to the reduced frame, so running this cell a second time
# raises KeyError because the labels are no longer present.
data = data.drop(labels=drop_columns_list, axis='columns')
data

Unnamed: 0,ODATEDW,TCODE,STATE,MAILCODE,PVASTATE,DOB,NOEXCH,RECINHSE,RECP3,RECPGVG,...,TARGET_D,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,CLUSTER2,GEOCODE2
0,8901,0,IL,,,3712,0,,,,...,0.0,0,L,4,E,X,X,X,39.0,C
1,9401,1,CA,,,5202,0,,,,...,0.0,0,L,2,G,X,X,X,1.0,A
2,9001,1,NC,,,0,0,,,,...,0.0,1,L,4,E,X,X,X,60.0,C
3,8701,0,CA,,,2801,0,,,,...,0.0,1,L,4,E,X,X,X,41.0,C
4,8601,0,FL,,,2001,0,X,X,,...,0.0,1,L,2,F,X,X,X,26.0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,9601,1,AK,,,0,0,,,,...,0.0,0,L,1,G,X,X,X,12.0,C
95408,9601,1,TX,,,5001,0,,,,...,0.0,1,L,1,F,X,X,X,2.0,A
95409,9501,1,MI,,,3801,0,,X,,...,0.0,1,L,3,E,X,X,X,34.0,B
95410,8601,0,CA,,,4005,0,X,,,...,18.0,1,L,4,F,X,X,X,11.0,A


In [7]:
# How often does each GENDER code occur? (printed as plain text, most frequent first)
print(data['GENDER'].value_counts())

# Impute missing (NaN) GENDER entries with 'F'.
# NOTE(review): the counts printed above add up to 95412, the number of rows,
# so in this run there was no NaN to fill; blank genders are stored as ' '.
data.loc[data['GENDER'].isna(), 'GENDER'] = 'F'

GENDER
F    51277
M    39094
      2957
U     1715
J      365
C        2
A        2
Name: count, dtype: int64


In [8]:
# List the distinct GENDER codes. Blank entries appear as the string ' ' rather than NaN.

data['GENDER'].unique()

array(['F', 'M', ' ', 'C', 'U', 'J', 'A'], dtype=object)

In [9]:
# Collapse every GENDER code other than 'F' and 'M' (blank ' ', 'C', 'U', 'J', 'A')
# into the single category 'Other'.
# The result is assigned back to the column instead of calling
# data['GENDER'].replace(..., inplace=True): that form mutates an intermediate
# Series, which recent pandas versions deprecate (the warning is hidden by the
# global warnings filter) and which leaves `data` unchanged under Copy-on-Write.
data['GENDER'] = data['GENDER'].replace([' ', 'C', 'U', 'J', 'A'], 'Other')


In [10]:
# Confirm that only 'F', 'M' and 'Other' remain after the replacement.

data['GENDER'].unique()


array(['F', 'M', 'Other'], dtype=object)

In [11]:
# Preview the cleaned GENDER column.
data['GENDER']

0        F
1        M
2        M
3        F
4        F
        ..
95407    M
95408    M
95409    M
95410    F
95411    F
Name: GENDER, Length: 95412, dtype: object