In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Adult income dataset from a path relative to the notebook's
# working directory, then show it as the cell output.
data = pd.read_csv('../data/adult.csv')
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
# Count the rows per income class to see how balanced the target is.
data.income.value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [5]:
# Summarise the whole frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [6]:
# Count the distinct values in each column (a quick cardinality check).
data.nunique()

age                   74
workclass              9
fnlwgt             28523
education             16
educational-num       16
marital-status         7
occupation            15
relationship           6
race                   5
gender                 2
capital-gain         123
capital-loss          99
hours-per-week        96
native-country        42
income                 2
dtype: int64

In [7]:
# `education` and `educational-num` each have 16 distinct values and appear to
# carry the same information, so only the numeric column is kept.
data = data.drop('education', axis=1)

In [8]:
# Count the missing (NaN) entries in each column.
data.isna().sum()

age                0
workclass          0
fnlwgt             0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [21]:
# Show the current state of the DataFrame as the cell output.
data

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [23]:
# Some cells contain a literal '?' as a placeholder for a missing value.
# isnull() does not detect these, so convert them to real NaN first.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)


In [24]:
# Recount the missing entries per column now that '?' has been mapped to NaN.
data.isna().sum()

age                   0
workclass          2799
fnlwgt                0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [26]:
# Replace missing values with the most frequent value (mode) of each column.
cols = ['workclass', 'occupation', 'native-country']

for col in cols:
    # value_counts() skips NaN, so idxmax() returns the most common observed
    # category; no extra sort is needed.
    most_frequent = data[col].value_counts().idxmax()
    # Assign the filled column back instead of calling
    # data[col].fillna(..., inplace=True): that form operates on an
    # intermediate Series, triggers a chained-assignment FutureWarning in
    # pandas 2.x, and does not update `data` under Copy-on-Write.
    data[col] = data[col].fillna(most_frequent)
    print('All the missing values in column', col, 'are replaced with', most_frequent)

All the missing values in column workclass are replaced with Private
All the missing values in column occupation are replaced with Prof-specialty
All the missing values in column native-country are replaced with United-States


In [27]:
# Encode the two income classes as integers and keep lookup tables for both
# directions (class name -> code, code -> class name).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['income'] = le.fit_transform(data['income'])
to_label = {cls: code for cls, code in zip(le.classes_, le.transform(le.classes_))}
to_class = {code: cls for cls, code in to_label.items()}
print(to_label)

{'<=50K': 0, '>50K': 1}


In [28]:
# Ordinal-encode every remaining object-dtype column, replacing the strings
# with numeric codes directly in `data`.
from sklearn.preprocessing import OrdinalEncoder

categorical_cols = data.select_dtypes(include="object").columns

# Categories not seen during fit are encoded as -1 by later transform() calls.
enc = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

data[categorical_cols] = enc.fit_transform(data[categorical_cols])

In [29]:
# Print the frame summary (columns, non-null counts, dtypes, memory usage).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              48842 non-null  int64  
 1   workclass        48842 non-null  float64
 2   fnlwgt           48842 non-null  int64  
 3   educational-num  48842 non-null  int64  
 4   marital-status   48842 non-null  float64
 5   occupation       48842 non-null  float64
 6   relationship     48842 non-null  float64
 7   race             48842 non-null  float64
 8   gender           48842 non-null  float64
 9   capital-gain     48842 non-null  int64  
 10  capital-loss     48842 non-null  int64  
 11  hours-per-week   48842 non-null  int64  
 12  native-country   48842 non-null  float64
 13  income           48842 non-null  int64  
dtypes: float64(7), int64(7)
memory usage: 5.2 MB


In [31]:
# Separate the target column (income) from the feature columns.
y = data['income']
X = data.drop('income', axis=1)

In [33]:
# Split the data into 80% training and 20% validation rows.
from sklearn.model_selection import train_test_split


# train_size must be the float 0.8 to mean "80% of the rows". An integer
# (the previous value, 80) is interpreted as an absolute sample count, which
# left only 80 rows for training and pushed everything else into validation.
# random_state is fixed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=5)