In [78]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pickle

In [20]:
df = pd.read_csv('/adult.csv')

In [21]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


## Missing data handling

In [22]:
# Replace '?' by NaN (from Numpy)
df = df.replace('?', np.NaN)

In [23]:
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

Most of the columns have 0 missing values (NaN). Three columns have a high number of missing values. The highest number of missing values being 1843.
The DataFrame has 32561 rows which is much bigger than 1843.
We could do three ways:
- We could drop the workclass, occupation and native.country columns and keep all the rows. 
- We could drop all the rows having missing values (we would lose 1843 rows)
- We could find use a strategy to fill the missing values

We choose to drop all rows having missing values


In [26]:
# We drop all the rows having NaN values and drop the duplicates
print("Original DataFrame shape: " + str(df.shape))
df = df.dropna()
df = df.drop_duplicates()
df.shape
print("Post handling data DataFrame shape: " + str(df.shape))

Original DataFrame shape: (32561, 15)
Post handling data DataFrame shape: (30139, 15)


In [28]:
df.income.value_counts()

<=50K    22633
>50K      7506
Name: income, dtype: int64

In [None]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [52]:
var_cat = ["workclass","education","marital.status","occupation","relationship","race","sex","native.country","income" ]
var_num = ["age","fnlwgt","education.num","capital.gain","capital.loss"]
print('Categorical variables are:',var_cat)
print('Numerical variables are:',var_num)

Categorical variables are: ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income']
Numerical variables are: ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss']


Some of the columns being categorical, they can't be used for the modeling as is. They need to be transformed to numerical columns.
There are multiples methods of numerical encoding, we will generate the same code for each value

In [42]:
# For each categorical column, we transform the categories into a code generated by Pandas
for cat_column in var_cat:
  df[cat_column] = pd.Categorical(df[cat_column]).codes

In [48]:
df.income.value_counts()

0    22633
1     7506
Name: income, dtype: int64

All the categorical columns have been transformed to numerical data.<br>
For the income column:<br>
0 -> <=50K
1 -> >50k

In [49]:
target_col = 'income'

In [51]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,2,132870,11,9,6,3,1,4,0,0,4356,18,38,0
3,54,2,140359,5,4,0,6,4,4,0,0,3900,40,38,0
4,41,2,264663,15,10,5,9,3,4,0,0,3900,40,38,0
5,34,2,216864,11,9,0,7,4,4,0,0,3770,45,38,0
6,38,2,150601,0,6,5,0,4,4,1,0,3770,40,38,0


In [53]:
X = df.drop(target_col, axis='columns')
y = df[target_col]

In [54]:
# Spécifier la partie test et la partie train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) #20% des données dans le test_set
print('Train set:', X_train.shape)
print('Test set:', X_test.shape)

Train set: (24111, 14)
Test set: (6028, 14)


## Compare different classification models

### Logistic Regression

In [70]:
# Instanciate the model
model=LogisticRegression()
# Train the model
model.fit(X_train,y_train)
# Make predictions on the test datasets
predictions = model.predict(X_test)
# Get the model accuracy
accuracy_score(y_test,predictions)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7773722627737226

### KNN

In [79]:
# Instanciate the model
model=KNeighborsClassifier()
# Train the model
model.fit(X_train,y_train)
# Make predictions on the test datasets
predictions = model.predict(X_test)
# Get the model accuracy
accuracy_score(y_test,predictions)

0.7561380225613802

### Decision Tree

In [74]:
# Instanciate the model
model = DecisionTreeClassifier()
# Train the model
model.fit(X_train,y_train)
# Make predictions on the test datasets
predictions = model.predict(X_test)
# Get the model accuracy
accuracy_score(y_test,predictions)

0.8015925680159257

### Random Forest

In [77]:
# Instanciate the model
model = RandomForestClassifier()
# Train the model
model.fit(X_train,y_train)
# Make predictions on the test datasets
predictions = model.predict(X_test)
# Get the model accuracy
accuracy_score(y_test,predictions)

0.8465494359654944

In order to keep the code cleaner, we use a for loop and a dictionary to train and evaluate all the models with the same code. It is important not to repeat code when it's possible.

In [85]:
models = {
    "logistic_regression": LogisticRegression(max_iter=3000),
    "knn": KNeighborsClassifier(),
    "decision_tree": DecisionTreeClassifier(),
    "random_forest": RandomForestClassifier()
}

for model_name, model in models.items():
  print("########################")
  print(model_name)
  # Train the model
  model.fit(X_train,y_train)
  # Make predictions on the test datasets
  predictions = model.predict(X_test)
  # Get the model accuracy
  score = accuracy_score(y_test,predictions)
  print(score)
  print("########################\n")

########################
logistic_regression
0.778699402786994
########################

########################
knn
0.7561380225613802
########################

########################
decision_tree
0.79628400796284
########################

########################
random_forest
0.8455540809555409
########################



According to the accuracy score (ratio of correct predicted test values and the actual test values) Random Forest is the model having the highest accuracy (85%).<br>
The comparison has been made with the default hyperparameters. In order to improve the performance they should be chosen and tuned. By finding the best hyperparameters of Random Forest, we should be able to improve the model accuracy.<br>
Besides, using different missing values strategies we mentioned earlier may also affect the model performance.