# Data Science Major Project to classify whether a person makes over 50k in a year.

### Importing the required libraries

In [102]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

In [2]:
data = pd.read_csv("adult_final.csv")

In [28]:
data

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30164,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
30165,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
30166,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
30167,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K


In [13]:
workclass_count = data['Workclass'].value_counts()
education_count = data['Education'].value_counts()
marital_count = data['marital_status'].value_counts()
occupation_count = data['occupation'].value_counts()
relationship_count = data['relationship'].value_counts()
race_count = data['race'].value_counts()
sex_count = data['sex'].value_counts()
country_count = data['native_country'].value_counts()

In [14]:
print(workclass_count)
print(Education_count)
print(marital_count)
print(occupation_count)
print(relationship_count)
print(race_count)
print(sex_count)
print(country_count)

 Private             22286
 Self-emp-not-inc     2499
 Local-gov            2067
 State-gov            1279
 Self-emp-inc         1074
 Federal-gov           943
 Without-pay            14
Name: Workclass, dtype: int64
 HS-grad         9840
 Some-college    6678
 Bachelors       5044
 Masters         1627
 Assoc-voc       1307
 11th            1048
 Assoc-acdm      1008
 10th             820
 7th-8th          557
 Prof-school      542
 9th              455
 12th             377
 Doctorate        375
 5th-6th          288
 1st-4th          151
 Preschool         45
Name: Education, dtype: int64
 Married-civ-spouse       14065
 Never-married             9726
 Divorced                  4214
 Separated                  939
 Widowed                    827
 Married-spouse-absent      370
 Married-AF-spouse           21
Name: marital_status, dtype: int64
 Prof-specialty       4038
 Craft-repair         4030
 Exec-managerial      3992
 Adm-clerical         3721
 Sales                3584
 Othe

In [8]:
# to see how many labels are there in columns

for col in data.columns:
    print(col, ': ', len(data[col].unique()), 'labels')

Age :  73 labels
Workclass :  8 labels
Fnlwgt :  20264 labels
Education :  17 labels
education_num :  17 labels
marital_status :  8 labels
occupation :  15 labels
relationship :  7 labels
race :  6 labels
sex :  3 labels
capital_gain :  119 labels
capital_loss :  91 labels
hours_per_week :  95 labels
native_country :  42 labels
income :  3 labels


In [9]:
pd.get_dummies(data, drop_first=True).shape

(30169, 97)

In [20]:
total = data.shape[0]
threshold = total*0.003
print('minimum threshold is: ' +str(threshold))

minimum threshold is: 90.507


In [21]:
obj_columns = list(data.select_dtypes(include=['object']).columns)

In [23]:
obj_columns

['Workclass',
 'Education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'income']

In [24]:
obj_columns.remove('income')

In [25]:
df1 = data.apply(lambda x:x.mask(x.map(x.value_counts())< threshold, 'Rare') if x.name in obj_columns else x)

In [27]:
df1

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30164,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
30165,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
30166,58.0,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
30167,22.0,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K


In [29]:
df1['native_country'].value_counts()

 United-States    27504
Rare               1224
 Mexico             610
 Philippines        188
 Germany            128
 Puerto-Rico        109
 Canada             107
 El-Salvador        100
 India              100
 Cuba                92
Name: native_country, dtype: int64

In [52]:
df1.isnull().sum()

Age               7
Workclass         7
Fnlwgt            7
Education         7
education_num     7
marital_status    7
occupation        7
relationship      7
race              7
sex               7
capital_gain      7
capital_loss      7
hours_per_week    7
native_country    7
income            7
dtype: int64

In [53]:
df1[df1['Age'].isna()]

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
4913,,,,,,,,,,,,,,,
10023,,,,,,,,,,,,,,,
13679,,,,,,,,,,,,,,,
18848,,,,,,,,,,,,,,,
21535,,,,,,,,,,,,,,,
29932,,,,,,,,,,,,,,,
29940,,,,,,,,,,,,,,,


In [65]:
df2 = df1.drop(df1.index[4913])
df2 = df1.drop(df1.index[10023])
df2 = df1.drop(df1.index[13679])
df2 = df1.drop(df1.index[18848])
df2 = df1.drop(df1.index[21535])
df2 = df1.drop(df1.index[29932])
df2 = df1.drop(df1.index[29940])

In [67]:
df2.isnull().sum()

Age               6
Workclass         6
Fnlwgt            6
Education         6
education_num     6
marital_status    6
occupation        6
relationship      6
race              6
sex               6
capital_gain      6
capital_loss      6
hours_per_week    6
native_country    6
income            6
dtype: int64

In [69]:
df2[df2['Age'].isna()]

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
4913,,,,,,,,,,,,,,,
10023,,,,,,,,,,,,,,,
13679,,,,,,,,,,,,,,,
18848,,,,,,,,,,,,,,,
21535,,,,,,,,,,,,,,,
29932,,,,,,,,,,,,,,,


In [70]:
df3 = df2.drop(df2.index[29932])

In [71]:
df3[df3['Age'].isna()]

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
4913,,,,,,,,,,,,,,,
10023,,,,,,,,,,,,,,,
13679,,,,,,,,,,,,,,,
18848,,,,,,,,,,,,,,,
21535,,,,,,,,,,,,,,,


In [72]:
df4 = df3.drop(df3.index[21535])

In [73]:
df5 = df4.drop(df4.index[18848])

In [74]:
df6 = df5.drop(df5.index[13679])

In [75]:
df7 = df6.drop(df6.index[10023])

In [76]:
df8 = df7.drop(df7.index[4913])

In [77]:
df8.isnull().sum()

Age               0
Workclass         0
Fnlwgt            0
Education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [78]:
df_encoded = pd.get_dummies(data=df8, columns=obj_columns)

In [79]:
df_encoded.head()

Unnamed: 0,Age,Fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,...,native_country_ Canada,native_country_ Cuba,native_country_ El-Salvador,native_country_ Germany,native_country_ India,native_country_ Mexico,native_country_ Philippines,native_country_ Puerto-Rico,native_country_ United-States,native_country_Rare
0,39.0,77516.0,13.0,2174.0,0.0,40.0,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,50.0,83311.0,13.0,0.0,0.0,13.0,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,38.0,215646.0,9.0,0.0,0.0,40.0,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,53.0,234721.0,7.0,0.0,0.0,40.0,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,28.0,338409.0,13.0,0.0,0.0,40.0,<=50K,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [80]:
df_encoded.isnull().sum()

Age                              0
Fnlwgt                           0
education_num                    0
capital_gain                     0
capital_loss                     0
                                ..
native_country_ Mexico           0
native_country_ Philippines      0
native_country_ Puerto-Rico      0
native_country_ United-States    0
native_country_Rare              0
Length: 74, dtype: int64

In [81]:
X_encoded = df_encoded
y = X_encoded['income']

In [82]:
X_encoded = X_encoded.drop(['income'], axis=1)

In [83]:
le = LabelEncoder()

In [84]:
le.fit(y)

LabelEncoder()

In [85]:
le_names = dict(zip(le.transform(le.classes_), le.classes_))
print(le_names)

{0: ' <=50K', 1: ' >50K'}


In [86]:
y_encoded = le.transform(y)

In [87]:
y_encoded

array([0, 0, 0, ..., 0, 0, 1])

In [88]:
X_train,X_test,y_train,y_test=train_test_split(X_encoded,y_encoded,test_size=0.2,shuffle=True)

In [89]:
model_1=DecisionTreeClassifier()

In [90]:
model_1.fit(X_train,y_train)

DecisionTreeClassifier()

In [91]:
model1_predict = model_1.predict(X_test)

In [94]:
confusion_matrix(model1_predict,y_test)

array([[3915,  553],
       [ 635,  930]], dtype=int64)

In [95]:
classification_report(model1_predict,y_test)

'              precision    recall  f1-score   support\n\n           0       0.86      0.88      0.87      4468\n           1       0.63      0.59      0.61      1565\n\n    accuracy                           0.80      6033\n   macro avg       0.74      0.74      0.74      6033\nweighted avg       0.80      0.80      0.80      6033\n'

In [96]:
print(accuracy_score(model1_predict,y_test))

0.8030830432620587


In [97]:
model_2=RandomForestClassifier()
model_2.fit(X_train,y_train)

RandomForestClassifier()

In [98]:
model2_predict=model_2.predict(X_test)

In [99]:
confusion_matrix(model2_predict,y_test)

array([[4191,  569],
       [ 359,  914]], dtype=int64)

In [100]:
classification_report(model2_predict,y_test)

'              precision    recall  f1-score   support\n\n           0       0.92      0.88      0.90      4760\n           1       0.62      0.72      0.66      1273\n\n    accuracy                           0.85      6033\n   macro avg       0.77      0.80      0.78      6033\nweighted avg       0.86      0.85      0.85      6033\n'

In [101]:
print(accuracy_score(model2_predict,y_test))

0.8461793469252444


In [103]:
model_3=KNeighborsClassifier(n_neighbors=2)
model_3.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=2)

In [104]:
model3_predict=model_3.predict(X_test)

In [105]:
confusion_matrix(model3_predict,y_test)

array([[4285, 1078],
       [ 265,  405]], dtype=int64)

In [106]:
classification_report(model3_predict,y_test)

'              precision    recall  f1-score   support\n\n           0       0.94      0.80      0.86      5363\n           1       0.27      0.60      0.38       670\n\n    accuracy                           0.78      6033\n   macro avg       0.61      0.70      0.62      6033\nweighted avg       0.87      0.78      0.81      6033\n'

In [107]:
print(accuracy_score(model3_predict,y_test))

0.7773910160782364


In [108]:
model_4=LogisticRegression()
model_4.fit(X_train,y_train)

LogisticRegression()

In [109]:
model4_predict=model_4.predict(X_test)

In [110]:
confusion_matrix(model4_predict,y_test)

array([[4394, 1092],
       [ 156,  391]], dtype=int64)

In [111]:
classification_report(model4_predict,y_test)

'              precision    recall  f1-score   support\n\n           0       0.97      0.80      0.88      5486\n           1       0.26      0.71      0.39       547\n\n    accuracy                           0.79      6033\n   macro avg       0.61      0.76      0.63      6033\nweighted avg       0.90      0.79      0.83      6033\n'

In [112]:
print(accuracy_score(model4_predict,y_test))

0.7931377424167081
