In [8]:
## Loading dataset
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
train = pd.read_csv('train_final.csv')
train.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income>50K
0,53,Self-emp-not-inc,93449,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,1
1,33,Self-emp-not-inc,123424,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,47,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,40,Private,114580,HS-grad,9,Divorced,Craft-repair,Other-relative,White,Female,0,0,40,Vietnam,0
4,39,Private,115618,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,0


In [9]:
# some rows have missing values - '?'
train.iloc[26]

age                          19
workclass                     ?
fnlwgt                   141418
education          Some-college
education.num                10
marital.status    Never-married
occupation                    ?
relationship          Own-child
race                      White
sex                        Male
capital.gain                  0
capital.loss                  0
hours.per.week               15
native.country    United-States
income>50K                    0
Name: 26, dtype: object

In [10]:
# datatypes of columns
train.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income>50K         int64
dtype: object

In [11]:
#replacing '?' with np.nan
new_train = train.replace(to_replace='?',value = np.NaN)
new_train.iloc[26]

age                          19
workclass                   NaN
fnlwgt                   141418
education          Some-college
education.num                10
marital.status    Never-married
occupation                  NaN
relationship          Own-child
race                      White
sex                        Male
capital.gain                  0
capital.loss                  0
hours.per.week               15
native.country    United-States
income>50K                    0
Name: 26, dtype: object

In [12]:
new_train.isna().sum()

age                  0
workclass         1437
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1442
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     427
income>50K           0
dtype: int64

In [13]:
#replacing workclass missing values with mode of respective columns
workclass_mode = new_train['workclass'].mode()
occupation_mode = new_train['occupation'].mode()
country_mode = new_train['native.country'].mode()
new_train['workclass'].fillna(workclass_mode[0],inplace = True)
new_train['occupation'].fillna(occupation_mode[0],inplace = True)
new_train['native.country'].fillna(country_mode[0],inplace = True)
new_train.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income>50K        0
dtype: int64

In [14]:
#checking for duplicate rows
new_train.duplicated().sum()

14

In [15]:
#removing duplicate rows
new_train = new_train.drop_duplicates(keep='first')
new_train.duplicated().sum()

0

In [16]:
profile = ProfileReport(new_train, title="Pandas Profiling Report")
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/29 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

From the above analysis, I am not selecting 'capital.gain' and 'capital.loss' because these columns mainly contains value '0'. Also, 'education.num' and 'education' is showcasing same information and hence are redundant. That's why I am using 'education.num' column only for training models.

In [17]:
new_train.drop(columns = ['education','capital.gain','capital.loss'],inplace=True)
new_train.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,income>50K
0,53,Self-emp-not-inc,93449,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,40,India,1
1,33,Self-emp-not-inc,123424,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,1
2,47,Private,144844,9,Married-civ-spouse,Craft-repair,Husband,White,Male,40,United-States,0
3,40,Private,114580,9,Divorced,Craft-repair,Other-relative,White,Female,40,Vietnam,0
4,39,Private,115618,9,Married-civ-spouse,Transport-moving,Husband,White,Male,50,United-States,0


In [18]:
#encoding categorical data
from sklearn.preprocessing import LabelEncoder
categorical = new_train.columns[new_train.dtypes == 'object']
labelencoder = LabelEncoder()
encoded_data = new_train
for category in categorical:
    encoded_data[category] = labelencoder.fit_transform(encoded_data[category])
encoded_data[categorical]

Unnamed: 0,workclass,marital.status,occupation,relationship,race,sex,native.country
0,5,2,9,0,1,1,17
1,5,2,3,0,4,1,37
2,3,2,2,0,4,1,37
3,3,0,2,2,4,0,38
4,3,2,13,0,4,1,37
...,...,...,...,...,...,...,...
24995,3,4,0,1,4,0,37
24996,1,2,0,5,4,0,37
24997,3,2,13,0,4,1,29
24998,3,0,9,1,4,1,37


In [19]:
#scaling data
from sklearn.preprocessing import StandardScaler

X_train = encoded_data.iloc[:,:-1]
y_train = encoded_data['income>50K']

scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country
0,1.042934,1.701047,-0.901103,1.912406,-0.408372,0.722745,-0.899098,-3.123573,0.703565,-0.042426,-3.211861
1,-0.415237,1.701047,-0.61785,1.135718,-0.408372,-0.794904,-0.899098,0.392998,0.703565,-0.042426,0.255455
2,0.605483,-0.091205,-0.415438,-0.417657,-0.408372,-1.047846,-0.899098,0.392998,0.703565,-0.042426,0.255455
3,0.095123,-0.091205,-0.701423,-0.417657,-1.729397,-1.047846,0.351834,0.392998,-1.421333,-0.042426,0.428821
4,0.022215,-0.091205,-0.691614,-0.417657,-0.408372,1.734512,-0.899098,0.392998,0.703565,0.762957,0.255455


In [23]:
#test data preprocessing
test = pd.read_csv('test_final.csv')
new_test = test.replace(to_replace='?',value = np.NaN)

workclass_mode = new_test['workclass'].mode()
occupation_mode = new_test['occupation'].mode()
country_mode = new_test['native.country'].mode()
new_test['workclass'].fillna(workclass_mode[0],inplace = True)
new_test['occupation'].fillna(occupation_mode[0],inplace = True)
new_test['native.country'].fillna(country_mode[0],inplace = True)

encoded_data = new_test
for category in categorical:
    encoded_data[category] = labelencoder.fit_transform(encoded_data[category])
encoded_data[categorical]

encoded_data.drop(columns = ['education','capital.gain','capital.loss'],inplace=True)

X_test = encoded_data.iloc[:,1:]
X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns = X_train.columns)
X_test_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country
0,-0.408092,1.720564,0.299718,-0.42184,-0.412366,-1.044794,-0.902746,0.391832,0.705017,-0.025233,0.265393
1,2.146469,-0.088192,-1.531043,-0.42184,2.252211,0.711639,-0.28002,0.391832,-1.418406,-2.289301,0.265393
2,-0.335105,-0.088192,-0.825431,-0.42184,-0.412366,-0.292037,-0.902746,0.391832,0.705017,-0.025233,0.265393
3,1.343607,-0.088192,-0.827263,-2.76098,-0.412366,1.715315,-0.902746,0.391832,0.705017,-0.025233,0.265393
4,0.68672,-0.088192,-0.357412,-0.031984,0.919923,-1.546632,-0.28002,0.391832,-1.418406,-0.025233,0.265393


In [25]:
#writing data to csv file
import csv
def write_output(predictions = [],filename='output.csv'):
    output = list(enumerate(predictions,start=1))
    fields = ['ID','Prediction']
    with open(filename, 'w',newline = '') as csvfile: 
        # creating a csv writer object 
        csvwriter = csv.writer(csvfile,delimiter=',') 
        # writing the fields 
        csvwriter.writerow(fields) 
        # writing the data rows 
        for i in output:
            csvwriter.writerow(list(i))

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

## Logistic Regression - default parameters - not scaled
lr = LogisticRegression()
lr.fit(X_train,y_train)
predictions = lr.predict(X_test)
print("Training Score- data not scaled: ",lr.score(X_train,y_train))

## Logistic Regression - default parameters - data scaled
lr_scaled = LogisticRegression()
lr.fit(X_train_scaled,y_train)
predictions_scaled = lr.predict(X_test_scaled)
print("Training Score- data scaled: ",lr.score(X_train_scaled,y_train))


write_output(predictions_scaled,'logistic_scaled.csv')
write_output(predictions,'logistic.csv')

Training Score- data not scaled:  0.7593852557432162
Training Score- data scaled:  0.8076122628672057


In [27]:
#logistic regression
lr_scaled = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled,y_train)
predictions_scaled1 = lr.predict(X_test_scaled)
print("Training Score- data scaled: ",lr.score(X_train_scaled,y_train))

Training Score- data scaled:  0.8076122628672057


In [28]:
#Decision Tree -
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
prediction_dt = dt.predict(X_test)
print("Decision tree - Training Score: ",dt.score(X_train,y_train))

#Decision Tree -
dt = DecisionTreeClassifier()
dt.fit(X_train_scaled,y_train)
prediction_dt_scaled = dt.predict(X_test_scaled)
print("Decision tree - Training Score - data scaled: ",dt.score(X_train_scaled,y_train))
write_output(prediction_dt_scaled,'decisiontree_scaled.csv')
write_output(prediction_dt,'decisiontree.csv')

Decision tree - Training Score:  0.9998799327623469
Decision tree - Training Score - data scaled:  0.9998799327623469


In [29]:
#Random Forest -
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
prediction_rf = rf.predict(X_test)
print("Random Forest - Training Score: ",rf.score(X_train,y_train))

#Random Forest - Scaled data
rf_scaled = RandomForestClassifier()
rf_scaled.fit(X_train_scaled,y_train)
prediction_rf_scaled = rf_scaled.predict(X_test_scaled)
print("Random Forest - Training Score - data scaled: ",rf_scaled.score(X_train_scaled,y_train))
write_output(prediction_rf_scaled,'randomforest_scaled.csv')
write_output(prediction_rf,'randomforest.csv')

Random Forest - Training Score:  0.9997998879372448
Random Forest - Training Score - data scaled:  0.9998799327623469


In [30]:
#Adaboost -
ad = AdaBoostClassifier()
ad.fit(X_train,y_train)
prediction_ad = ad.predict(X_test)
print("Adaboost - Training Score: ",ad.score(X_train,y_train))

#Adaboost - Scaled data
ad_scaled = AdaBoostClassifier()
ad_scaled.fit(X_train_scaled,y_train)
prediction_ad_scaled = ad_scaled.predict(X_test_scaled)
print("Adaboost - Training Score - data scaled: ",ad_scaled.score(X_train_scaled,y_train))
write_output(prediction_ad_scaled,'adaboost_scaled.csv')
write_output(prediction_ad,'adaboost.csv')

Adaboost - Training Score:  0.8372688705675179
Adaboost - Training Score - data scaled:  0.8372688705675179


In [31]:
#XGboost - 
xgbmodel = xgb.XGBClassifier(objective = "binary:logistic",random_state=20)
xgbmodel.fit(X_train,y_train)
prediction_xg = xgbmodel.predict(X_test)
print("XGboost - Training Score: ",xgbmodel.score(X_train,y_train))

#XGboost - scaled data
xgbmodel_scaled = xgb.XGBClassifier(objective = "binary:logistic",random_state=20)
xgbmodel_scaled.fit(X_train_scaled,y_train)
prediction_xg_scaled = xgbmodel_scaled.predict(X_test_scaled)
print("XGboost - Training Score - data scaled: ",xgbmodel_scaled.score(X_train_scaled,y_train))
write_output(prediction_xg_scaled,'xgboost_scaled.csv')
write_output(prediction_xg,'xgboost.csv')

XGboost - Training Score:  0.8886176258704874
XGboost - Training Score - data scaled:  0.8886176258704874
