In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()


In [25]:
train_df = pd.read_csv("census-income.data.csv") #Loading Census data

#Handling Missing Values

#df.info()

#Missing values in census-income.data.csv

#So we move on...

test_df = pd.read_csv("census-income.test.csv")

# train_df.head()

In [26]:

train_df.shape

(32561, 15)

In [27]:
# train_df.info()

In [28]:
# number of null values 

train_df.isnull().sum()

age                 0
workclass        1836
fnlwgt              0
education           0
educationnum        0
maritalstatus       0
occupation       1843
relationship        0
race                0
sex                 0
capitalgain         0
capitalloss         0
hoursperweek        0
nativecountr      583
label               0
dtype: int64

In [29]:
# dropping the null values

train_df = train_df.dropna()
test_df = test_df.dropna()

In [30]:
train_df.isnull().sum()

age              0
workclass        0
fnlwgt           0
education        0
educationnum     0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
nativecountr     0
label            0
dtype: int64

Dummy Encoding to Transform categorical data into numeric

In [31]:
workclass = pd.get_dummies(train_df.workclass, drop_first=True)
education = pd.get_dummies(train_df.education, drop_first=True)
marital_status = pd.get_dummies(train_df.maritalstatus, drop_first=True)
occupation = pd.get_dummies(train_df.occupation, drop_first=True)
relationship = pd.get_dummies(train_df.relationship, drop_first=True)
race = pd.get_dummies(train_df.race, drop_first=True)
sex = pd.get_dummies(train_df.sex, drop_first=True)
native_countr = pd.get_dummies(train_df.nativecountr, drop_first=True)

# test data
workclass_t = pd.get_dummies(test_df.workclass, drop_first=True)
education_t = pd.get_dummies(test_df.education, drop_first=True)
marital_status_t = pd.get_dummies(test_df.maritalstatus, drop_first=True)
occupation_t = pd.get_dummies(test_df.occupation, drop_first=True)
relationship_t = pd.get_dummies(test_df.relationship, drop_first=True)
race_t = pd.get_dummies(test_df.race, drop_first=True)
sex_t = pd.get_dummies(test_df.sex, drop_first=True)
native_countr_t = pd.get_dummies(test_df.nativecountr, drop_first=True)


Concatenating the dataframes together

In [32]:
new_data = pd.concat([train_df.age,train_df.fnlwgt,train_df.educationnum,train_df.capitalgain,train_df.capitalloss,train_df.hoursperweek, workclass,education,marital_status,occupation,relationship,race,sex,native_countr], axis=1)

unique_columns = new_data.loc[:, ~new_data.columns.duplicated()]

new_data = unique_columns

# test data 
new_data_t = pd.concat([test_df.age,test_df.fnlwgt,test_df.educationnum,test_df.capitalgain,test_df.capitalloss,test_df.hoursperweek, workclass_t,education_t,marital_status_t,occupation_t,relationship_t,race_t,sex_t,native_countr_t], axis=1)

unique_columns_t = new_data_t.loc[:, ~new_data_t.columns.duplicated()]

new_data_t = unique_columns_t

# new_data_t.head(100)

Creating separate target variable

In [33]:
Y_test = test_df.label
# Y.head(100)
# Y_train.unique()

Y_test = Y_test.map({'<=50K.':0.0, '>50K.':1.0}).astype('float64')
Y_test

0        0.0
1        0.0
2        1.0
3        1.0
5        0.0
        ... 
16275    0.0
16276    0.0
16278    0.0
16279    0.0
16280    1.0
Name: label, Length: 15060, dtype: float64

In [34]:
Y_train = train_df.label
# Y.head(100)
# Y_train.unique()

Y_train = Y_train.map({'<=50K':0.0, '>50K':1.0}).astype('float64')
# Y.head(100)
X_train = new_data

# test
Y_test = test_df.label
# Y_test.unique()

Y_test = Y_test.map({'<=50K.':0.0, '>50K.':1.0}).astype('float64')
X_test = new_data_t
Y_test


0        0.0
1        0.0
2        1.0
3        1.0
5        0.0
        ... 
16275    0.0
16276    0.0
16278    0.0
16279    0.0
16280    1.0
Name: label, Length: 15060, dtype: float64

In [35]:
X_train = X_train[X_test.columns]

Training Random Forest Classification on Training Data

In [36]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators= 10000, criterion= 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)

Predicting Test Results

Confusion Matrix for Accuracy

In [37]:
Y_test

0        0.0
1        0.0
2        1.0
3        1.0
5        0.0
        ... 
16275    0.0
16276    0.0
16278    0.0
16279    0.0
16280    1.0
Name: label, Length: 15060, dtype: float64

In [38]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

In [39]:
Y_pred = classifier.predict(X_test)

In [40]:
np.where(pd.isnull(Y_test)==True)

(array([], dtype=int64),)

In [41]:
cm = confusion_matrix(Y_pred,Y_test)
print(cm)

[[10482  1411]
 [  878  2289]]


In [42]:
accuracy_score(Y_test, Y_pred)

0.84800796812749

In [43]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90     11360
         1.0       0.72      0.62      0.67      3700

    accuracy                           0.85     15060
   macro avg       0.80      0.77      0.78     15060
weighted avg       0.84      0.85      0.84     15060



Implementing gini criteria and different number of trees

In [44]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators= 15, criterion= 'gini', random_state = 0)
classifier.fit(X_train, Y_train)

In [45]:
accuracy_score(Y_test, Y_pred)

0.84800796812749