# Predicting Income based on Census Data

Predicting whether individuals' income is greater than 50k a year [1] vs. less than or equal to 50k a year [0], based on their census data, as provided by the US Census Bureau (1994). 

by Hana Keiningham

### Load Data

In [1]:
import pandas as pd, numpy as np

In [2]:
df = pd.read_csv('preprocessed_census_dataset.csv', index_col = 0)

### Inspect data 

In [4]:
df.head()

Unnamed: 0,Age,WorkClass_ Federal-gov,WorkClass_ Local-gov,WorkClass_ Never-worked,WorkClass_ Private,WorkClass_ Self-emp-inc,WorkClass_ Self-emp-not-inc,WorkClass_ State-gov,WorkClass_ Without-pay,fnlwgt,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ Vietnam,native-country_ Yugoslavia,Class
0,0.03067,0,0,0,0,0,0,1,0,-1.063594,...,0,0,0,0,0,0,0,0,0,0
1,0.837096,0,0,0,0,0,1,0,0,-1.008692,...,0,0,0,0,0,0,0,0,0,0
2,-0.042641,0,0,0,1,0,0,0,0,0.245075,...,0,0,0,0,0,0,0,0,0,0
3,1.057031,0,0,0,1,0,0,0,0,0.425795,...,0,0,0,0,0,0,0,0,0,0
4,-0.775756,0,0,0,1,0,0,0,0,1.408154,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df.describe()

Unnamed: 0,Age,WorkClass_ Federal-gov,WorkClass_ Local-gov,WorkClass_ Never-worked,WorkClass_ Private,WorkClass_ Self-emp-inc,WorkClass_ Self-emp-not-inc,WorkClass_ State-gov,WorkClass_ Without-pay,fnlwgt,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ Vietnam,native-country_ Yugoslavia,Class
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,...,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,0.004541,0.029872,0.06779,0.000307,0.737173,0.035727,0.086135,0.042545,0.00045,-0.001082,...,0.001372,0.003767,0.00043,0.00258,0.001515,0.000676,0.000553,0.001966,0.000471,0.239282
std,1.005137,0.170235,0.251388,0.017522,0.440174,0.185612,0.280566,0.201832,0.021219,1.000512,...,0.037012,0.061263,0.020731,0.050726,0.038895,0.025985,0.023506,0.044291,0.021695,0.426649
min,-1.582182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.681605,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.775756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.6843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.115953,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.110221,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.690473,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.453469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.769554,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.32233,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Split into X and y 

In [6]:
y = df.iloc[:,-1]

In [7]:
X = df.iloc[:, 0:-1]

### Split into Training and Testing

And sanity check to make sure that both train and test data contain the same features

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
len(X_train.columns) == len(X_test.columns)

True

# Individual Algorithms

Write functions that will generate test predictions for each individual algorithms to be used in the final ensemble model.

In [16]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

Random Forest

In [37]:
def rforest(X_train, y_train, X_test):
    
    rforest = BaggingClassifier(base_estimator = RandomForestClassifier())
    rforest.fit(X_train, y_train)
    y_pred = rforest.predict(X_test)
    
    return y_pred

K-Nearest Neighbors

In [38]:
def knn(X_train, y_train, X_test):
    
    knn = BaggingClassifier(base_estimator = KNeighborsClassifier())
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    return y_pred

Naive Bayes

In [39]:
def nb(X_train, y_train, X_test):
    
    nb = BaggingClassifier(base_estimator = GaussianNB())
    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    
    return y_pred


Logistic Regression

In [12]:
def lr(X_train, y_train, X_test):

    lr = BaggingClassifier(base_estimator = LogisticRegression())
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    
    return y_pred

Support Vector Machines

In [41]:
def svm(X_train, y_train, X_test):
    svm = BaggingClassifier(base_estimator = SVC())
    svm.fit(X_train,y_train)    
    y_pred = svm.predict(X_test)
    
    return y_pred

# Ensemble Function

Final decision is arrived at via a majorit vote. We chose an odd number of algorithms so that we wouldn't have to deal with the possiblity of a tie, thus obviating the need for a tie-breaker. For example if the output for instance x is [0,0,1,0,1] the function below will output [0]. Conversely if the output for instance x is [1,1,1,0,0] the output of the function below will be [1].


In [2]:
from scipy import stats

In [50]:
def ensemble(X_train, y_train, X_test):
    
    """Takes in training feature and target as well testing features
    Fits and trains model and spits out predicted y for each algorithm
    Does ensemble majority vote with 5 algorithms
    and spits out predicted y"""
    
    r_y = rforest(X_train, y_train, X_test)
    k_y = knn(X_train, y_train, X_test)
    l_y = lr(X_train, y_train, X_test)
    n_y = nb(X_train, y_train, X_test)
    s_y = svm(X_train, y_train, X_test)
    
    
    final_y = []
    
    for i in range(len(X_test)):
        
        # majority vote is the final vote
        # we use mode function to return the most common value
        final_y.append(stats.mode([r_y[i], k_y[i], l_y[i], n_y[i], s_y[i]])[0][0])
        
    return final_y

# Run Prediction

In [43]:
trial_run = ensemble(X_train, y_train, X_test)

In [17]:
accuracy_score(hm, y_test)

0.85165653306861888

# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

Confusion Matrix

In [44]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_run, y_test).ravel()[i])

tn 11308
fp 1411
fn 896
tp 2503


In [45]:
accuracy_score(trial_run, y_test)

0.85686809777888073

In [46]:
precision_score(trial_run, y_test)

0.63949923352069493

In [47]:
recall_score(trial_run, y_test)

0.73639305678140632

In [48]:
f1_score(trial_run, y_test)

0.68453439081088463