In [1]:
import pandas as pd
import matplotlib as plt

In [2]:
census_data = pd.read_csv(filepath_or_buffer='final_data.csv')
census_data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income_level,workclass_Federal-gov,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,0,0,39,77516.0,13,2174.0,0.0,40.0,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,1,50,83311.0,13,0.0,0.0,13.0,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,2,38,215646.0,9,0.0,0.0,40.0,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,3,53,234721.0,7,0.0,0.0,40.0,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,4,28,338409.0,13,0.0,0.0,40.0,<=50K,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
target = census_data.pop('income_level')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
X_train, X_test, Y_train, Y_test = train_test_split(census_data, target, test_size = 0.25)
X_train.shape, Y_train.shape

((33916, 90), (33916,))

In [5]:
def classification_metrics(true_values, predicted_values):
    print("accuracy: "+ str(accuracy_score(true_values, predicted_values)))
    print("precision is: "+ str(precision_score(true_values, predicted_values, average='macro')))
    print("Recall is: "+ str(recall_score(true_values, predicted_values, average='macro')))
    

In [6]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(solver='lbfgs')
LR.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
LR_predicted = LR.predict(X_test)
print("Logistic regression model evaluation")
classification_metrics(true_values=Y_test, predicted_values=LR_predicted)

Logistic regression model evaluation
accuracy: 0.792145763311516
precision is: 0.7661371988201855
Recall is: 0.6178069508648346


In [8]:
from sklearn.svm import LinearSVC
linearsvc = LinearSVC()
linearsvc.fit(X_train, Y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [9]:
linearsvc_predicted = linearsvc.predict(X_test)
print("Linear svm model evaluation")
classification_metrics(true_values=Y_test, predicted_values=linearsvc_predicted)

Linear svm model evaluation
accuracy: 0.7766672563240757
precision is: 0.8510808108492194
Recall is: 0.557607269703441


In [10]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(penalty='l2', loss='modified_huber')
sgd.fit(X_train, Y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [11]:
sgd_pedicted = sgd.predict(X_test)
print("SGD model evaluation")
classification_metrics(true_values=Y_test, predicted_values=sgd_pedicted)

SGD model evaluation
accuracy: 0.7614540951707058
precision is: 0.6775941573866692
Recall is: 0.5679898239808522


In [12]:
from sklearn.neighbors import KNeighborsClassifier
knclassifier = KNeighborsClassifier(weights='distance', algorithm='ball_tree', n_neighbors=20)
knclassifier.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='distance')

In [13]:
kn_predicted = knclassifier.predict(X_test)
print("K nearest Neighbours model evaluation")
classification_metrics(true_values=Y_test, predicted_values=kn_predicted)

K nearest Neighbours model evaluation
accuracy: 0.7768441535467893
precision is: 0.7693029185817999
Recall is: 0.5718063516631179


In [14]:
from sklearn.naive_bayes import GaussianNB
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [15]:
gnb_predicted = gnb_classifier.predict(X_test)
print("Naive Bayes model evaluation")
classification_metrics(true_values=Y_test, predicted_values=gnb_predicted)

Naive Bayes model evaluation
accuracy: 0.792145763311516
precision is: 0.7429601973233386
Recall is: 0.6369336861919399


In [16]:
from sklearn.naive_bayes import ComplementNB
cmp_classifier = ComplementNB()
cmp_classifier.fit(X_train, Y_train)

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

In [17]:
cmp_predicted = cmp_classifier.predict(X_test)
print("Naive Bayes model evaluation")
classification_metrics(true_values=Y_test, predicted_values=cmp_predicted)

Naive Bayes model evaluation
accuracy: 0.7771979479922165
precision is: 0.7207884591505933
Recall is: 0.5983270446301179


In [18]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [19]:
dtree_predicted = dtree.predict(X_test)
print("Decision Tree model evaluation")
classification_metrics(true_values=Y_test, predicted_values=dtree_predicted)

Decision Tree model evaluation
accuracy: 0.8101892800283036
precision is: 0.7475412751561428
Recall is: 0.7522356699077275


In [20]:
from sklearn.tree import ExtraTreeClassifier
extra_tree = ExtraTreeClassifier()
extra_tree.fit(X_train, Y_train)

ExtraTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, random_state=None,
                    splitter='random')

In [21]:
extratree_predicted = extra_tree.predict(X_test)
print("Extra Decision Tree model evaluation")
classification_metrics(true_values=Y_test, predicted_values=extratree_predicted)

Extra Decision Tree model evaluation
accuracy: 0.7875464355209624
precision is: 0.7170708028268038
Recall is: 0.7155343773767415


In [22]:
from sklearn.ensemble import AdaBoostClassifier
ada_classifier = AdaBoostClassifier(algorithm='SAMME.R')
ada_classifier.fit(X_train, Y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [23]:
adaclassifier_predicted = ada_classifier.predict(X_test)
print("Ada Boost model evaluation")
classification_metrics(true_values=Y_test, predicted_values=adaclassifier_predicted)

Ada Boost model evaluation
accuracy: 0.8610472315584645
precision is: 0.8287004914715235
Recall is: 0.7840628374803797


In [24]:
from sklearn.ensemble import GradientBoostingClassifier
gradboost_classifier = GradientBoostingClassifier(loss='exponential')
gradboost_classifier.fit(X_train, Y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='exponential', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [25]:
gradboostclassifier_predicted = gradboost_classifier.predict(X_test)
print("Gradient Boosting model evaluation")
classification_metrics(true_values=Y_test, predicted_values=gradboostclassifier_predicted)

Gradient Boosting model evaluation
accuracy: 0.8670617371307271
precision is: 0.8382922751577941
Recall is: 0.7915967892948179


In [26]:
from sklearn.ensemble import RandomForestClassifier
random_classifier = RandomForestClassifier(n_estimators = 100)
random_classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
randforestclassifier_predicted = random_classifier.predict(X_test)
print("Random Forest model evaluation")
classification_metrics(true_values=Y_test, predicted_values=randforestclassifier_predicted)

Random Forest model evaluation
accuracy: 0.8606934371130374
precision is: 0.8275556199824667
Recall is: 0.7845307894534668
