# import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Silence every Python warning for the rest of the session. Note that this
# also hides library warnings (for example deprecation or convergence notices).
# Comment out the following 2 lines to see warnings again.
import warnings
warnings.filterwarnings('ignore')

# Matplotlib is the base plotting library for visualization in Python
import matplotlib.pyplot as plt

# Seaborn builds on Matplotlib; sns.set() applies its default plot styling
import seaborn as sns
sns.set()

# Render inline figures as SVG so they stay sharp and legible
%config InlineBackend.figure_format = 'svg'

#  reading a dataset

In [2]:
# Locations of the training and test files of the UCI "Adult" dataset.
# Only data_link is read in this cell.
data_link = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
test_data_link = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

# Column headers are supplied explicitly through `names`.
# NOTE(review): "martial_status" looks like a misspelling of "marital_status";
# it is kept as-is so the resulting column names do not change.
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "martial_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]

data = pd.read_csv(
    data_link,
    names=column_names,
    skipinitialspace=True,  # drop the spaces that follow each delimiter
    na_values='?',          # '?' entries are read as missing values (NaN)
)


# specify inputs and outputs and convert the categorical data to one-hot encoding for classification

In [3]:
# Features: every column except the label ('income'), 'education' and 'fnlwgt'.
# NOTE(review): 'education' is presumably dropped because 'education_num'
# carries the same information in numeric form — confirm.
inputs = data.drop(columns=['income', 'education', 'fnlwgt'])
target = data['income']  # the target we want to predict is the income column

# One-hot encode the categorical columns.
# The frame is converted to a dense NumPy array right below, so a sparse
# intermediate would only add conversion overhead. The explicit integer dtype
# keeps the final array numeric regardless of pandas' default dummy dtype.
inputs = pd.get_dummies(inputs, dtype=np.int64)
inputs = inputs.values
target = target.values

inputs

array([[   39,    13,  2174, ...,     1,     0,     0],
       [   50,    13,     0, ...,     1,     0,     0],
       [   38,     9,     0, ...,     1,     0,     0],
       ...,
       [   58,     9,     0, ...,     1,     0,     0],
       [   22,     9,     0, ...,     1,     0,     0],
       [   52,     9, 15024, ...,     1,     0,     0]], dtype=int64)

In [4]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.1,
    random_state=1,
)
x_train

array([[26,  9,  0, ...,  1,  0,  0],
       [22, 10,  0, ...,  1,  0,  0],
       [36,  9,  0, ...,  1,  0,  0],
       ...,
       [27, 13,  0, ...,  0,  0,  0],
       [59,  9,  0, ...,  1,  0,  0],
       [33, 13,  0, ...,  1,  0,  0]], dtype=int64)

### For every algorithm, we train on the training data, predict on the test data, and report the accuracy of the predictions

In [None]:
# Build the decision tree.
# A fixed random_state makes the fitted tree, and therefore the accuracy
# reported below, repeatable across runs (the train/test split is seeded too).
model = tree.DecisionTreeClassifier(random_state=1)
model.fit(x_train, y_train)
# Predict the labels of the held-out rows
y_pred = model.predict(x_test)
# Accuracy = share of held-out rows whose predicted label matches the true one
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.81639545594105

In [None]:
# k-nearest-neighbours classifier with k = 5
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): the features are used unscaled here; distance-based models are
# usually sensitive to feature scale — consider standardising, confirm.
classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = classifier.predict(x_test)
# Share of held-out rows that were labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.841264967761744

In [None]:
# Logistic regression with scikit-learn's default settings
from sklearn.linear_model import LogisticRegression

# NOTE(review): warnings are silenced globally in the setup cell, so a solver
# convergence warning would not be visible here — confirm the fit converges.
logistic = LogisticRegression().fit(x_train, y_train)
y_pred = logistic.predict(x_test)
# Share of held-out rows that were labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.8483266809947805

In [None]:
# Support vector machine with a linear kernel
import sys
import os
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# NOTE(review): no output is recorded for this cell; fitting SVC on the full
# training set may take a long time — confirm it finishes.
clf = svm.SVC(kernel='linear').fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Share of held-out rows that were labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

# cross validation

In [None]:

from sklearn.model_selection import cross_val_score,KFold

# Fresh, unfitted estimators: cross_val_score fits a clone on every fold, so
# the models fitted in the cells above are not reused here.
logistic = LogisticRegression()
classifier = KNeighborsClassifier(n_neighbors=5)
# Seeded so that the tree's cross-validation scores are repeatable across runs
model = tree.DecisionTreeClassifier(random_state=1)
clf = svm.SVC(kernel='linear')

# 10-fold cross-validation over the full data set; each call returns one
# score per fold.
scores_svm = cross_val_score(clf, inputs, target, cv=10)

scores_logistic = cross_val_score(logistic, inputs, target, cv=10)

scores_tree = cross_val_score(model, inputs, target, cv=10)

scores_knn = cross_val_score(classifier, inputs, target, cv=10)


# Report the mean score over the 10 folds for each model
print("Accuracy of logistic regression: %0.2f " % (scores_logistic.mean()))
print("Accuracy of decision tree : %0.2f " % (scores_tree.mean()))
print("Accuracy of knn : %0.2f " % (scores_knn.mean()))
print("Accuracy of svm : %0.2f " % (scores_svm.mean()))