## 1. Prediction task is to determine whether a person makes over 50K a year.

In [1]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

In [6]:
col_name = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','avg-hours-per-week','native-country','income']
train_set=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',names = col_name)
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,avg-hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
test_set = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", names = col_name)
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,avg-hours-per-week,native-country,income
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [8]:
# removing first row from test data 
test_set.drop(test_set.index[[0]],inplace=True)
print("Testing Data-set:\n",test_set.head())

Testing Data-set:
   age   workclass    fnlwgt      education  education-num  \
1  25     Private  226802.0           11th            7.0   
2  38     Private   89814.0        HS-grad            9.0   
3  28   Local-gov  336951.0     Assoc-acdm           12.0   
4  44     Private  160323.0   Some-college           10.0   
5  18           ?  103497.0   Some-college           10.0   

        marital-status          occupation relationship    race      sex  \
1        Never-married   Machine-op-inspct    Own-child   Black     Male   
2   Married-civ-spouse     Farming-fishing      Husband   White     Male   
3   Married-civ-spouse     Protective-serv      Husband   White     Male   
4   Married-civ-spouse   Machine-op-inspct      Husband   Black     Male   
5        Never-married                   ?    Own-child   White   Female   

   capital-gain  capital-loss  avg-hours-per-week  native-country   income  
1           0.0           0.0                40.0   United-States   <=50K.  
2  

In [9]:
# Converting "income" column '>50K' as 1 and '<=50K' as 0
train_set["income"] = train_set["income"].map({ " <=50K": 0, " >50K": 1 })
test_set["income"] = test_set["income"].map({ " <=50K.": 0, " >50K.": 1 })

In [10]:
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,avg-hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [11]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,avg-hours-per-week,native-country,income
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,0
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,0
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,1
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,1
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,0


In [13]:
train_set.dtypes

age                    int64
workclass             object
fnlwgt                 int64
education             object
education-num          int64
marital-status        object
occupation            object
relationship          object
race                  object
sex                   object
capital-gain           int64
capital-loss           int64
avg-hours-per-week     int64
native-country        object
income                 int64
dtype: object

In [14]:
# Encoding all categorical column of "train_set" and "test_set" data
for x in train_set.columns:
    if train_set[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_set[x].values))
        train_set[x] = lbl.transform(list(train_set[x].values))
        
for x in test_set.columns:
    if test_set[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(test_set[x].values))
        test_set[x] = lbl.transform(list(test_set[x].values))

In [16]:
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,avg-hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [17]:
# Splitting the dataset into 'X' and 'Y'column for Training set and Test set
X_train= train_set.iloc[:, :14].values
y_train= train_set.iloc[:, 14].values
X_test= test_set.iloc[:, :14].values
y_test= test_set.iloc[:, 14].values

In [18]:
# Feature Scaling X data set
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)



In [19]:
# Fitting XGBoost classifier to the Training set
XB_classifier = XGBClassifier()
XB_classifier.fit(X_train, y_train)
# Predicting the Test set results from XGBoost classifier
XB_Y_pred = XB_classifier.predict(X_test)
XB_Y_pred

  if diff:


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
XB_cm = confusion_matrix(y_test, XB_Y_pred)
XB_cm

array([[12239,   196],
       [ 2409,  1437]], dtype=int64)

In [21]:
XB_accuracy = metrics.accuracy_score(y_test, XB_Y_pred)
print('XG_BOOST Accuracy: {:.4f}'.format(XB_accuracy))

XG_BOOST Accuracy: 0.8400


In [22]:
#Classification Report for XG-Boost classifier
print(metrics.classification_report(y_test, XB_Y_pred))
print("variance of XG_Boost classifier: {:.4f}".format(XB_classifier.score(X_test,y_test)))

             precision    recall  f1-score   support

          0       0.84      0.98      0.90     12435
          1       0.88      0.37      0.52      3846

avg / total       0.85      0.84      0.81     16281

variance of XG_Boost classifier: 0.8400


  if diff:


In [23]:
# Fitting Random Forest classifier to the Training set
RF_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RF_classifier.fit(X_train, y_train)

# Predicting the Test set results
RF_y_pred = RF_classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
RF_cm = confusion_matrix(y_test, RF_y_pred)
print("Random Forest CONFUSIONMATRIX:\n",RF_cm)
RF_accuracy = metrics.accuracy_score(y_test, RF_y_pred)
print('Random Forest Accuracy: {:.4f}'.format(RF_accuracy))
#Classification Report for Random Forest Classifier
print(metrics.classification_report(y_test, RF_y_pred))
print("variance of Random Forest classifier: %.4f" %RF_classifier.score(X_test,y_test))

Random Forest CONFUSIONMATRIX:
 [[12176   259]
 [ 2376  1470]]
Random Forest Accuracy: 0.8382
             precision    recall  f1-score   support

          0       0.84      0.98      0.90     12435
          1       0.85      0.38      0.53      3846

avg / total       0.84      0.84      0.81     16281

variance of Random Forest classifier: 0.8382


In [24]:
# Fitting Decision Tree Classification to the Training set
DT_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DT_classifier.fit(X_train, y_train)

# Predicting the Test set results
DT_y_pred = DT_classifier.predict(X_test)

# Making the Confusion Matrix
DT_cm = confusion_matrix(y_test, DT_y_pred)
print("Decision Tree CONFUSIONMATRIX:\n",DT_cm)
DT_accuracy = metrics.accuracy_score(y_test, DT_y_pred)
print('Decision Tree Accuracy: {:.4f}'.format(DT_accuracy))
#Classification Report for Decision Tree classifier
print(metrics.classification_report(y_test, DT_y_pred))
print("variance of Decision Tree classifier: %.4f" %DT_classifier.score(X_test,y_test))

Decision Tree CONFUSIONMATRIX:
 [[11743   692]
 [ 2186  1660]]
Decision Tree Accuracy: 0.8232
             precision    recall  f1-score   support

          0       0.84      0.94      0.89     12435
          1       0.71      0.43      0.54      3846

avg / total       0.81      0.82      0.81     16281

variance of Decision Tree classifier: 0.8232


## 2. Which factors are important

Below mentioned two parameters are important to choose the best classification algorithm.
    1. overall classification accuracy
    2. variance in accuracy that determines the stability and consistency of an algorithm.

## 3. Which algorithms are best for this dataset

As per Overall classification accuracy and variance in accuracy "XG BOOST CLASSIFIER" is best for this dataset.