In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor,  VotingRegressor, StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from scipy.stats import norm
import warnings
%matplotlib notebook

In [2]:
df = pd.read_csv('Adult Train.csv')
df

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
df[df['native-country'] == ' United-States']

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
5,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
df_us = df[df['native-country'] == ' United-States']
df_us

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
5,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
df_us = df_us.drop(columns=['capital-loss', 'capital-gain', 'native-country'])
df_us

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,hours-per-week,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,<=50K
5,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,40,<=50K
...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,<=50K
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,>50K
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,<=50K
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,<=50K


In [6]:
df_us = df_us[df_us['Occupation'] != ' ?']
df_us

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,hours-per-week,Income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,<=50K
5,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,40,<=50K
...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,<=50K
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,>50K
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,<=50K
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,<=50K


In [7]:
df_us.describe()

Unnamed: 0,Age,hours-per-week
count,27504.0,27504.0
mean,38.50429,40.970986
std,13.184357,12.04112
min,17.0,1.0
25%,28.0,40.0
50%,37.0,40.0
75%,47.0,45.0
max,90.0,99.0


In [8]:
df_us.Occupation.unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Sales', ' Farming-fishing',
       ' Machine-op-inspct', ' Other-service', ' Transport-moving',
       ' Tech-support', ' Craft-repair', ' Protective-serv',
       ' Armed-Forces', ' Priv-house-serv'], dtype=object)

In [9]:
df_us.Income.unique()

array([' <=50K', ' >50K'], dtype=object)

In [10]:
df['native-country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [11]:
df_us['Occupation'].value_counts()

 Exec-managerial      3735
 Prof-specialty       3693
 Craft-repair         3685
 Adm-clerical         3449
 Sales                3364
 Other-service        2777
 Machine-op-inspct    1687
 Transport-moving     1491
 Handlers-cleaners    1189
 Farming-fishing       879
 Tech-support          850
 Protective-serv       606
 Priv-house-serv        90
 Armed-Forces            9
Name: Occupation, dtype: int64

In [12]:
df.isnull().sum()

Age               0
Workclass         0
Education         0
Marital Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
Income            0
dtype: int64

In [13]:
plt.figure(figsize=(8,18))
sns.countplot(y = 'Age', data = df_us)

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='count', ylabel='Age'>

In [14]:
le = LabelEncoder()

In [15]:
 # Age	Workclass	Education	Marital Status	Occupation	Relationship	Race	Sex
df_us['Workclass_label'] = le.fit_transform(df_us['Workclass'])
df_us['Education_label'] = le.fit_transform(df_us['Education'])
df_us['Marital Status_label'] = le.fit_transform(df_us['Marital Status'])
df_us['Occupation_label'] = le.fit_transform(df_us['Occupation'])
df_us['Relationship_label'] = le.fit_transform(df_us['Relationship'])
df_us['Race_label'] = le.fit_transform(df_us['Race'])
df_us['Sex_label'] = le.fit_transform(df_us['Sex'])
df_us['Income_label'] = le.fit_transform(df_us['Income'])
df_us

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_us['Workclass_label'] = le.fit_transform(df_us['Workclass'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_us['Education_label'] = le.fit_transform(df_us['Education'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_us['Marital Status_label'] = le.fit_transform(df_us['Marital Status'])
A v

Unnamed: 0,Age,Workclass,Education,Marital Status,Occupation,Relationship,Race,Sex,hours-per-week,Income,Workclass_label,Education_label,Marital Status_label,Occupation_label,Relationship_label,Race_label,Sex_label,Income_label
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,<=50K,5,9,4,0,1,4,1,0
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,<=50K,4,9,2,3,0,4,1,0
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,<=50K,2,11,0,5,1,4,1,0
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,<=50K,2,1,2,5,0,2,1,0
5,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,40,<=50K,2,12,2,3,5,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,38,<=50K,2,7,2,12,5,4,0,0
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,>50K,2,11,2,6,0,4,1,1
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,40,<=50K,2,11,6,0,4,4,0,0
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,20,<=50K,2,11,4,0,3,4,1,0


In [16]:
 lookup_Workclass_name = dict(zip(df_us.Workclass_label.unique(), df_us.Workclass.unique()))
lookup_Education_name = dict(zip(df_us.Education_label.unique(), df_us.Education.unique()))
lookup_Marital_Status_name = dict(zip(df_us['Marital Status_label'].unique(), df_us['Marital Status'].unique())) 
lookup_Occupation_name = dict(zip(df_us.Occupation_label.unique(), df_us.Occupation.unique()))
lookup_Relationship_name = dict(zip(df_us.Relationship_label.unique(), df_us.Relationship.unique()))
lookup_Race_name = dict(zip(df_us.Race_label.unique(), df_us.Race.unique())) 
lookup_Sex_name = dict(zip(df_us.Sex_label.unique(), df_us.Sex.unique()))
lookup_Income_name = dict(zip(df_us.Income_label.unique(), df_us.Income.unique()))
lookup_Income_name

{0: ' <=50K', 1: ' >50K'}

In [17]:
columns_to_drop = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Income']

In [18]:
df_us = df_us.drop(columns=columns_to_drop)
df_us

Unnamed: 0,Age,hours-per-week,Workclass_label,Education_label,Marital Status_label,Occupation_label,Relationship_label,Race_label,Sex_label,Income_label
0,39,40,5,9,4,0,1,4,1,0
1,50,13,4,9,2,3,0,4,1,0
2,38,40,2,11,0,5,1,4,1,0
3,53,40,2,1,2,5,0,2,1,0
5,37,40,2,12,2,3,5,4,0,0
...,...,...,...,...,...,...,...,...,...,...
32556,27,38,2,7,2,12,5,4,0,0
32557,40,40,2,11,2,6,0,4,1,1
32558,58,40,2,11,6,0,4,4,0,0
32559,22,20,2,11,4,0,3,4,1,0


In [None]:
plt.figure()
sns.pairplot(df_us)

In [19]:
X = df_us.drop(columns=['Income_label']).values
X

array([[39, 40,  5, ...,  1,  4,  1],
       [50, 13,  4, ...,  0,  4,  1],
       [38, 40,  2, ...,  1,  4,  1],
       ...,
       [58, 40,  2, ...,  4,  4,  0],
       [22, 20,  2, ...,  3,  4,  1],
       [52, 40,  3, ...,  5,  4,  0]], dtype=int64)

In [20]:
y = df_us[['Income_label']].values
y

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state = 0)

## USING KNEIGHBORS REGRESSOR

In [35]:
knc = KNeighborsClassifier(n_neighbors = 5)

In [37]:
knc.fit(X_train, y_train.ravel())

KNeighborsClassifier()

In [45]:
knc_pred = knc.predict(X_test)

In [33]:
print('R-squared test score: {:.3f}'
     .format(knr.score(X_test, y_test)))

R-squared test score: 0.787


In [34]:
print('R-squared test score: {:.3f}'
     .format(knr.score(X_train, y_train)))

R-squared test score: 0.849


In [None]:
print('Confusion matrix test score: {:.3f}'
     .format(knr.score(X_test, y_test)))

In [39]:
cv_scores = cross_val_score(knc,X_train, y_train.ravel())

print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'
     .format(np.mean(cv_scores)))

Cross-validation scores (3-fold): [0.76999515 0.77847794 0.76635967 0.78278788 0.77333333]
Mean cross-validation score (3-fold): 0.774


In [40]:

# Negative class (0) is most frequent
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
# Therefore the dummy 'most_frequent' classifier always predicts class 0
y_dummy_predictions = dummy_majority.predict(X_test)

y_dummy_predictions

array([0, 0, 0, ..., 0, 0, 0])

In [41]:
dummy_majority.score(X_test, y_test)

0.7427283304246655

In [43]:
y_majority_predicted = dummy_majority.predict(X_test)
dum_confusion = confusion_matrix(y_test, y_majority_predicted)

print('Most frequent class (dummy classifier)\n', dum_confusion)

Most frequent class (dummy classifier)
 [[5107    0]
 [1769    0]]


In [46]:
knc_confusion = confusion_matrix(y_test, knc_pred)

print('KNN Classifier result \n', knc_confusion)

KNN Classifier result 
 [[4441  666]
 [ 802  967]]


In [47]:

# Accuracy = TP + TN / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN)  Also known as sensitivity, or True Positive Rate
# F1 = 2 * Precision * Recall / (Precision + Recall) 
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, knc_pred)))
print('Precision: {:.2f}'.format(precision_score(y_test, knc_pred)))
print('Recall: {:.2f}'.format(recall_score(y_test, knc_pred)))
print('F1: {:.2f}'.format(f1_score(y_test, knc_pred)))

Accuracy: 0.79
Precision: 0.59
Recall: 0.55
F1: 0.57


### Support Vector Machine Classification

In [55]:
svm = SVC(C=10).fit(X_train, y_train.ravel())
svm_predicted = svm.predict(X_test)
svm_confusion = confusion_matrix(y_test, svm_predicted)

print('Support vector machine classifier (C=10)\n', svm_confusion)
svm.score(X_test, y_test)

Support vector machine classifier (C=10)
 [[4885  222]
 [1373  396]]


0.7680337405468296

In [63]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SVC(kernel = 'rbf', gamma= 10.00, C=100).fit(X_train_scaled, y_train.ravel())
print('Breast cancer dataset (normalized with MinMax scaling)')
print('RBF-kernel SVC (with MinMax scaling) training set accuracy: {:.2f}'
     .format(clf.score(X_train_scaled, y_train)))
print('RBF-kernel SVC (with MinMax scaling) test set accuracy: {:.2f}'
     .format(clf.score(X_test_scaled, y_test)))

Breast cancer dataset (normalized with MinMax scaling)
RBF-kernel SVC (with MinMax scaling) training set accuracy: 0.89
RBF-kernel SVC (with MinMax scaling) test set accuracy: 0.80


In [64]:
svm_pred_sc = clf.predict(X_test)
confusion_matrix(y_test, svm_pred_sc)

array([[5107,    0],
       [1769,    0]], dtype=int64)

## Cross validation

In [22]:
models_list = [KNeighborsClassifier(), SVC()]

In [23]:
model_hyperparameters = {
    
    'KNN_hyp' : {
        'n_neighbors' : [3,5,10,15,20]
    },
    'svc_hyp' : {
        'C' : [5,10,20,50,100],
        'gamma': [0.10, 1.00,10.00,20.00]
    }
#     ,
#     'random_forest_hyp' : {
#         'n_estimators' : [10, 20, 50, 100]
#     }
}

In [24]:
model_keys= list(model_hyperparameters.keys())
print(model_keys)

['KNN_hyp', 'svc_hyp']


In [25]:
def model_selection(list_of_models,hyperparametrs_dictionary):
    
    result = []
    i = 0
    
    for model in list_of_models:
        key = model_keys[i]
        params = hyperparametrs_dictionary[key]
        i+=1
        
        print(model)
        print(params)
        print('---------------------------------------------')
        
        classifier = GridSearchCV(model, params, cv=5)
        
        # fitting the data to classifier
        
        classifier.fit(X,y.ravel())
        result.append({
            'model used': model,
            'highest score' : classifier.best_score_,
            'best hyperparameters': classifier.best_params_
        })
        
    result_dataframe = pd.DataFrame(result, columns=['model used', 'highest score', 'best hyperparameters'])

    return result_dataframe

In [None]:
model_selection(models_list, model_hyperparameters)

KNeighborsClassifier()
{'n_neighbors': [3, 5, 10, 15, 20]}
---------------------------------------------
SVC()
{'C': [5, 10, 20, 50, 100], 'gamma': [0.1, 1.0, 10.0, 20.0]}
---------------------------------------------
