<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

# Import Packages

In [2]:
#import packages.
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline 
np.set_printoptions(suppress=True)
from matplotlib.pyplot import figure

In [3]:
#From Scikit Learn
from sklearn import preprocessing
from sklearn.model_selection  import train_test_split, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

In [4]:
%pwd

'C:\\Users\\JM025575\\Predictive Models Class'

In [5]:
cd /Users/JM025575/Predictive Models Class/data

C:\Users\JM025575\Predictive Models Class\data


# Load Data

For this assignment I will be using the adult census dataset.  I downloaded the data locally and will now load it in to my notebook.

In [6]:
#Import Using Pandas
adult = pd.read_csv("adult.csv") #,thousands=','
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
adult.shape

(32561, 15)

In [8]:
#Validating that there are no null values
feat_miss = adult.columns[adult.isnull().any()]
print(feat_miss)

Index([], dtype='object')


In [9]:
#Review Column Names
adult.columns.tolist()

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'country',
 'salary']

We will be setting salary as the target variable.  We will do some exploratory analysis on the variables and get the target feature to the first column.

# Identify Target Variable and Move to Target to Column 0

In [10]:
# designate target variable name and move the target variable of 'churn' to my first column for easier use.
targetName = 'salary'
targetSeries = adult[targetName]
#remove target from current location and insert in collum 0
del adult[targetName]
adult.insert(0, targetName, targetSeries)
#reprint dataframe and see target is in position 0
adult.head()

Unnamed: 0,salary,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country
0,<=50K,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,<=50K,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,<=50K,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,<=50K,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,<=50K,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


# Drop Column

In [11]:
#dropping columns I do not believe will contribute to a strong prediction
adult = adult.drop(['country'], axis = 1)

I decided to drop the 'country' column as the large majority of results are either United States or have no information.  Since I intend to transform my categorical variables to dummies, and with the weighting so heavy towards the US, I did not want to create a data set with such a large number of features where I do not believe the country features that would be created would have a strong predictive power.

# Exploratory Data Analysis

In [12]:
#show columns
adult.columns

Index(['salary', 'age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week'],
      dtype='object')

In [13]:
adult.shape

(32561, 14)

In [14]:
#show data types of all features
adult.dtypes

salary            object
age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
dtype: object

In [15]:
#mean of all numerical attributes
adult.mean()

age                   38.581647
fnlwgt            189778.366512
education-num         10.080679
capital-gain        1077.648844
capital-loss          87.303830
hours-per-week        40.437456
dtype: float64

In [16]:
#standard deviation of all numerical attributes
adult.std()

age                   13.640433
fnlwgt            105549.977697
education-num          2.572720
capital-gain        7385.292085
capital-loss         402.960219
hours-per-week        12.347429
dtype: float64

In [17]:
#Salary Distribution
adult.groupby('salary').size()

salary
 <=50K    24720
 >50K      7841
dtype: int64

In [18]:
#Education Distribution
adult.groupby('education').size()

education
 10th              933
 11th             1175
 12th              433
 1st-4th           168
 5th-6th           333
 7th-8th           646
 9th               514
 Assoc-acdm       1067
 Assoc-voc        1382
 Bachelors        5355
 Doctorate         413
 HS-grad         10501
 Masters          1723
 Preschool          51
 Prof-school       576
 Some-college     7291
dtype: int64

In [19]:
#Workclass Distribution
adult.groupby('workclass').size()

workclass
 ?                    1836
 Federal-gov           960
 Local-gov            2093
 Never-worked            7
 Private             22696
 Self-emp-inc         1116
 Self-emp-not-inc     2541
 State-gov            1298
 Without-pay            14
dtype: int64

In [20]:
#Marital Status Distribution
adult.groupby('marital-status').size()

marital-status
 Divorced                  4443
 Married-AF-spouse           23
 Married-civ-spouse       14976
 Married-spouse-absent      418
 Never-married            10683
 Separated                 1025
 Widowed                    993
dtype: int64

In [21]:
#Occupation Distribution
adult.groupby('occupation').size()

occupation
 ?                    1843
 Adm-clerical         3770
 Armed-Forces            9
 Craft-repair         4099
 Exec-managerial      4066
 Farming-fishing       994
 Handlers-cleaners    1370
 Machine-op-inspct    2002
 Other-service        3295
 Priv-house-serv       149
 Prof-specialty       4140
 Protective-serv       649
 Sales                3650
 Tech-support          928
 Transport-moving     1597
dtype: int64

In [22]:
#Race Distribution
adult.groupby('race').size()

race
 Amer-Indian-Eskimo      311
 Asian-Pac-Islander     1039
 Black                  3124
 Other                   271
 White                 27816
dtype: int64

In [23]:
#Gender Distribution
adult.groupby('sex').size()

sex
 Female    10771
 Male      21790
dtype: int64

# Z-Score Normalization

We will go through and run a z-score normalization for all numeric values to ensure no one feature has an undue influence on our models below.

In [24]:
#Transform numeric features to Z-Scores
adult.age=pd.DataFrame((adult.age - adult.age.mean())/adult.age.std())
adult.fnlwgt=pd.DataFrame((adult.fnlwgt - adult.fnlwgt.mean())/adult.fnlwgt.std())
adult['education-num']=pd.DataFrame((adult['education-num'] - adult['education-num'].mean())/adult['education-num'].std())
adult['capital-gain']=pd.DataFrame((adult['capital-gain'] - adult['capital-gain'].mean())/adult['capital-gain'].std())
adult['capital-loss']=pd.DataFrame((adult['capital-loss'] - adult['capital-loss'].mean())/adult['capital-loss'].std())
adult['hours-per-week']=pd.DataFrame((adult['hours-per-week'] - adult['hours-per-week'].mean())/adult['hours-per-week'].std())

In [25]:
adult.head()

Unnamed: 0,salary,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week
0,<=50K,0.03067,State-gov,-1.063594,Bachelors,1.134721,Never-married,Adm-clerical,Not-in-family,White,Male,0.148451,-0.216656,-0.035429
1,<=50K,0.837096,Self-emp-not-inc,-1.008692,Bachelors,1.134721,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.145918,-0.216656,-2.222119
2,<=50K,-0.042641,Private,0.245075,HS-grad,-0.420053,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.145918,-0.216656,-0.035429
3,<=50K,1.057031,Private,0.425795,11th,-1.19744,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.145918,-0.216656,-0.035429
4,<=50K,-0.775756,Private,1.408154,Bachelors,1.134721,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.145918,-0.216656,-0.035429


# Categorical Variables to Dummies

In [26]:
from sklearn import preprocessing
le_dep = preprocessing.LabelEncoder()
#to convert into numbers
adult['salary'] = le_dep.fit_transform(adult['salary'])

In [27]:
# perform data transformation. Creates dummies of any categorical feature and turns them in to their own column with values of 
# 0 or 1.
for col in adult.columns[1:]:
	attName = col
	dType = adult[col].dtype
	missing = pd.isnull(adult[col]).any()
	uniqueCount = len(adult[attName].value_counts(normalize=False))
	# discretize (create dummies)
	if dType == object:
		adult = pd.concat([adult, pd.get_dummies(adult[col], prefix=col)], axis=1)
		del adult[attName]

In [28]:
adult.shape

(32561, 67)

I went from 15 features to 67 features now.

In [29]:
adult.head()

Unnamed: 0,salary,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male
0,0,0.03067,-1.063594,1.134721,0.148451,-0.216656,-0.035429,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,0,0.837096,-1.008692,1.134721,-0.145918,-0.216656,-2.222119,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,0,-0.042641,0.245075,-0.420053,-0.145918,-0.216656,-0.035429,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,0,1.057031,0.425795,-1.19744,-0.145918,-0.216656,-0.035429,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,0,-0.775756,1.408154,1.134721,-0.145918,-0.216656,-0.035429,0,0,0,...,0,0,1,0,0,1,0,0,1,0


# Split Dataset into Train/Test 

I will create a training and test data set that will be used in the models below.  I will use a 33/67 split for the test vs train.

In [30]:
# split dataset into testing and training, creating 4 new objects here. Separating out features from target.  
features_train, features_test, target_train, target_test = train_test_split(
    adult.iloc[:,1:].values, adult.iloc[:,0].values, test_size=0.33, random_state=0)

## Four new train/test files and their shapes. 

In [219]:
print(features_test.shape)
print(features_train.shape)
print(target_test.shape)
print(target_train.shape)

(10746, 66)
(21815, 66)
(10746,)
(21815,)


# KNN Models

I will first run KNN classification models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will only have the n_neighbors parameter changed to 3.  I will continue to adjust my remaining models from there and potentially run a grid search to find my ideal number of neighbors with the goal of maximizing the accuracy.

## KNN Model - 1

In [222]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(features_train, target_train) 
predicted_KNN1 = neigh.predict(features_test)
print(neigh)
# make predictions
print(target_test)
# summarize the fit of the model
print(classification_report(target_test, predicted_KNN1))
print(confusion_matrix(target_test, predicted_KNN1))
print(accuracy_score(target_test,predicted_KNN1))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
[0 0 0 ... 0 1 0]
             precision    recall  f1-score   support

          0       0.88      0.90      0.89      8151
          1       0.65      0.60      0.62      2595

avg / total       0.82      0.83      0.82     10746

[[7314  837]
 [1043 1552]]
0.8250511818351014


## Cross Validate KNN

In [223]:
#verify KNN with Cross Validation
scores_KNN = cross_val_score(neigh, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_KNN)
scores_KNN.mean()  

Cross Validation Score for each K [0.83684693 0.8299725  0.82676444 0.82951421 0.82401467 0.82768103
 0.80513526 0.82714351 0.81843191 0.82522936]


0.825073380815709

## KNN Model - 2

In [224]:
neigh = KNeighborsClassifier(n_neighbors=7, p = 1)
neigh.fit(features_train, target_train) 
predicted_KNN1 = neigh.predict(features_test)
print(neigh)
# make predictions
print(target_test)
# summarize the fit of the model
print(classification_report(target_test, predicted_KNN1))
print(confusion_matrix(target_test, predicted_KNN1))
print(accuracy_score(target_test,predicted_KNN1))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=1,
           weights='uniform')
[0 0 0 ... 0 1 0]
             precision    recall  f1-score   support

          0       0.87      0.91      0.89      8151
          1       0.68      0.59      0.63      2595

avg / total       0.83      0.83      0.83     10746

[[7431  720]
 [1071 1524]]
0.8333333333333334


## Cross Validate KNN

In [225]:
#verify KNN with Cross Validation
scores_KNN = cross_val_score(neigh, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_KNN)
scores_KNN.mean()  

Cross Validation Score for each K [0.84647113 0.84097159 0.83088909 0.83363886 0.83455545 0.82630614
 0.82255846 0.82897753 0.8262265  0.84678899]


0.8337383749072375

## KNN Model - 3

In [226]:
neigh = KNeighborsClassifier(n_neighbors = 15, p = 1, weights = 'distance')
neigh.fit(features_train, target_train) 
predicted_KNN1 = neigh.predict(features_test)
print(neigh)
# make predictions
print(target_test)
# summarize the fit of the model
print(classification_report(target_test, predicted_KNN1))
print(confusion_matrix(target_test, predicted_KNN1))
print(accuracy_score(target_test,predicted_KNN1))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=15, p=1,
           weights='distance')
[0 0 0 ... 0 1 0]
             precision    recall  f1-score   support

          0       0.88      0.92      0.89      8151
          1       0.69      0.59      0.64      2595

avg / total       0.83      0.84      0.83     10746

[[7459  692]
 [1059 1536]]
0.8370556486134375


## Cross Validate KNN

In [227]:
#verify KNN with Cross Validation
scores_KNN = cross_val_score(neigh, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_KNN)
scores_KNN.mean()  

Cross Validation Score for each K [0.85472044 0.84280477 0.83455545 0.84372136 0.8395967  0.82951421
 0.83402109 0.82760202 0.83402109 0.84220183]


0.838275895869369

## Grid Search to narrow down n_neighbors

In [331]:
# use a full grid over several parameters
from sklearn.model_selection import GridSearchCV
param_grid = {"n_neighbors": [12, 16, 20]}

# run grid search
grid_search = GridSearchCV(neigh, param_grid=param_grid,n_jobs=-1)
grid_search.fit(features_train, target_train)
print("Best", grid_search.best_params_) 

Best {'n_neighbors': 16}


## KNN Model - 4

In [332]:
neigh = KNeighborsClassifier(n_neighbors = 16, p = 1, weights = 'distance')
neigh.fit(features_train, target_train) 
predicted_KNN1 = neigh.predict(features_test)
print(neigh)
# make predictions
print(target_test)
# summarize the fit of the model
print(classification_report(target_test, predicted_KNN1))
print(confusion_matrix(target_test, predicted_KNN1))
print(accuracy_score(target_test,predicted_KNN1))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=16, p=1,
           weights='distance')
[0 0 0 ... 0 1 0]
             precision    recall  f1-score   support

          0       0.88      0.91      0.89      8151
          1       0.69      0.59      0.64      2595

avg / total       0.83      0.84      0.83     10746

[[7452  699]
 [1052 1543]]
0.8370556486134375


## Cross Validate KNN

In [333]:
#verify KNN with Cross Validation
scores_KNN = cross_val_score(neigh, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_KNN)
scores_KNN.mean()  

Cross Validation Score for each K [0.85059578 0.84326306 0.83638863 0.84509624 0.83501375 0.82768103
 0.83264558 0.83035305 0.83539661 0.84357798]


0.8380011709988473

I ran 4 separate KNN models, all with slightly different parameters tuned.  The first model I only changed the n_neighbors to 3 and got an accuracy score of 0.825.  I cross validated this model 10 times to check for overfitting and stayed within 1% on all models.  The second model I ran I adjusted the n_neighbors to 7 and changed to p=1 to use the manhattan distance versus the euclidean distance.  This time my accuracy score increased to 0.833.  I also cross validated 10 times and saw no overfitting.  My third model I adjusted n_neighbors to 15, kept p=1, and now adjusted my weights= to 'distance' rather than uniform.  This resulted in a slightly better model with an accuracy score of 0.837.  Cross validating again showed a lack of overfitting.  Finally, I ran a grid search to find my best n_neighbors number, which turned out to be 16.  I ran one final model, similar to model 3 but with n_neighbors at 16, and got an accuracy score of 0.837. My final two KNN Models are my best models for this section.

# Decision Tree Models

I will next run Decision Tree classification models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be a basic Decision Tree with no default parameters tuned.  I will continue to adjust my remaining models from there with the goal of maximizing the accuracy.

## Decision Tree - Model 1

In [31]:
from sklearn import tree 
clf_dt = tree.DecisionTreeClassifier() #taking out of box options. 
#Call up the model to see the parameters you can tune (and their default setting)
print(clf_dt)
#Fit clf to the training data
clf_dt = clf_dt.fit(features_train, target_train)
#Predict clf DT model again test data
target_predicted_dt = clf_dt.predict(features_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


### Model Accuracy

In [32]:
print("DT Accuracy Score", accuracy_score(target_test, target_predicted_dt))
print(classification_report(target_test, target_predicted_dt))
print(confusion_matrix(target_test, target_predicted_dt))

DT Accuracy Score 0.8123022520007445
             precision    recall  f1-score   support

          0       0.88      0.88      0.88      8151
          1       0.61      0.61      0.61      2595

avg / total       0.81      0.81      0.81     10746

[[7136 1015]
 [1002 1593]]


### Cross Validate Basic Model

In [230]:
#verify DT with Cross Validation
scores = cross_val_score(clf_dt, features_train, target_train, cv=10)  #it is sampling from each of the data sets, usually sampling 50% out of each
print("Cross Validation Score for each K",scores)
scores.mean()    

Cross Validation Score for each K [0.80843263 0.81897342 0.8203483  0.81576535 0.8111824  0.80522456
 0.80788629 0.82393398 0.80284273 0.81284404]


0.8127433708099001

## Decision Tree - Model 2

In [33]:
clf_dt = tree.DecisionTreeClassifier(criterion = 'entropy', splitter = 'random' ) #changing the criterion to entropy and splitter to random
print(clf_dt)
#Fit clf to the training data
clf_dt = clf_dt.fit(features_train, target_train)
#Predict clf DT model again test data
target_predicted_dt = clf_dt.predict(features_test)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')


### Model Accuracy

In [35]:
print("DT Accuracy Score", accuracy_score(target_test, target_predicted_dt))
print(classification_report(target_test, target_predicted_dt))
print(confusion_matrix(target_test, target_predicted_dt))

DT Accuracy Score 0.8121161362367393
             precision    recall  f1-score   support

          0       0.88      0.87      0.88      8151
          1       0.61      0.62      0.61      2595

avg / total       0.81      0.81      0.81     10746

[[7125 1026]
 [ 993 1602]]


### Cross Validate Basic Model

In [233]:
#verify DT with Cross Validation
scores = cross_val_score(clf_dt, features_train, target_train, cv=10)  #it is sampling from each of the data sets, usually sampling 50% out of each
print("Cross Validation Score for each K",scores)
scores.mean()    

Cross Validation Score for each K [0.82722273 0.80797434 0.80018332 0.81851512 0.80018332 0.81576535
 0.80375974 0.80330124 0.80055021 0.80458716]


0.8082042523143654

## Decision Tree - Model 3

In [36]:
clf_dt = tree.DecisionTreeClassifier(criterion = 'gini', min_samples_split = 5)
print(clf_dt)
#Fit clf to the training data
clf_dt = clf_dt.fit(features_train, target_train)
#Predict clf DT model again test data
target_predicted_dt = clf_dt.predict(features_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


### Model Accuracy

In [37]:
print("DT Accuracy Score", accuracy_score(target_test, target_predicted_dt))
print(classification_report(target_test, target_predicted_dt))
print(confusion_matrix(target_test, target_predicted_dt))

DT Accuracy Score 0.8221663874930206
             precision    recall  f1-score   support

          0       0.88      0.89      0.88      8151
          1       0.64      0.61      0.62      2595

avg / total       0.82      0.82      0.82     10746

[[7262  889]
 [1022 1573]]


### Cross Validate Basic Model

In [266]:
#verify DT with Cross Validation
scores = cross_val_score(clf_dt, features_train, target_train, cv=10)  #it is sampling from each of the data sets, usually sampling 50% out of each
print("Cross Validation Score for each K",scores)
scores.mean()    

Cross Validation Score for each K [0.81759853 0.82309808 0.82676444 0.82172319 0.81714024 0.8111824
 0.82530949 0.82072444 0.81201284 0.82247706]


0.8198030706185142

I ran 3 separate Decision Tree models, all with slightly different parameters tuned.  The first model had no default parameters tuned and got an accuracy score of 0.812.  I cross validated this model 10 times to check for overfitting and stayed within 2% on all models.  The second model I ran I adjusted the criterion to entropy, for information gain, and the splitter to random.  This time my accuracy score slightly increased to 0.812.  I also cross validated 10 times and saw no overfitting.  My final model I adjusted criterion back to gini and set my minimum samples split to 5 for the number of samples required for a split.  This resulted in a slightly better model with an accuracy score of 0.822.  Cross validating again showed a lack of overfitting.  My final Decision Tree is my best model for this section.

# Stochastic Gradient Descent Models

I will next run Stochastic Gradient Descent Models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be a SGD model with no parameters set, so a basic model.  I will continue to adjust my remaining models from there and may run a grid search to find my ideal alpha or other parameters with the goal of maximizing the accuracy.

## SGD Model - 1

In [38]:
from sklearn.linear_model import SGDClassifier
clf_sgd = SGDClassifier()
print(clf_sgd)
#Fit clf_sgd to the training data
clf_sgd = clf_sgd.fit(features_train, target_train)
#Predict clf_sgd model again test data
predicted_sgd = clf_sgd.predict(features_test)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)


### Model Accuracy

In [39]:
print("SGD Accuracy Score", accuracy_score(target_test, predicted_sgd))
print(classification_report(target_test, target_predicted_dt))
print(confusion_matrix(target_test, predicted_sgd))

SGD Accuracy Score 0.8455239158756747
             precision    recall  f1-score   support

          0       0.88      0.89      0.88      8151
          1       0.64      0.61      0.62      2595

avg / total       0.82      0.82      0.82     10746

[[7625  526]
 [1134 1461]]


### Cross Validate Basic Model

In [390]:
#verify SGD with Cross Validation
scores_sgd = cross_val_score(clf_sgd, features_train, target_train, cv=10)  #it is sampling from each of the data sets, usually sampling 50% out of each
print("Cross Validation Score for each K",scores_sgd)
scores_sgd.mean()    

Cross Validation Score for each K [0.81393217 0.83730522 0.81530706 0.80889093 0.80797434 0.82676444
 0.83310408 0.82026593 0.81338835 0.83027523]


0.8207207749233014

## SGD Model - 2

In [40]:
from sklearn.linear_model import SGDClassifier
clf_sgd = SGDClassifier(penalty = 'l1', alpha = 0.1, )
print(clf_sgd)
#Fit clf_sgd to the training data
clf_sgd = clf_sgd.fit(features_train, target_train)
#Predict clf_sgd model again test data
predicted_sgd = clf_sgd.predict(features_test)

SGDClassifier(alpha=0.1, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l1', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)


### Model Accuracy

In [41]:
print("SGD Accuracy Score", accuracy_score(target_test, predicted_sgd))
print(classification_report(target_test, target_predicted_dt))
print(confusion_matrix(target_test, predicted_sgd))

SGD Accuracy Score 0.7631676903033687
             precision    recall  f1-score   support

          0       0.88      0.89      0.88      8151
          1       0.64      0.61      0.62      2595

avg / total       0.82      0.82      0.82     10746

[[8151    0]
 [2545   50]]


### Cross Validate Model

In [393]:
#verify SGD with Cross Validation
scores_sgd = cross_val_score(clf_sgd, features_train, target_train, cv=10)  #it is sampling from each of the data sets, usually sampling 50% out of each
print("Cross Validation Score for each K",scores_sgd)
scores_sgd.mean()    

Cross Validation Score for each K [0.76260312 0.76581118 0.75939505 0.76535289 0.76351971 0.763978
 0.7647868  0.76249427 0.75974324 0.75963303]


0.7627317273306977

## SGD Model - 3

In [42]:
from sklearn.linear_model import SGDClassifier
clf_sgd = SGDClassifier(penalty = 'l2', alpha = 0.01, )
print(clf_sgd)
#Fit clf_sgd to the training data
clf_sgd = clf_sgd.fit(features_train, target_train)
#Predict clf_sgd model again test data
predicted_sgd = clf_sgd.predict(features_test)

SGDClassifier(alpha=0.01, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)


### Model Accuracy

In [43]:
print("SGD Accuracy Score", accuracy_score(target_test, predicted_sgd))
print(classification_report(target_test, target_predicted_dt))
print(confusion_matrix(target_test, predicted_sgd))

SGD Accuracy Score 0.847571189279732
             precision    recall  f1-score   support

          0       0.88      0.89      0.88      8151
          1       0.64      0.61      0.62      2595

avg / total       0.82      0.82      0.82     10746

[[7663  488]
 [1150 1445]]


### Cross Validate Model

In [402]:
#verify SGD with Cross Validation
scores_sgd = cross_val_score(clf_sgd, features_train, target_train, cv=10)  #it is sampling from each of the data sets, usually sampling 50% out of each
print("Cross Validation Score for each K",scores_sgd)
scores_sgd.mean()    

Cross Validation Score for each K [0.8492209  0.85242896 0.84509624 0.85334555 0.84280477 0.84326306
 0.84640073 0.84594223 0.83768913 0.84311927]


0.8459310848133583

## Grid Search to narrow down alpha

In [403]:
param_grid = {"alpha": [0.01, 0.1, 0.001, 0.00001, 1]}

# run grid search
grid_search = GridSearchCV(clf_sgd, param_grid=param_grid,n_jobs=-1)
grid_search.fit(features_train, target_train)
print("Best", grid_search.best_params_) 

Best {'alpha': 0.001}


## SGD Model - 4

In [44]:
from sklearn.linear_model import SGDClassifier
clf_sgd = SGDClassifier(penalty = 'l2', alpha = 0.001, )
print(clf_sgd)
#Fit clf_sgd to the training data
clf_sgd = clf_sgd.fit(features_train, target_train)
#Predict clf_sgd model again test data
predicted_sgd = clf_sgd.predict(features_test)

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)


### Model Accuracy

In [45]:
print("SGD Accuracy Score", accuracy_score(target_test, predicted_sgd))
print(classification_report(target_test, target_predicted_dt))
print(confusion_matrix(target_test, predicted_sgd))

SGD Accuracy Score 0.8456169737576773
             precision    recall  f1-score   support

          0       0.88      0.89      0.88      8151
          1       0.64      0.61      0.62      2595

avg / total       0.82      0.82      0.82     10746

[[7376  775]
 [ 884 1711]]


### Cross Validate Model

In [406]:
#verify SGD with Cross Validation
scores_sgd = cross_val_score(clf_sgd, features_train, target_train, cv=10)  #it is sampling from each of the data sets, usually sampling 50% out of each
print("Cross Validation Score for each K",scores_sgd)
scores_sgd.mean()    

Cross Validation Score for each K [0.85334555 0.85747021 0.84509624 0.85747021 0.84051329 0.84463795
 0.84548372 0.84869326 0.84181568 0.85321101]


0.8487737128635316

I ran 4 separate Stochastic Gradient Descent models, all with slightly different parameters tuned.  The first model I ran had no parameters adjusted from the based model and got an accuracy score of 0.825.  I cross validated this model 10 times to check for overfitting and stayed within 2% on all models.  The second model I ran I adjusted my penalty to l1 and the alpha I raised to 0.1.  This time my accuracy score decreased to 0.763.  I also cross validated 10 times and saw no overfitting.  My third model I adjusted my penalty back to l2 and my alpha down to 0.01.  This resulted in a better model with an accuracy score of 0.848.  Cross validating again showed a lack of overfitting.  Finally, I ran a grid search to find my best alpha, which turned out to be 0.001.  I ran one final model, similar to model 3 but with an alpha of 0.001 and a penalty of l2 and got an accuracy score of 0.851. My fourth SGD model produced the best accuracy score.

# Adaboost Models

I will next run Adaboost models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be an Adaboost model with a Decision Tree with a max depth of 3, and then I will use the SAMME algorithm and n_estimators of 50 for that classifier.  I will continue to adjust my remaining models from there and may run a grid search to find my ideal number of estimators with the goal of maximizing the accuracy.

## Adaboost Model - 1

In [351]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm="SAMME",
                         n_estimators=50)
bdt.fit(features_train, target_train)
predicted_bdt=bdt.predict(features_test)
expected = target_test
print("Adaboost Accuracy", accuracy_score(expected,predicted_bdt))
print(classification_report(expected, predicted_bdt,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_bdt))

Adaboost Accuracy 0.8536199516099013
             precision    recall  f1-score   support

         No       0.89      0.92      0.91      8151
        Yes       0.72      0.64      0.68      2595

avg / total       0.85      0.85      0.85     10746

[[7522  629]
 [ 944 1651]]


## Cross Validate Adaboost Model

In [352]:
#verify Adaboost with Cross Validation
scores_ADA = cross_val_score(bdt, features_train, target_train, cv=10) 
print("Cross Validation Score for each K",scores_ADA)
scores_ADA.mean()

Cross Validation Score for each K [0.86480293 0.85609533 0.86021998 0.86388634 0.85013749 0.8492209
 0.86795048 0.85878038 0.84915177 0.85779817]


0.8578043757540297

## Adaboost Model - 2

In [357]:
bdt = AdaBoostClassifier(DecisionTreeClassifier(criterion = 'entropy', max_depth=1),
                         algorithm="SAMME",
                         n_estimators=100)
bdt.fit(features_train, target_train)
predicted_bdt=bdt.predict(features_test)
expected = target_test
print("Adaboost Accuracy", accuracy_score(expected,predicted_bdt))
print(classification_report(expected, predicted_bdt,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_bdt))

Adaboost Accuracy 0.8553880513679508
             precision    recall  f1-score   support

         No       0.88      0.94      0.91      8151
        Yes       0.76      0.58      0.66      2595

avg / total       0.85      0.86      0.85     10746

[[7680  471]
 [1083 1512]]


## Cross Validate Adaboost Model

In [355]:
#verify Adaboost with Cross Validation
scores_ADA = cross_val_score(bdt, features_train, target_train, cv=10) 
print("Cross Validation Score for each K",scores_ADA)
scores_ADA.mean()

Cross Validation Score for each K [0.86251146 0.8583868  0.86434464 0.86296975 0.84830431 0.84692942
 0.85465383 0.85602934 0.8436497  0.85183486]


0.854961411668176

## Adaboost Model - 3

In [362]:
bdt = AdaBoostClassifier(DecisionTreeClassifier(criterion = 'entropy', max_depth=1),
                         algorithm="SAMME",
                         n_estimators=400, learning_rate = 0.5)
bdt.fit(features_train, target_train)
predicted_bdt=bdt.predict(features_test)
expected = target_test
print("Adaboost Accuracy", accuracy_score(expected,predicted_bdt))
print(classification_report(expected, predicted_bdt,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_bdt))

Adaboost Accuracy 0.8560394565419691
             precision    recall  f1-score   support

         No       0.88      0.94      0.91      8151
        Yes       0.76      0.58      0.66      2595

avg / total       0.85      0.86      0.85     10746

[[7683  468]
 [1079 1516]]


## Cross Validate Adaboost Model

In [363]:
#verify Adaboost with Cross Validation
scores_ADA = cross_val_score(bdt, features_train, target_train, cv=10) 
print("Cross Validation Score for each K",scores_ADA)
scores_ADA.mean()

Cross Validation Score for each K [0.86480293 0.85609533 0.86571952 0.8588451  0.85288726 0.84692942
 0.8610729  0.85740486 0.84135718 0.85137615]


0.8556490644927444

## Grid Search to narrow down n_estimators

In [364]:
param_grid = {"n_estimators": [300, 400, 500]}

# run grid search
grid_search = GridSearchCV(bdt, param_grid=param_grid,n_jobs=-1)
grid_search.fit(features_train, target_train)
print("Best", grid_search.best_params_) 

Best {'n_estimators': 500}


## Adaboost Model - 4

In [365]:
bdt = AdaBoostClassifier(DecisionTreeClassifier(criterion = 'entropy', max_depth=1),
                         algorithm="SAMME",
                         n_estimators=500, learning_rate = 0.5)
bdt.fit(features_train, target_train)
predicted_bdt=bdt.predict(features_test)
expected = target_test
print("Adaboost Accuracy", accuracy_score(expected,predicted_bdt))
print(classification_report(expected, predicted_bdt,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_bdt))

Adaboost Accuracy 0.8560394565419691
             precision    recall  f1-score   support

         No       0.88      0.94      0.91      8151
        Yes       0.76      0.59      0.66      2595

avg / total       0.85      0.86      0.85     10746

[[7679  472]
 [1075 1520]]


## Cross Validate Adaboost Model

In [366]:
#verify Adaboost with Cross Validation
scores_ADA = cross_val_score(bdt, features_train, target_train, cv=10) 
print("Cross Validation Score for each K",scores_ADA)
scores_ADA.mean()

Cross Validation Score for each K [0.86434464 0.85747021 0.86159487 0.8588451  0.85517874 0.84830431
 0.85969739 0.85878038 0.84181568 0.85321101]


0.8559242307727647

I ran 4 separate Adaboost models, all with slightly different parameters tuned and all using a DecisionTreeClassifier.  The first model I ran had a max_depth of 3 for the decision tree and then used the algorithm SAMME and n_estimators of 50 for the boosting and got an accuracy score of 0.853.  I cross validated this model 10 times to check for overfitting and stayed within 2% on all models.  The second model I ran I adjusted the criterion on the decision tree to 'entropy' and the max_depth to 1, maintained the same algorithm, and increased my n_estimators to 100.  This time my accuracy score increased to 0.855.  I also cross validated 10 times and saw no overfitting.  My third model I adjusted my n_estimators up to 400 and added a learning_rate of 0.5, different from the default of 1.  This resulted in a slightly better model with an accuracy score of 0.856.  Cross validating again showed a lack of overfitting.  Finally, I ran a grid search to find my best n_estimators number, which turned out to be 500.  I ran one final model, similar to model 3 but with n_estimators at 500, and got an accuracy score of 0.856. My third and fourth Adaboost models produced the same accuracy and precision scores, however Model 4 had a slightly higher recall so Model 4 is my best model.

# Random Forest Models

I will next run Random Forest classification models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be a basic Random Forest model with no default parameters tuned.  I will continue to adjust my remaining models from there and may run a grid search to find my ideal number of estimators with the goal of maximizing the accuracy.

## Random Forest Model - 1

In [267]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# train random forest model
rf = RandomForestClassifier()
rf.fit(features_train, target_train)
# test random forest model
target_predicted_rf = rf.predict(features_test)
print(accuracy_score(target_test, target_predicted_rf))
target_names = ["No", "Yes"]
print(classification_report(target_test, target_predicted_rf, target_names=target_names))

0.8465475525777033
             precision    recall  f1-score   support

         No       0.88      0.93      0.90      8151
        Yes       0.73      0.58      0.65      2595

avg / total       0.84      0.85      0.84     10746



## Cross Validate Random Forest

In [268]:
#verify Random Forest with Cross Validation
scores_rf = cross_val_score(rf, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_rf)
scores_rf.mean()  

Cross Validation Score for each K [0.85197067 0.85013749 0.84784601 0.84188818 0.84280477 0.85197067
 0.84915177 0.84273269 0.83356259 0.8440367 ]


0.8456101521740209

## Random Forest Model - 2

In [269]:
rf = RandomForestClassifier(n_estimators= 100, criterion = 'entropy')
rf.fit(features_train, target_train)
# test random forest model
target_predicted_rf = rf.predict(features_test)
print(accuracy_score(target_test, target_predicted_rf))
target_names = ["No", "Yes"]
print(classification_report(target_test, target_predicted_rf, target_names=target_names))

0.8534338358458962
             precision    recall  f1-score   support

         No       0.89      0.93      0.91      8151
        Yes       0.73      0.62      0.67      2595

avg / total       0.85      0.85      0.85     10746



## Cross Validate Random Forest

In [270]:
#verify Random Forest with Cross Validation
scores_rf = cross_val_score(rf, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_rf)
scores_rf.mean()  

Cross Validation Score for each K [0.86984418 0.85792851 0.85472044 0.85976169 0.84647113 0.85563703
 0.85740486 0.8436497  0.8523613  0.85917431]


0.8556953145961564

## Random Forest Model - 3

In [271]:
rf = RandomForestClassifier(n_estimators= 500, n_jobs=4,oob_score=True, criterion = 'entropy')
rf.fit(features_train, target_train)
# test random forest model
target_predicted_rf = rf.predict(features_test)
print(accuracy_score(target_test, target_predicted_rf))
target_names = ["No", "Yes"]
print(classification_report(target_test, target_predicted_rf, target_names=target_names))

0.8561325144239718
             precision    recall  f1-score   support

         No       0.89      0.93      0.91      8151
        Yes       0.74      0.63      0.68      2595

avg / total       0.85      0.86      0.85     10746



## Cross Validate Random Forest

In [272]:
#verify Random Forest with Cross Validation
scores_rf = cross_val_score(rf, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_rf)
scores_rf.mean()  

Cross Validation Score for each K [0.86709441 0.86021998 0.85197067 0.85930339 0.84967919 0.8588451
 0.85786337 0.84823475 0.84548372 0.8559633 ]


0.855465788654844

## Grid Search to narrow down n_estimators

In [335]:
# use a full grid over several parameters
from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [450, 500, 550]}

# run grid search
grid_search = GridSearchCV(rf, param_grid=param_grid,n_jobs=-1)
grid_search.fit(features_train, target_train)
print("Best", grid_search.best_params_) 

Best {'n_estimators': 500}


I ran 3 separate Random Forest models, all with slightly different parameters tuned.  The first model had no default parameters tuned and got an accuracy score of 0.846.  I cross validated this model 10 times to check for overfitting and stayed within 2% on all models.  The second model I ran I adjusted the criterion to entropy, for information gain, and the n_estimators to 100, from the default 10.  This time my accuracy score decreased to 0.853.  I also cross validated 10 times and saw no overfitting.  My final model I adjusted criterion to entropy, n_estimators = 500, n_jobs = 4, and my oob_score to True, which let us use use out-of-bag samples. This resulted in a slightly better model with an accuracy score of 0.856.  Cross validating again showed a lack of overfitting.  I ran a grid search as well to try and find my best number of estimators which resulted in my best number still being 500 like I used in my final model. My final Random Forest is my best model for this section.

# Bagging Classifier Models

I will next run Bagging Classifier classification models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be a model with n_estimators, or the number of different data sets, set to 101 and a random state set to 0.  I will continue to adjust my remaining models from there and may run a grid search to find my best number of estimators with the goal of maximizing the accuracy.

## Bagging Classifier Model - 1

In [298]:
#Bagging Classifer
from sklearn.ensemble import BaggingClassifier
clf_bag = BaggingClassifier(n_estimators=101, random_state=0) #101 different data sets. Number of estimators needs to be an odd number because your votes could end in a tie.
print(clf_bag)
clf_bag.fit(features_train, target_train)
predicted_bag=clf_bag.predict(features_test)
expected = target_test
print("Bagging Accuracy", accuracy_score(expected,predicted_bag))
print(classification_report(expected, predicted_bag,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_bag))

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=101, n_jobs=1, oob_score=False, random_state=0,
         verbose=0, warm_start=False)
Bagging Accuracy 0.8535268937278988
             precision    recall  f1-score   support

         No       0.89      0.92      0.91      8151
        Yes       0.72      0.64      0.68      2595

avg / total       0.85      0.85      0.85     10746

[[7520  631]
 [ 943 1652]]


## Cross Validate Bagging Classifier

In [299]:
#verify Random Forest with Cross Validation
scores_bc = cross_val_score(clf_bag, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_bc)
scores_bc.mean()  

Cross Validation Score for each K [0.87305225 0.86113657 0.8492209  0.86434464 0.84601283 0.85013749
 0.85786337 0.85740486 0.84869326 0.85045872]


0.8558324875763862

## Bagging Classifier Model - 2

In [336]:
#Bagging Classifer
from sklearn.ensemble import BaggingClassifier
clf_bag = BaggingClassifier(n_estimators=101, random_state=0, bootstrap_features = True)
print(clf_bag)
clf_bag.fit(features_train, target_train)
predicted_bag=clf_bag.predict(features_test)
expected = target_test
print("Bagging Accuracy", accuracy_score(expected,predicted_bag))
print(classification_report(expected, predicted_bag,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_bag))

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=True, max_features=1.0, max_samples=1.0,
         n_estimators=101, n_jobs=1, oob_score=False, random_state=0,
         verbose=0, warm_start=False)
Bagging Accuracy 0.861250697934115
             precision    recall  f1-score   support

         No       0.89      0.93      0.91      8151
        Yes       0.75      0.63      0.69      2595

avg / total       0.86      0.86      0.86     10746

[[7610  541]
 [ 950 1645]]


## Cross Validate Bagging Classifier

In [301]:
#verify Random Forest with Cross Validation
scores_bc = cross_val_score(clf_bag, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_bc)
scores_bc.mean()  

Cross Validation Score for each K [0.87396884 0.86159487 0.86480293 0.86296975 0.84784601 0.85426214
 0.86428244 0.8610729  0.85419532 0.85917431]


0.860416952304706

## Bagging Classifier Model - 3

In [307]:
#Bagging Classifer
from sklearn.ensemble import BaggingClassifier
clf_bag = BaggingClassifier(n_estimators=201, random_state = 0, n_jobs = 4, warm_start = True)
print(clf_bag)
clf_bag.fit(features_train, target_train)
predicted_bag=clf_bag.predict(features_test)
expected = target_test
print("Bagging Accuracy", accuracy_score(expected,predicted_bag))
print(classification_report(expected, predicted_bag,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_bag))

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=201, n_jobs=4, oob_score=False, random_state=0,
         verbose=0, warm_start=True)
Bagging Accuracy 0.854178298901917
             precision    recall  f1-score   support

         No       0.89      0.92      0.91      8151
        Yes       0.73      0.63      0.68      2595

avg / total       0.85      0.85      0.85     10746

[[7537  614]
 [ 953 1642]]


## Cross Validate Bagging Classifier

In [308]:
#verify Random Forest with Cross Validation
scores_bc = cross_val_score(clf_bag, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_bc)
scores_bc.mean()  

Cross Validation Score for each K [0.87351054 0.85976169 0.85105408 0.85701192 0.84601283 0.85426214
 0.85740486 0.85557084 0.84548372 0.85183486]


0.855190748356877

## Grid Search to narrow down n_estimators

In [337]:
# use a full grid over several parameters
from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [77, 101, 151]}

# run grid search
grid_search = GridSearchCV(clf_bag, param_grid=param_grid,n_jobs=-1)
grid_search.fit(features_train, target_train)
print("Best", grid_search.best_params_) 

Best {'n_estimators': 101}


I ran 3 separate Bagging Classifier models, all with slightly different parameters tuned.  The first model had n_estimators set to 101 and a random_state of 0 and got an accuracy score of 0.853.  I cross validated this model 10 times to check for overfitting and stayed within 2% on all models.  The second model I ran I added in an additional parameter setting the bootstrap_features to True.  This time my accuracy score increased to 0.861.  I also cross validated 10 times and saw no overfitting.  My final model I adjusted n_estimators to 201, removed the bootstrap_features parameter, set n_jobs to 4 and warm_start to True. This resulted in a slightly worse model with an accuracy score of 0.854.  Cross validating again showed a lack of overfitting.  I ran a grid search as well to try and find my best number of estimators against my second, best performing model, which resulted in my best number still being 101 like I used in my second model. My second Bagging Classifier is my best model for this section.

# SVM Linear Models

I will next run Support Vector Machine Linear models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be a model the kernel set to 'linear' since we are running a linear model, C = 1, class_weight = 'balanced', and gamma = 'auto'.  I will continue to adjust my remaining models from there and may run a grid search to find my best C with the goal of maximizing the accuracy.

## SVM Linear Model - 1

In [368]:
import time
start_time = time.clock()
from sklearn.svm import SVC
#standard linear SVC
clf_lin = SVC(kernel='linear', C=1.0, class_weight='balanced',gamma='auto')
clf_lin.fit(features_train, target_train)
predicted_SVM=clf_lin.predict(features_test)
expected = target_test
# summarize the fit of the model
print(classification_report(expected, predicted_SVM,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_SVM))
print(accuracy_score(expected,predicted_SVM))
print("Time to run", time.clock() - start_time, "seconds")

             precision    recall  f1-score   support

         No       0.94      0.77      0.85      8151
        Yes       0.54      0.86      0.66      2595

avg / total       0.85      0.79      0.80     10746

[[6237 1914]
 [ 369 2226]]
0.7875488553880514
Time to run 50.067325344221764 seconds


## Cross Validate SVM Linear Model

In [369]:
#verify SVM with Cross Validation
scores_SVML = cross_val_score(clf_lin, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_SVML)
scores_SVML.mean()  

Cross Validation Score for each K [0.80109991 0.77956004 0.77910174 0.79651696 0.77406049 0.77726856
 0.7826685  0.78954608 0.77624943 0.79954128]


0.7855612991098141

## SVM Linear Model - 2

In [370]:
start_time = time.clock()
clf_lin = SVC(kernel='linear', C=0.5, class_weight='balanced', random_state = 0)
clf_lin.fit(features_train, target_train)
predicted_SVM=clf_lin.predict(features_test)
expected = target_test
# summarize the fit of the model
print(classification_report(expected, predicted_SVM,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_SVM))
print(accuracy_score(expected,predicted_SVM))
print("Time to run", time.clock() - start_time, "seconds")

             precision    recall  f1-score   support

         No       0.94      0.76      0.84      8151
        Yes       0.54      0.86      0.66      2595

avg / total       0.85      0.79      0.80     10746

[[6231 1920]
 [ 369 2226]]
0.7869905080960358
Time to run 44.802260426711655 seconds


## Cross Validate SVM Linear Model

In [371]:
#verify SMV Linear with Cross Validation
scores_SVML = cross_val_score(clf_lin, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_SVML)
scores_SVML.mean()  

Cross Validation Score for each K [0.8015582  0.77956004 0.77910174 0.79743355 0.77497709 0.77772686
 0.78221    0.78954608 0.77624943 0.79908257]


0.7857445541081429

## Find the Best C Value

In [372]:
parameters = {'C':[.01,.05,1]} #have to experiment with which C's to use. 
svr = SVC(kernel='linear')
grid_svm = GridSearchCV(svr, parameters,n_jobs=-1, cv=5)
grid_svm.fit(features_train, target_train)
print("SCORES", grid_svm.cv_results_)
print("BEST SCORE", grid_svm.best_score_)
print("BEST PARAM", grid_svm.best_params_)
print("Time to run", time.clock() - start_time, "seconds")

SCORES {'mean_fit_time': array([38.69393368, 43.45027261, 54.13019481]), 'std_fit_time': array([0.29479063, 4.65361473, 6.66808474]), 'mean_score_time': array([5.33792605, 4.76717529, 4.29064469]), 'std_score_time': array([0.28207349, 0.29737702, 0.72671846]), 'param_C': masked_array(data=[0.01, 0.05, 1],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 0.01}, {'C': 0.05}, {'C': 1}], 'split0_test_score': array([0.85174152, 0.85426214, 0.85609533]), 'split1_test_score': array([0.84987394, 0.85308274, 0.85491634]), 'split2_test_score': array([0.84208114, 0.84597754, 0.84483154]), 'split3_test_score': array([0.84826954, 0.85079074, 0.85056154]), 'split4_test_score': array([0.84708849, 0.84731774, 0.84869326]), 'mean_test_score': array([0.84781114, 0.8502865 , 0.85101994]), 'std_test_score': array([0.0032641 , 0.00320191, 0.00411986]), 'rank_test_score': array([3, 2, 1]), 'split0_train_score': array([0.84831815, 0.8509541 , 0.85198

In [373]:
parameters = {'C':[3,4,9,10]} #have to experiment with which C's to use.
svr = SVC(kernel='linear')
grid_svm = GridSearchCV(svr, parameters,n_jobs=-1, cv=5)
grid_svm.fit(features_train, target_train)
print("SCORES", grid_svm.cv_results_)
print("BEST SCORE", grid_svm.best_score_)
print("BEST PARAM", grid_svm.best_params_)
print("Time to run", time.clock() - start_time, "seconds")

SCORES {'mean_fit_time': array([ 64.56446548,  70.53272147,  95.65852785, 102.05991058]), 'std_fit_time': array([2.39984652, 4.96114301, 5.07587043, 4.86419455]), 'mean_score_time': array([4.68372855, 4.65247626, 4.17992029, 4.04845419]), 'std_score_time': array([0.5197321 , 0.40905879, 0.2516956 , 0.82706492]), 'param_C': masked_array(data=[3, 4, 9, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 3}, {'C': 4}, {'C': 9}, {'C': 10}], 'split0_test_score': array([0.85563703, 0.85586618, 0.85586618, 0.85586618]), 'split1_test_score': array([0.85468714, 0.85468714, 0.85468714, 0.85468714]), 'split2_test_score': array([0.84437314, 0.84437314, 0.84437314, 0.84437314]), 'split3_test_score': array([0.85056154, 0.85056154, 0.85056154, 0.85056154]), 'split4_test_score': array([0.84892251, 0.84915177, 0.84892251, 0.84892251]), 'mean_test_score': array([0.85083658, 0.85092826, 0.85088242, 0.85088242]), 'std_test_score': array([

## SVM Linear Model - 3

In [374]:
start_time = time.clock()
clf_lin = SVC(kernel='linear', C=1)
clf_lin.fit(features_train, target_train)
predicted_SVM=clf_lin.predict(features_test)
expected = target_test
# summarize the fit of the model
print(classification_report(expected, predicted_SVM,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_SVM))
print(accuracy_score(expected,predicted_SVM))
print("Time to run", time.clock() - start_time, "seconds")

             precision    recall  f1-score   support

         No       0.88      0.94      0.91      8151
        Yes       0.75      0.59      0.66      2595

avg / total       0.85      0.85      0.85     10746

[[7631  520]
 [1070 1525]]
0.852037967615857
Time to run 51.91454774473095 seconds


## Cross Validate SVM Linear Model

In [375]:
#verify SVM Linear with Cross Validation
scores_SVML = cross_val_score(clf_lin, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_SVML)
scores_SVML.mean()  

Cross Validation Score for each K [0.85701192 0.85655362 0.84738772 0.86113657 0.84372136 0.84647113
 0.85098579 0.85098579 0.84135718 0.85825688]


0.8513867938822065

I ran 3 separate Support Vector Machine Linear models, all with slightly different parameters tuned.  The first model had kernel set to 'linear', as all 3 models did, C = 1.0, class_weight = 'balanced', and gamma = 'auto' and got an accuracy score of 0.787.  I cross validated this model 10 times to check for overfitting and stayed within 3% on all models.  The second model I ran I added in an additional parameter setting the random_state to 0, removed the gamma parameter, and changed my C to 0.5.  This time my accuracy score decreased slightly to 0.786.  I also cross validated 10 times and saw no overfitting.  I ran a grid search next to try and find my best C value. I ran 7 different C values through a basic SMV linear model with no additional parameters set and cross validated each 5 times. My best C value turned out to be a 1 with a model accuracy of 0.851. For my final model I adjusted C to 1 and had no other parameters set other than kernel = 'linear'. This resulted in my best accuracy score with my simplest model of 0.852.  Cross validating again showed a lack of overfitting.  My final SVM Linear is my best model for this section.

# Gradient Boosting Models

## Gradient Boosting Model - 1

In [309]:
from sklearn.ensemble import GradientBoostingClassifier
clf_GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=0.7, max_depth=1, random_state=0)
clf_GBC.fit(features_train, target_train)
predicted_GBC=clf_GBC.predict(features_test)
expected = target_test
print("Gradient Boost Accuracy", accuracy_score(expected,predicted_GBC))
print(classification_report(expected, predicted_GBC,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_GBC))


Gradient Boost Accuracy 0.8604131769960915
             precision    recall  f1-score   support

         No       0.88      0.94      0.91      8151
        Yes       0.77      0.61      0.68      2595

avg / total       0.85      0.86      0.85     10746

[[7671  480]
 [1020 1575]]


## Cross Validate Gradient Boosting

In [310]:
#verify Random Forest with Cross Validation
scores_gb = cross_val_score(clf_GBC, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_gb)
scores_gb.mean()  

Cross Validation Score for each K [0.88038497 0.86892759 0.86342805 0.86571952 0.85747021 0.85472044
 0.87116002 0.86336543 0.85602934 0.86834862]


0.8649554194335053

## Gradient Boosting Model - 2

In [347]:
clf_GBC = GradientBoostingClassifier(loss = 'exponential', n_estimators=200, learning_rate=0.9, max_depth=1, random_state=0)
clf_GBC.fit(features_train, target_train)
predicted_GBC=clf_GBC.predict(features_test)
expected = target_test
print("Gradient Boost Accuracy", accuracy_score(expected,predicted_GBC))
print(classification_report(expected, predicted_GBC,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_GBC))


Gradient Boost Accuracy 0.8644146659222036
             precision    recall  f1-score   support

         No       0.89      0.94      0.91      8151
        Yes       0.76      0.64      0.69      2595

avg / total       0.86      0.86      0.86     10746

[[7629  522]
 [ 935 1660]]


## Cross Validate Gradient Boosting

In [348]:
#verify Random Forest with Cross Validation
scores_gb = cross_val_score(clf_GBC, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_gb)
scores_gb.mean()  

Cross Validation Score for each K [0.88359303 0.87488543 0.86434464 0.87396884 0.85609533 0.85747021
 0.87253553 0.86382393 0.85923888 0.86651376]


0.8672469581059723

## Gradient Boosting Model - 3

In [327]:
clf_GBC = GradientBoostingClassifier(n_estimators=400, learning_rate=1.0, max_depth=1, random_state=0, criterion = 'mse')
clf_GBC.fit(features_train, target_train)
predicted_GBC=clf_GBC.predict(features_test)
expected = target_test
print("Gradient Boost Accuracy", accuracy_score(expected,predicted_GBC))
print(classification_report(expected, predicted_GBC,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_GBC))


Gradient Boost Accuracy 0.8568769774799926
             precision    recall  f1-score   support

         No       0.88      0.94      0.91      8151
        Yes       0.75      0.61      0.67      2595

avg / total       0.85      0.86      0.85     10746

[[7629  522]
 [1016 1579]]


## Cross Validate Gradient Boosting

In [328]:
#verify Random Forest with Cross Validation
scores_gb = cross_val_score(clf_GBC, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_gb)
scores_gb.mean()  

Cross Validation Score for each K [0.86984418 0.85976169 0.86205316 0.86434464 0.85426214 0.86388634
 0.86474094 0.85832187 0.84915177 0.86009174]


0.8606458477574108

## Grid Search to narrow down n_estimators

In [349]:
param_grid = {"n_estimators": [150, 200, 250]}

# run grid search
grid_search = GridSearchCV(clf_GBC, param_grid=param_grid,n_jobs=-1)
grid_search.fit(features_train, target_train)
print("Best", grid_search.best_params_) 

Best {'n_estimators': 200}


I ran 3 separate Gradient Boosting models, plus a gridsearch on the n_estimators, all with slightly different parameters tuned.  The first model had n_estimators set to 100, a random_state of 0, learning_rate of 0.7, and a max_depth of 1 and got an accuracy score of 0.860.  I cross validated this model 10 times to check for overfitting and stayed within 3% on all models.  The second model I ran I added in an additional parameter setting the loss to 'exponential', plus I changed the n_estimators to 200 and the learning_rate to 0.9.  This time my accuracy score increased to 0.864.  I also cross validated 10 times and saw no overfitting.  My final model I adjusted n_estimators to 400, removed the loss parameter, set learning_rate to 1 and criterion to 'mse'. This resulted in a slightly worse model with an accuracy score of 0.856.  Cross validating again showed a lack of overfitting.  I ran a grid search as well to try and find my best number of estimators against my second, best performing model, which resulted in my best number still being 200 like I used in my second model. My second Gradient Boosting model is my best model for this section.

# SVM RBF Models

I will next run Support Vector Machine RBF models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be a model the kernel set to 'rbf' since we are running a linear model, C = 1, class_weight = 'balanced', and gamma = 'auto'.  I will continue to adjust my remaining models from there and may run a grid search to find my best C with the goal of maximizing the accuracy.

## SVM RBF Model - 1

In [378]:
#standard RBF SVM Model
clf_rbf = SVC(kernel='rbf', C=1.0, class_weight='balanced',gamma='auto')
clf_rbf.fit(features_train, target_train)
predicted_SVM=clf_rbf.predict(features_test)
expected = target_test
# summarize the fit of the model
print(classification_report(expected, predicted_SVM,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_SVM))
print(accuracy_score(expected,predicted_SVM))
print("Time to run", time.clock() - start_time, "seconds")

             precision    recall  f1-score   support

         No       0.95      0.78      0.86      8151
        Yes       0.56      0.87      0.68      2595

avg / total       0.85      0.80      0.81     10746

[[6371 1780]
 [ 344 2251]]
0.8023450586264657
Time to run 2926.8777479404816 seconds


## Cross Validate SVM RBF Model

In [379]:
#verify SVM with Cross Validation
scores_SVMR = cross_val_score(clf_rbf, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_SVMR)
scores_SVMR.mean()  

Cross Validation Score for each K [0.81209899 0.79835014 0.79101742 0.80843263 0.78230981 0.78872594
 0.78908757 0.80146722 0.79046309 0.80458716]


0.7966539959758768

## SVM RBF Model - 2

In [380]:
start_time = time.clock()
clf_rbf = SVC(kernel='rbf', C=0.5, class_weight='balanced', random_state = 0)
clf_rbf.fit(features_train, target_train)
predicted_SVM=clf_rbf.predict(features_test)
expected = target_test
# summarize the fit of the model
print(classification_report(expected, predicted_SVM,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_SVM))
print(accuracy_score(expected,predicted_SVM))
print("Time to run", time.clock() - start_time, "seconds")

             precision    recall  f1-score   support

         No       0.95      0.77      0.85      8151
        Yes       0.54      0.87      0.67      2595

avg / total       0.85      0.79      0.80     10746

[[6248 1903]
 [ 341 2254]]
0.791178112786153
Time to run 58.81894648846355 seconds


## Cross Validate SVM RBF Model

In [381]:
scores_SVMR = cross_val_score(clf_rbf, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_SVMR)
scores_SVMR.mean()  

Cross Validation Score for each K [0.80064161 0.78735105 0.77956004 0.79789184 0.7731439  0.77772686
 0.77624943 0.79642366 0.77670793 0.79862385]


0.786432017815046

## Find the Best C Value

In [382]:
parameters = {'C':[.01,.05,1]} #have to experiment with which C's to use. 
svr = SVC(kernel='rbf')
grid_svm = GridSearchCV(svr, parameters,n_jobs=-1, cv=5)
grid_svm.fit(features_train, target_train)
print("SCORES", grid_svm.cv_results_)
print("BEST SCORE", grid_svm.best_score_)
print("BEST PARAM", grid_svm.best_params_)
print("Time to run", time.clock() - start_time, "seconds")

SCORES {'mean_fit_time': array([51.66957312, 47.1669734 , 36.21784201]), 'std_fit_time': array([1.32462389, 1.59379833, 2.78810512]), 'mean_score_time': array([7.49796715, 6.53334064, 4.52351556]), 'std_score_time': array([0.34181267, 0.15449999, 0.75941942]), 'param_C': masked_array(data=[0.01, 0.05, 1],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 0.01}, {'C': 0.05}, {'C': 1}], 'split0_test_score': array([0.78139322, 0.84601283, 0.8588451 ]), 'split1_test_score': array([0.78294751, 0.83795554, 0.85033234]), 'split2_test_score': array([0.78203071, 0.83543433, 0.84437314]), 'split3_test_score': array([0.78271831, 0.83635113, 0.85720834]), 'split4_test_score': array([0.78037597, 0.83677212, 0.84777625]), 'mean_test_score': array([0.78189319, 0.83850562, 0.85170754]), 'std_test_score': array([0.00093411, 0.00384064, 0.00551999]), 'rank_test_score': array([3, 2, 1]), 'split0_train_score': array([0.77961148, 0.83697209, 0.85250

In [383]:
parameters = {'C':[3,4,9,10]} #have to experiment with which C's to use.
svr = SVC(kernel='rbf')
grid_svm = GridSearchCV(svr, parameters,n_jobs=-1, cv=5)
grid_svm.fit(features_train, target_train)
print("SCORES", grid_svm.cv_results_)
print("BEST SCORE", grid_svm.best_score_)
print("BEST PARAM", grid_svm.best_params_)
print("Time to run", time.clock() - start_time, "seconds")

SCORES {'mean_fit_time': array([41.64142442, 37.3972538 , 37.93918581, 41.21489067]), 'std_fit_time': array([1.52497066, 1.48762975, 0.36958532, 1.51968874]), 'mean_score_time': array([6.04763503, 4.55941014, 4.43094711, 4.37291465]), 'std_score_time': array([0.78420407, 0.12081133, 0.09255344, 0.05092132]), 'param_C': masked_array(data=[3, 4, 9, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 3}, {'C': 4}, {'C': 9}, {'C': 10}], 'split0_test_score': array([0.8627406 , 0.8631989 , 0.86388634, 0.8631989 ]), 'split1_test_score': array([0.85377034, 0.85491634, 0.85422874, 0.85377034]), 'split2_test_score': array([0.84620674, 0.84506074, 0.84804034, 0.84758194]), 'split3_test_score': array([0.85904194, 0.85927114, 0.85858354, 0.85835434]), 'split4_test_score': array([0.8519028 , 0.8523613 , 0.85006878, 0.85006878]), 'mean_test_score': array([0.85473298, 0.85496218, 0.85496218, 0.85459546]), 'std_test_score': array([0.00

## SVM RBF Model - 3

In [384]:
start_time = time.clock()
clf_rbf = SVC(kernel='rbf', C=4)
clf_rbf.fit(features_train, target_train)
predicted_SVM=clf_rbf.predict(features_test)
expected = target_test
# summarize the fit of the model
print(classification_report(expected, predicted_SVM,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_SVM))
print(accuracy_score(expected,predicted_SVM))
print("Time to run", time.clock() - start_time, "seconds")

             precision    recall  f1-score   support

         No       0.88      0.94      0.91      8151
        Yes       0.76      0.59      0.66      2595

avg / total       0.85      0.86      0.85     10746

[[7662  489]
 [1066 1529]]
0.8552949934859483
Time to run 37.43058285675943 seconds


## Cross Validate SVM RBF Model

In [385]:
scores_SVMR = cross_val_score(clf_rbf, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_SVMR)
scores_SVMR.mean()  

Cross Validation Score for each K [0.86892759 0.86067828 0.85334555 0.85792851 0.84463795 0.84647113
 0.86382393 0.85694635 0.84181568 0.86146789]


0.85560428605635

I ran 3 separate Support Vector Machine RBF models, all with slightly different parameters tuned.  The first model had kernel set to 'rbf', as all 3 models did, C = 1.0, class_weight = 'balanced', and gamma = 'auto' and got an accuracy score of 0.802.  I cross validated this model 10 times to check for overfitting and stayed within 2% on all models.  The second model I ran I added in an additional parameter setting the random_state to 0, removed the gamma parameter, and changed my C to 0.5.  This time my accuracy score decreased slightly to 0.791.  I also cross validated 10 times and saw no overfitting.  I ran a grid search next to try and find my best C value. I ran 7 different C values through a basic SMV RBF model with no additional parameters set and cross validated each 5 times. My best C value turned out to be a 4 with a model accuracy of 0.854. For my final model I adjusted C to 4 and had no other parameters set other than kernel = 'rbf'. This resulted in my best accuracy score with my simplest model of 0.855.  Cross validating again showed a lack of overfitting.  My final SVM RBF model is the best for this section.

# Extra Trees Models

I will next run Extra Trees classification models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be a basic Extra Trees model with no default parameters tuned, meaning we will have 10 trees.  I will continue to adjust my remaining models from there and may run a grid search on the number of estimators used with the goal of maximizing the accuracy.

## Extra Trees Model - 1

In [273]:
from sklearn.ensemble import ExtraTreesClassifier
xdt = ExtraTreesClassifier()
xdt.fit(features_train, target_train)
predicted_xdt=xdt.predict(features_test)
expected = target_test
print("Extra Trees", accuracy_score(expected,predicted_xdt))
print(classification_report(expected, predicted_xdt,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_xdt))

Extra Trees 0.8257025870091197
             precision    recall  f1-score   support

         No       0.86      0.91      0.89      8151
        Yes       0.67      0.55      0.60      2595

avg / total       0.82      0.83      0.82     10746

[[7445  706]
 [1167 1428]]


## Cross Validate Extra Trees

In [274]:
#verify Extra Trees with Cross Validation
scores_et = cross_val_score(xdt, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_et)
scores_et.mean()  

Cross Validation Score for each K [0.83593034 0.82951421 0.83272227 0.82813932 0.82813932 0.83409716
 0.82301696 0.81522238 0.82255846 0.82844037]


0.8277780787595379

## Extra Trees Model - 2

In [341]:
xdt = ExtraTreesClassifier(criterion = 'entropy',n_estimators=40, warm_start = True, class_weight = 'balanced_subsample')
xdt.fit(features_train, target_train)
predicted_xdt=xdt.predict(features_test)
expected = target_test
print("Extra Trees", accuracy_score(expected,predicted_xdt))
print(classification_report(expected, predicted_xdt,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_xdt))

Extra Trees 0.8299832495812395
             precision    recall  f1-score   support

         No       0.87      0.91      0.89      8151
        Yes       0.67      0.58      0.62      2595

avg / total       0.82      0.83      0.83     10746

[[7408  743]
 [1084 1511]]


## Cross Validate Extra Trees

In [342]:
#verify Extra Trees with Cross Validation
scores_et = cross_val_score(xdt, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_et)
scores_et.mean()  

Cross Validation Score for each K [0.83913841 0.83363886 0.83226398 0.83547204 0.82722273 0.83226398
 0.83218707 0.82072444 0.82393398 0.83669725]


0.8313542731429486

## Extra Trees Model - 3

In [343]:
xdt = ExtraTreesClassifier(criterion = 'entropy', n_estimators=100, class_weight = 'balanced', n_jobs = 4)
xdt.fit(features_train, target_train)
predicted_xdt=xdt.predict(features_test)
expected = target_test
print("Extra Trees", accuracy_score(expected,predicted_xdt))
print(classification_report(expected, predicted_xdt,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_xdt))

Extra Trees 0.8344500279173646
             precision    recall  f1-score   support

         No       0.88      0.91      0.89      8151
        Yes       0.68      0.60      0.63      2595

avg / total       0.83      0.83      0.83     10746

[[7422  729]
 [1050 1545]]


## Cross Validate Extra Trees

In [344]:
#verify Extra Trees with Cross Validation
scores_et = cross_val_score(xdt, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_et)
scores_et.mean()  

Cross Validation Score for each K [0.84463795 0.83363886 0.83501375 0.83776352 0.83088909 0.83776352
 0.83127006 0.82164145 0.8262265  0.83394495]


0.8332789655324632

## Grid Search to narrow down n_estimators

In [345]:
# use a full grid over several parameters
param_grid = {"n_estimators": [70, 100, 150]}

# run grid search
grid_search = GridSearchCV(xdt, param_grid=param_grid,n_jobs=-1)
grid_search.fit(features_train, target_train)
print("Best", grid_search.best_params_) 

Best {'n_estimators': 70}


## Extra Trees Model - 4

In [346]:
xdt = ExtraTreesClassifier(criterion = 'entropy', n_estimators=70, class_weight = 'balanced', n_jobs = 4)
xdt.fit(features_train, target_train)
predicted_xdt=xdt.predict(features_test)
expected = target_test
print("Extra Trees", accuracy_score(expected,predicted_xdt))
print(classification_report(expected, predicted_xdt,target_names=['No', 'Yes']))
print(confusion_matrix(expected, predicted_xdt))

Extra Trees 0.8330541596873255
             precision    recall  f1-score   support

         No       0.87      0.91      0.89      8151
        Yes       0.68      0.59      0.63      2595

avg / total       0.83      0.83      0.83     10746

[[7419  732]
 [1062 1533]]


## Cross Validate Extra Trees

In [344]:
#verify Extra Trees with Cross Validation
scores_et = cross_val_score(xdt, features_train, target_train, cv=10)  
print("Cross Validation Score for each K",scores_et)
scores_et.mean()  

Cross Validation Score for each K [0.84463795 0.83363886 0.83501375 0.83776352 0.83088909 0.83776352
 0.83127006 0.82164145 0.8262265  0.83394495]


0.8332789655324632

I ran 4 separate Extra Tress models, all with slightly different parameters tuned.  The first model I ran was the base model got an accuracy score of 0.825.  I cross validated this model 10 times to check for overfitting and stayed within 2% on all models.  The second model I ran I adjusted the criterion to 'entropy', n_estimators to 40 from the default of 10, set warm_start to True, and set the class_weight to 'balanced_subsample'.  This time my accuracy score increased to 0.829.  I also cross validated 10 times and saw no overfitting.  My third model I adjusted n_estimators to 100, kept criterion equal to 'entropy' and class_weight equal to 'balanced', and now adjusted n_jobs to 4, while removing the class_weight parameter.  This resulted in a slightly better model with an accuracy score of 0.834.  Cross validating again showed a lack of overfitting.  Finally, I ran a grid search to find my best n_estimators number, which turned out to be 70.  I ran one final model, similar to model 3 but with n_estimators at 70, and got an accuracy score of 0.833. My third Extra Trees Model is my best model for this section.

# ANN Models

I will next run ANN classification models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will be a basic MLPClassifier model with no default parameters tuned.  I will continue to adjust my remaining models from there with the goal of maximizing the accuracy.

## ANN Model - 1

In [409]:
from sklearn.neural_network import MLPClassifier
clf_ann =MLPClassifier()
clf_ann.fit(features_train, target_train)
target_predicted_ann = clf_ann.predict(features_test)
print("Accuracy", accuracy_score(target_test, target_predicted_ann))
target_names = ["Salaray <= 50k", "Salary >= 50k"]
print(classification_report(target_test, target_predicted_ann, target_names=target_names))
print(confusion_matrix(target_test, target_predicted_ann))

Accuracy 0.85250325702587
                precision    recall  f1-score   support

Salaray <= 50k       0.90      0.91      0.90      8151
 Salary >= 50k       0.70      0.67      0.69      2595

   avg / total       0.85      0.85      0.85     10746

[[7420  731]
 [ 854 1741]]


## ANN Model - 2

In [422]:
clf_ann =MLPClassifier(hidden_layer_sizes=(10,10,10), max_iter=500, alpha=1e-05,
                     random_state=21,tol=0.000000001)
clf_ann.fit(features_train, target_train)
target_predicted_ann = clf_ann.predict(features_test)
print("Accuracy", accuracy_score(target_test, target_predicted_ann))
target_names = ["Salaray <= 50k", "Salary >= 50k"]
print(classification_report(target_test, target_predicted_ann, target_names=target_names))
print(confusion_matrix(target_test, target_predicted_ann))

Accuracy 0.8575283826540108
                precision    recall  f1-score   support

Salaray <= 50k       0.89      0.93      0.91      8151
 Salary >= 50k       0.74      0.63      0.68      2595

   avg / total       0.85      0.86      0.85     10746

[[7579  572]
 [ 959 1636]]


## ANN Model - 3

In [421]:
clf_ann =MLPClassifier(hidden_layer_sizes=(5,5,5,5), max_iter=500, alpha=1e-05,
                     random_state=21,tol=0.000000001)
clf_ann.fit(features_train, target_train)
target_predicted_ann = clf_ann.predict(features_test)
print("Accuracy", accuracy_score(target_test, target_predicted_ann))
target_names = ["Salaray <= 50k", "Salary >= 50k"]
print(classification_report(target_test, target_predicted_ann, target_names=target_names))
print(confusion_matrix(target_test, target_predicted_ann))

Accuracy 0.8580867299460264
                precision    recall  f1-score   support

Salaray <= 50k       0.88      0.94      0.91      8151
 Salary >= 50k       0.75      0.61      0.68      2595

   avg / total       0.85      0.86      0.85     10746

[[7627  524]
 [1001 1594]]


## ANN Model - 4

In [427]:
clf_ann =MLPClassifier(hidden_layer_sizes=(5,5,5,5), max_iter=200, alpha=0.4,
                     random_state=21,tol=0.000000001)
clf_ann.fit(features_train, target_train)
target_predicted_ann = clf_ann.predict(features_test)
print("Accuracy", accuracy_score(target_test, target_predicted_ann))
target_names = ["Salaray <= 50k", "Salary >= 50k"]
print(classification_report(target_test, target_predicted_ann, target_names=target_names))
print(confusion_matrix(target_test, target_predicted_ann))

Accuracy 0.8577144984180161
                precision    recall  f1-score   support

Salaray <= 50k       0.89      0.93      0.91      8151
 Salary >= 50k       0.74      0.64      0.68      2595

   avg / total       0.85      0.86      0.85     10746

[[7569  582]
 [ 947 1648]]


## ANN Model - 5

In [428]:
clf_ann =MLPClassifier(hidden_layer_sizes=(100,100), max_iter=500, alpha=1e-05,
                     random_state=21,tol=0.000000001)
clf_ann.fit(features_train, target_train)
target_predicted_ann = clf_ann.predict(features_test)
print("Accuracy", accuracy_score(target_test, target_predicted_ann))
target_names = ["Salaray <= 50k", "Salary >= 50k"]
print(classification_report(target_test, target_predicted_ann, target_names=target_names))
print(confusion_matrix(target_test, target_predicted_ann))

Accuracy 0.8324027545133073
                precision    recall  f1-score   support

Salaray <= 50k       0.88      0.90      0.89      8151
 Salary >= 50k       0.66      0.62      0.64      2595

   avg / total       0.83      0.83      0.83     10746

[[7337  814]
 [ 987 1608]]


## ANN Model - 6

In [431]:
clf_ann =MLPClassifier(hidden_layer_sizes=(20,20,20,20), max_iter=200, alpha=1e-05,
                     random_state=21,tol=0.000000001)
clf_ann.fit(features_train, target_train)
# test random forest model
target_predicted_ann = clf_ann.predict(features_test)
print("Accuracy", accuracy_score(target_test, target_predicted_ann))
target_names = ["Salaray <= 50k", "Salary >= 50k"]
print(classification_report(target_test, target_predicted_ann, target_names=target_names))
print(confusion_matrix(target_test, target_predicted_ann))

Accuracy 0.8384515168434766
                precision    recall  f1-score   support

Salaray <= 50k       0.90      0.89      0.89      8151
 Salary >= 50k       0.66      0.68      0.67      2595

   avg / total       0.84      0.84      0.84     10746

[[7243  908]
 [ 828 1767]]


I ran 6 separate ANN, all with slightly different parameters tuned.  The first model I ran was the base model got an accuracy score of 0.852.  The second model I ran I adjusted my hidden layers to 10,10,10, my max_iter to 500, my alpha to 1e-05, random_state to 21, and tol to 0.000000001.  This time my accuracy score increased to 0.857.  My third model I adjusted my hidden layers to 5,5,5,5 and kept all other parameters the same as model 2.  This resulted in a slightly better model with an accuracy score of 0.858.  My fourth model I kept my hidden layers at 5,5,5,5 and adjusted my max_iter to 200 and alpha to 0.4 while the random state and tol remained the same.  This resulted in an accuracy score of 0.857.  My fifth model I adjusted the hidden layers to 100,100, max_iter to 500, put the alpha back to 1e-05, and kept random_state and tol the same.  This gave me an accuracy score of 0.832. Finally, I ran a model with hidden layers of 20,20,20,20, max_iter of 200, and kept my alpha, random_state, and tol the same as model 5. This gave me an accuracy score of 0.838. My third ANN is my best model for this section.

# Stacking Models

I will next run Stacking models.  I will run a minimum of 3 models for this section, each with different tuning parameters to try and get a higher accuracy score each time.  The first model will my top 3 models in terms of accuracy score that I have developed thus far.  I will continue to adjust my remaining models from there with the goal of maximizing the accuracy.

## Stacking Model - 1

In [437]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
#Three Models Gradient Boosting, Bagging Classifier, ANN
clf1 = GradientBoostingClassifier(loss = 'exponential', n_estimators=200, learning_rate=0.9, max_depth=1, random_state=0)
clf2 = BaggingClassifier(n_estimators=101, random_state=0, bootstrap_features = True)
clf3 = MLPClassifier(hidden_layer_sizes=(5,5,5,5), max_iter=500, alpha=1e-05,
                     random_state=21,tol=0.000000001)
eclf2 = VotingClassifier(estimators=[('gb', clf1), ('bc', clf2), ('ann', clf3)], voting='hard')
for MV, label in zip([clf1, clf2, clf3, eclf2], ['Gradient Boosting Classifier', 'Bagging Classifier', 'ANN', 'Ensemble']):

    scores2 = cross_val_score(MV, features_train, target_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.6f (+/- %0.2f) [%s]" % (scores2.mean(), scores2.std(), label))

Accuracy: 0.867751 (+/- 0.01) [Gradient Boosting Classifier]
Accuracy: 0.858675 (+/- 0.00) [Bagging Classifier]
Accuracy: 0.856153 (+/- 0.01) [ANN]
Accuracy: 0.866238 (+/- 0.01) [Ensemble]


## Stacking Model - 2

In [438]:
#Three Models Gradient Boosting, Bagging Classifier, ANN
clf1 = GradientBoostingClassifier()
clf2 = BaggingClassifier()
clf3 = MLPClassifier()
eclf2 = VotingClassifier(estimators=[('gb', clf1), ('bc', clf2), ('ann', clf3)], voting='hard')
for MV, label in zip([clf1, clf2, clf3, eclf2], ['Gradient Boosting Classifier', 'Bagging Classifier', 'ANN', 'Ensemble']):

    scores2 = cross_val_score(MV, features_train, target_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.6f (+/- %0.2f) [%s]" % (scores2.mean(), scores2.std(), label))

Accuracy: 0.867201 (+/- 0.01) [Gradient Boosting Classifier]
Accuracy: 0.849323 (+/- 0.00) [Bagging Classifier]
Accuracy: 0.848407 (+/- 0.00) [ANN]
Accuracy: 0.864313 (+/- 0.00) [Ensemble]


## Stacking Model - 3

In [439]:
#Three Models KNN, Decision Tree, Random Forest, Extra Trees
clf1 = KNeighborsClassifier()
clf2 = tree.DecisionTreeClassifier()
clf3 = RandomForestClassifier()
clf4 = ExtraTreesClassifier()
eclf2 = VotingClassifier(estimators=[('knn', clf1), ('dt', clf2), ('rf', clf3), ('et', clf4)], voting='hard')
for MV, label in zip([clf1, clf2, clf3, clf4, eclf2], ['KNN', 'Decision Tree', 'Random Forest', 'Extra Trees', 'Ensemble']):

    scores2 = cross_val_score(MV, features_train, target_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.6f (+/- %0.2f) [%s]" % (scores2.mean(), scores2.std(), label))

Accuracy: 0.831125 (+/- 0.01) [KNN]
Accuracy: 0.813247 (+/- 0.01) [Decision Tree]
Accuracy: 0.847948 (+/- 0.00) [Random Forest]
Accuracy: 0.827733 (+/- 0.01) [Extra Trees]
Accuracy: 0.843639 (+/- 0.00) [Ensemble]


## Stacking Model - 4

In [440]:
#Three Models KNN, Decision Tree, Random Forest, Extra Trees
clf1 = KNeighborsClassifier(n_neighbors = 15, p = 1, weights = 'distance')
clf2 = tree.DecisionTreeClassifier(criterion = 'gini', min_samples_split = 5)
clf3 = RandomForestClassifier(n_estimators= 500, n_jobs=4,oob_score=True, criterion = 'entropy')
clf4 = ExtraTreesClassifier(criterion = 'entropy', n_estimators=100, class_weight = 'balanced', n_jobs = 4)
eclf2 = VotingClassifier(estimators=[('knn', clf1), ('dt', clf2), ('rf', clf3), ('et', clf4)], voting='hard')
for MV, label in zip([clf1, clf2, clf3, clf4, eclf2], ['KNN', 'Decision Tree', 'Random Forest', 'Extra Trees', 'Ensemble']):

    scores2 = cross_val_score(MV, features_train, target_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.6f (+/- %0.2f) [%s]" % (scores2.mean(), scores2.std(), label))

Accuracy: 0.837359 (+/- 0.01) [KNN]
Accuracy: 0.819160 (+/- 0.01) [Decision Tree]
Accuracy: 0.854641 (+/- 0.00) [Random Forest]
Accuracy: 0.832684 (+/- 0.01) [Extra Trees]
Accuracy: 0.847949 (+/- 0.00) [Ensemble]


I ran 4 separate Stacking models, all with slightly different parameters tuned.  The first model I ran was used my top 3 models from the above analysis, including Gradient Boosting 1 and 2 and the Bagging Classifier 2 which gave me an accuracy score of 0.866.  My second stacking model I used the base Gradient Boosting and Bagging Classifier models to try and simplify the model and got an accuracy score of 0.864.  My third model used the base KNN, Decision Tree, Random Forest, and Extra Trees models and got an accuracy score of 0.843.  My final model used those same four models but with parameters tuned within them that were the same as the third model I ran for each of them above, getting an accuracy score of 0.847.  My first stacking model is my best model.