# 2. Project 4 - Modeling

In [1]:
# standard imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time
import re
# bs4, nltk, and sklearn imports
from bs4 import BeautifulSoup             
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
train = pd.read_csv('./data/train_clean.csv')
test = pd.read_csv('./data/test_clean.csv')

In [3]:
X = train.drop(columns = ['fnlwgt','wage','workclass','education','education-num','occupation','relationship','marital-status','capital-gain','capital-loss'])
y = train['wage']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
y.value_counts(normalize = True)

0    0.759251
1    0.240749
Name: wage, dtype: float64

In [6]:
def search_fit(est, X, y, params, name = None, scale = False):
    #split the data into training and testing
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state = 42, stratify = y)
    #initialize the gridsearch with the passed estimator object
    gs = GridSearchCV(est, params, cv = 5, n_jobs=5)
    #scale the data if scale = True
    if scale:
        sc = StandardScaler()
        X_tr = sc.fit_transform(X_tr)
        X_te = sc.fit_transform(X_te)
    #Fit training data, print scores and params, then return the best estimator
    gs.fit(X_tr, y_tr)
    print(f'{name} Training Score : {gs.score(X_tr, y_tr)}')
    print(f'{name} Testing Score  : {gs.score(X_te, y_te)}')
    print(f'Best Params: {gs.best_params_}')
    return gs.best_estimator_.fit(X_tr, y_tr)

### Decision Tree

In [7]:
dt = search_fit(DecisionTreeClassifier(), X, y,
            params = {
                'max_depth' : [7],
                'min_samples_split' : [4],
                'max_features' : [None]
            },
           name = "Decision Tree")

Decision Tree Training Score : 0.8476658476658476
Decision Tree Testing Score  : 0.8238182934315531
Best Params: {'max_depth': 7, 'max_features': None, 'min_samples_split': 4}


### Bagged Decision Trees

In [8]:
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth = 7, min_samples_split = 4))

In [9]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=7,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=4,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,
   

In [10]:
bag.score(X_train, y_train)

0.8562653562653563

In [11]:
bag.score(X_test, y_test)

0.8238182934315531

### Random Forest

In [12]:
rf = search_fit(RandomForestClassifier(), X, y,
            params = {
                'n_estimators' : [110],
                'max_depth' : [50],
                'min_samples_split' : [10],
                'max_features' : ['auto'],
                'ccp_alpha' : [0.0001]
            },
           name = "Random Forest")

Random Forest Training Score : 0.9011056511056511
Random Forest Testing Score  : 0.8275015346838551
Best Params: {'ccp_alpha': 0.0001, 'max_depth': 50, 'max_features': 'auto', 'min_samples_split': 10, 'n_estimators': 110}


In [13]:
bag = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators = 110, max_depth = 50, min_samples_split = 10, max_features = 'auto'))

In [14]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True,
                                                        ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=50,
                                                        max_features='auto',
                                                        max_leaf_nodes=None,
                                                        max_samples=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=10,
                                                        min_weight_fraction_leaf=0.0,
      

In [15]:
bag.score(X_train, y_train)

0.8851351351351351

In [16]:
bag.score(X_test, y_test)

0.8219766728054021

### Extra Trees

In [17]:
et = search_fit(ExtraTreesClassifier(), X, y,
            params = {
                'n_estimators' : [150],
                'max_depth' : [15],
                'min_samples_split' : [3],
                'max_features' : ['auto'],
                'ccp_alpha' : [0.0001]
            },
           name = "Extra Trees")

Extra Trees Training Score : 0.8849303849303849
Extra Trees Testing Score  : 0.8256599140577041
Best Params: {'ccp_alpha': 0.0001, 'max_depth': 15, 'max_features': 'auto', 'min_samples_split': 3, 'n_estimators': 150}


In [18]:
bag = BaggingClassifier(base_estimator=ExtraTreesClassifier(n_estimators = 150, max_depth = 15, min_samples_split = 10, max_features = 'auto', ccp_alpha = 0.0001))

In [19]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=ExtraTreesClassifier(bootstrap=False,
                                                      ccp_alpha=0.0001,
                                                      class_weight=None,
                                                      criterion='gini',
                                                      max_depth=15,
                                                      max_features='auto',
                                                      max_leaf_nodes=None,
                                                      max_samples=None,
                                                      min_impurity_decrease=0.0,
                                                      min_impurity_split=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=10,
                                                      min_weight_fraction_leaf=0.0,
                            

In [20]:
print(f'Training Score : {bag.score(X_train, y_train)}')
print(f'Testing Score  : {bag.score(X_test, y_test)}')

Training Score : 0.8576986076986077
Testing Score  : 0.8244321669736034


### Logistic Regression

In [21]:
logreg = search_fit(LogisticRegression(), X, y,
            params = {
                'C' : [0.5],
            },
           name = "Logistic Regression", scale = True)

Logistic Regression Training Score : 0.8357903357903358
Logistic Regression Testing Score  : 0.8317986494782075
Best Params: {'C': 0.5}


In [22]:
bag = BaggingClassifier(base_estimator=LogisticRegression(C = 0.25, max_iter = 10000))

In [23]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(C=0.25, class_weight=None,
                                                    dual=False,
                                                    fit_intercept=True,
                                                    intercept_scaling=1,
                                                    l1_ratio=None,
                                                    max_iter=10000,
                                                    multi_class='auto',
                                                    n_jobs=None, penalty='l2',
                                                    random_state=None,
                                                    solver='lbfgs', tol=0.0001,
                                                    verbose=0,
                                                    warm_start=False),
                  bootstrap=True, bootstrap_features=False, max_features=1.0,
                  max_samples=1.0, n_estimators=10, n_jobs=None

In [24]:
print(f'Training Score : {bag.score(X_train, y_train)}')
print(f'Testing Score  : {bag.score(X_test, y_test)}')

Training Score : 0.8370188370188371
Testing Score  : 0.8244321669736034


### KNN

In [25]:
knn = search_fit(KNeighborsClassifier(), X, y,
            params = {
                'n_neighbors' : [21,25],
            },
           name = "KNN", scale = True)

KNN Training Score : 0.8327190827190827
KNN Testing Score  : 0.8158379373848987
Best Params: {'n_neighbors': 25}


In [26]:
bag = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors = 21))

In [27]:
#unscaled

bag.fit(X_train, y_train)
print(f'Training Score : {bag.score(X_train, y_train)}')
print(f'Testing Score  : {bag.score(X_test, y_test)}')

Training Score : 0.814086814086814
Testing Score  : 0.7900552486187845


### SVC

In [28]:
svc = search_fit(SVC(), X, y,
            params = {
            },
           name = "SVC", scale = True)

SVC Training Score : 0.8527846027846028
SVC Testing Score  : 0.8293431553100061
Best Params: {}


In [29]:
bag = BaggingClassifier(base_estimator=SVC())

In [30]:
#unscaled

bag.fit(X_train, y_train)
print(f'Training Score : {bag.score(X_train, y_train)}')
print(f'Testing Score  : {bag.score(X_test, y_test)}')

Training Score : 0.759009009009009
Testing Score  : 0.7599754450583179


### AdaBoost

In [31]:
Adaboost = search_fit(AdaBoostClassifier(random_state = 42), X, y,
            params = {
                'n_estimators': [25,50,75,100],
                'learning_rate': [0.4,0.5,0.7]
            },
           name = "AdaBoost Classifier", scale = False)

AdaBoost Classifier Training Score : 0.8329238329238329
AdaBoost Classifier Testing Score  : 0.8416206261510129
Best Params: {'learning_rate': 0.4, 'n_estimators': 75}


### Gradient Boost

In [32]:
gboost = search_fit(GradientBoostingClassifier(random_state = 42), X, y,
            params = {
                'max_depth': [4],
                'n_estimators': [100],
                'learning_rate': [0.12],
                'subsample': [1.0]
            },
           name = "GBoost", scale = False)

GBoost Training Score : 0.8703931203931204
GBoost Testing Score  : 0.8397790055248618
Best Params: {'learning_rate': 0.12, 'max_depth': 4, 'n_estimators': 100, 'subsample': 1.0}


### Naive-Bayes

In [33]:
nb = search_fit(MultinomialNB(), X, y,
            params = {
            },
           name = "nb", scale = False)

nb Training Score : 0.8091728091728092
nb Testing Score  : 0.8078575813382444
Best Params: {}


|                    | Decision Tree      | Random Forest      | Extra Trees        | Logistic Regression | KNN                | SVC                | AdaBoost           | Gradient Boost     | Naive-Bayes        |
|--------------------|--------------------|--------------------|--------------------|---------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
| Train Score        | 0.8673218673218673 | 0.9232186732186732 | 0.895986895986896  | 0.855036855036855   | 0.8368140868140869 | 0.8693693693693694 | 0.8560606060606061 | 0.8931203931203932 | 0.7811220311220312 |
| Test Score         | 0.8551258440761204 | 0.8655616942909761 | 0.8354818907305095 | 0.8471454880294659  | 0.8170656844689994 | 0.8440761203192142 | 0.8557397176181707 | 0.8686310620012277 | 0.7833026396562308 |
| Bagged Train Score | 0.8734643734643734 | 0.9037674037674037 | 0.8705978705978706 | 0.8533988533988534  | 0.842956592956593  |                    |                    |                    |                    |
| Bagged Test Score  | 0.856353591160221  | 0.8465316144874155 | 0.8268876611418048 | 0.8428483732351135  | 0.8084714548802947 |                    |                    |                    |                    |