# 7 - Iterative Modeling Untransformed Data (Part 2)

## 7.1 Set Up & Data Initialization 

In [31]:
#Libraries
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score 

from functions import *  
import pickle 

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore") 

In [7]:
df = pd.read_pickle("./df.pkl")
target = df['status_group'] 
features = df.drop('status_group', axis=1)  
X = pd.get_dummies(features)  
data = pd.get_dummies(features) 
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size = 0.25, random_state=42)

In [8]:
# Load from file
with open("rfmodel1.pkl", 'rb') as file:
    pickle_model = pickle.load(file)

## 7.4 GridSearchCV Random Forest
The purpose of this model is to use GridSearchCV to look through parameters for predictive importance.

In [9]:
rf_param_grid = {
    'n_estimators': [10, 30, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 6, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [3, 6]
}

In [10]:
rf_grid_search = GridSearchCV(pickle_model, rf_param_grid, cv=3)
rf_grid_search.fit(data_train, target_train)

print(f"Training Accuracy: {rf_grid_search.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

Training Accuracy: 73.77%

Optimal Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 100}


In [11]:
rf_score = rf_grid_search.score(data_test, target_test)


print('Random forest grid search: ', rf_score)

Random forest grid search:  0.7414141414141414


This model did 13% better than the untuned random forest! 

## 7.5 Adaboost 
The purpose of this model is to see how the preformance increases with Adaboost.

In [17]:
adaboost_clf = AdaBoostClassifier(random_state=42)

In [20]:
adaboost_clf.fit(data_train, target_train)

AdaBoostClassifier(random_state=42)

In [22]:
adaboost_train_preds = adaboost_clf.predict(data_train)
adaboost_test_preds = adaboost_clf.predict(data_test)

In [24]:
adaboost_confusion_matrix = confusion_matrix(target_test, adaboost_test_preds)
adaboost_confusion_matrix

array([[7227,   82,  789],
       [ 781,  117,  176],
       [2231,   66, 3381]])

In [25]:
adaboost_classification_report = classification_report(target_test, adaboost_test_preds)
print(adaboost_classification_report)

                         precision    recall  f1-score   support

             functional       0.71      0.89      0.79      8098
functional needs repair       0.44      0.11      0.17      1074
         non functional       0.78      0.60      0.67      5678

               accuracy                           0.72     14850
              macro avg       0.64      0.53      0.55     14850
           weighted avg       0.71      0.72      0.70     14850



In [28]:
print('Mean Adaboost Cross-Val Score (k=5):')
print(cross_val_score(adaboost_clf, data_test, target_test, cv=5).mean())

Mean Adaboost Cross-Val Score (k=5):
0.7203367003367004


In [29]:
print("Testing Accuracy for Decision Tree Classifier: {:.4}%".format(accuracy_score(target_test, adaboost_test_preds) * 100))

Testing Accuracy for Decision Tree Classifier: 72.22%
