# 8 Iterative Modeling Processed Data

## 8.1 Set Up & Data Initialization 

In [273]:
#Libraries
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt 

import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier  
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn_pandas import DataFrameMapper

from functions import *  
import pickle 

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore") 

import pandas as pd
import numpy as np
# import IPython
# from IPython import display
import sklearn

In [289]:
#load dataframe & drop columns & set dtypes 
df = pd.read_pickle("./original.pkl") 
df = set_dtypes(df) 
df = drop_columns(df) 

In [290]:
#Column processing 
df = transform_dataframe_categorical(df)  
df = transform_dataframe_continous(df)

In [291]:
#identify target and features
target = df['status_group'] 
features = df.drop('status_group', axis=1)  

In [292]:
#dummy features 
data = pd.get_dummies(features)  

In [293]:
#reate validation group 
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size = 0.25, random_state=42)

In [294]:
print('Training Features Shape:', data_train.shape)
print('Training Labels Shape:', target_train.shape)
print('Testing Features Shape:', data_test.shape)
print('Testing Labels Shape:', target_test.shape) 

Training Features Shape: (44550, 23561)
Training Labels Shape: (44550,)
Testing Features Shape: (14850, 23561)
Testing Labels Shape: (14850,)


## 8.2 Single Tree Custom Processed Data 
The purpose of this model is to build a single baseline tree to compare the the unprocessed data score

In [295]:
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=5) 
tree_clf.fit(data_train, target_train)

DecisionTreeClassifier(max_depth=5)

In [296]:
pred = tree_clf.predict(data_test)

# Confusion matrix and classification report
print(confusion_matrix(target_test, pred))
print(classification_report(target_test, pred))

[[7292   49  757]
 [ 835   62  177]
 [2607   17 3054]]
                         precision    recall  f1-score   support

             functional       0.68      0.90      0.77      8098
functional needs repair       0.48      0.06      0.10      1074
         non functional       0.77      0.54      0.63      5678

               accuracy                           0.70     14850
              macro avg       0.64      0.50      0.50     14850
           weighted avg       0.70      0.70      0.67     14850



In [297]:
print("Testing Accuracy for Decision Tree Classifier: {:.4}%".format(accuracy_score(target_test, pred) * 100))

Testing Accuracy for Decision Tree Classifier: 70.09%


This model prefored %0.77 worse that the processed data. 

## 8.3 Random Forest Model Custom Processed Data
The purpose of this model is to begin to generate diversity in the model and see if there is any improvement in the model. 

In [301]:
forest = RandomForestClassifier(n_estimators=100, max_depth= 5)
forest.fit(data_train, target_train)

RandomForestClassifier(max_depth=5)

In [303]:
pred = forest.predict(data_test)

# Confusion matrix and classification report
print(confusion_matrix(target_test, pred))
print(classification_report(target_test, pred))

[[8098    0    0]
 [1074    0    0]
 [5522    0  156]]
                         precision    recall  f1-score   support

             functional       0.55      1.00      0.71      8098
functional needs repair       0.00      0.00      0.00      1074
         non functional       1.00      0.03      0.05      5678

               accuracy                           0.56     14850
              macro avg       0.52      0.34      0.25     14850
           weighted avg       0.68      0.56      0.41     14850



In [304]:
print("Testing Accuracy for Random Forest Classifier: {:.4}%".format(accuracy_score(target_test, pred) * 100))

Testing Accuracy for Random Forest Classifier: 55.58%


Wow, my model is preforming signifigantly worse with custom process. 