# x - Explore and Model

### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)

import os
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import importlib

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

### Import Data

In [3]:
# setting to view all columns
pd.set_option('display.max_columns', 999)

# opening saved data from pickle file
# The protocol version used is detected automatically, so we do not
# have to specify it.
with open('data/df-os.pickle', 'rb') as f:
    df_2 = pickle.load(f)

display(df_2.shape)
df_2.head()

(39644, 61)

Unnamed: 0_level_0,timedelta,n_title,n_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,nhrefs,nself_hrefs,nimgs,nvideos,avg_token_length,nkeywords,channel_lifestyle,channel_ent,channel_bus,channel_socmed,channel_tech,channel_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_ref_min_shares,self_ref_max_shares,self_ref_avg_shares,week_mon,week_tues,week_wednes,week_thurs,week_fri,week_satur,week_sun,weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subj,global_sentiment_pol,global_rate_pos_words,global_rate_neg_words,rate_pos_words,rate_neg_words,avg_pos_pol,min_pos_pol,max_pos_pol,avg_neg_pol,min_neg_pol,max_neg_pol,title_subj,title_sentiment_pol,abs_title_subj,abs_title_sentiment_pol,shares,Shares_plus
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
http://mashable.com/2013/01/07/amazon-instant-video-browser/,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,4.680365,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,496.0,496.0,496.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593,0
http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,0.0,4.913725,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.799756,0.050047,0.050096,0.050101,0.050001,0.341246,0.148948,0.043137,0.015686,0.733333,0.266667,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711,0
http://mashable.com/2013/01/07/apple-40-billion-app-downloads/,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,0.0,4.393365,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,918.0,918.0,918.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217792,0.033334,0.033351,0.033334,0.682188,0.702222,0.323333,0.056872,0.009479,0.857143,0.142857,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500,1
http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,0.0,4.404896,7.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028573,0.4193,0.494651,0.028905,0.028572,0.42985,0.100705,0.041431,0.020716,0.666667,0.333333,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200,0
http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,0.0,4.682836,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,545.0,16000.0,3151.157895,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028633,0.028794,0.028575,0.028572,0.885427,0.513502,0.281003,0.074627,0.012127,0.860215,0.139785,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505,0


> __To proceed__:
* 
* 
* 
* 
* 
* 



#### Import Required Libraries

#### Split Outcome Variables and Target

In [4]:
# spliting the dataframe outcome and predictor variables
X =  # features
y =  # target

SyntaxError: invalid syntax (<ipython-input-4-ca0c6778f30e>, line 2)

#### Buid Pipeline

In [None]:
# displaying the model's test set accuracy
print("\nDecision Tree Classifier Testing Accuracy: {:.4}%\n".format(
    accuracy_score(y_test, pred) * 100))

In [None]:
# plotting confusion matrix heatmap
cm = confusion_matrix(y_test, pred)

sns.heatmap(cm, cmap=sns.color_palette('Blues'), annot=True, fmt='0.4g')

plt.autoscale()

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")

plt.show()

<table>
  <tr>
    <td>True Negative</td>
    <td>False Positive</td>
  </tr>
  <tr>
    <td>False Negative</td>
    <td>True Positive</td>
  </tr>
</table>

In [None]:
# importing necessary library
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.filterwarnings("ignore")

mean_forest_cv_score = np.mean(cross_val_score(tree_clf, 
                                           X_train, 
                                           y_train, 
                                           cv=10))
print(f"\nMean Cross Validation Score for Decision Tree Classifier: {mean_forest_cv_score :.2%}\n")

## Bootstrap Aggregation (Bagging)

Next, we will try a bagged trees ensemble model. Considering the low variance in our original Tree model, we are not likely to be surprised by the bagging scores.

In [None]:
# instantiating a Bagging classifier
bagged_tree_clf = BaggingClassifier(
    DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=0
    ),
    n_estimators=20,
    random_state=0
)

In [None]:
# fit to the training data
bagged_tree_clf.fit(X_train, y_train)

In [None]:
# checking the accuracy of our Training set
print("\nTraining set score: ", 
      bagged_tree_clf.score(X_train, 
                        y_train).round(2))

# checking the accuracy of our Test set
print("Test set score:     ", 
      bagged_tree_clf.score(X_test, 
                        y_test).round(2))

> True enough, our bagging scores are in line with scores from our original Tree model.
* Note: when tested with a max depth of 7, the model scores the training set at 0.69 and the test set at 0.65, indicating that the model quicky begins to overfit as we increase depth.

## Random Forest

Another ensemble model is Random Forest. This model allows for lower correlation among samples, by limiting the number of features on which to split.

In [None]:
# instantiating a Random Forest classifier
forest_clf = RandomForestClassifier(n_estimators=100, 
                                max_depth=5, 
                                random_state = 0)

# fit to training data
forest_clf.fit(X_train, y_train)

In [None]:
# checking the Training accuracy of our forest
print("\nTraining set score: ", forest_clf.score(X_train, y_train).round(2))

# checking the Test accuracy of our forest
print("Test set score:     ", forest_clf.score(X_test, y_test).round(2))

> Our test score nuged upwards, a tiny bit.
* So far, results have not strayed far from our our first - guess tree. 

In [None]:
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.plotting import plot_decision_regions

# Initializing Classifiers
clf1 = bagged_tree_clf
clf2 = forest_clf
clf3 = SVC(random_state=0, probability=True)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[2, 1, 1], voting='soft')

# Plotting Decision Regions
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(13, 8))

for clf, lab, grd in zip([clf1, clf2, clf3, eclf],
                         ['Bag of Trees', 'Random Forest', 'RBF kernel SVM', 'Ensemble'],
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X_feat, y.astype(np.integer).values, clf=clf, legend=2)
    plt.title(lab)
    plt.show()

> While scores have remained close among our models, we can observe from the plot above that feature importances have shifted.
* The random forest classifier suggests about 10 relatively important features--a couple more than suggested by the Decision Tree model.
* Where `kw_avg_avg` reached 0.33 and `kw_avg_max` eeked - out only 0.02, in our Tree model, the Random Forest model improtances for the same two features are approximately 0.11 and 0.13, respectively.

Even if accuracy scores were exactly the same, the more balanced RandomForest model would be preferred to the lucky - guess of our original Decision Tree model.

## GridSearch

Rather than settling for a single set of parameters, we can create a parameter grid dictionary that enables us to test different values for each parameter.

In [None]:
# creating a parameter grid dictionary
rf_param_grid = {
    'n_estimators': [10, 30, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [3, 6]
}

We can perform a Grid Search to determine an optimal parameter combination from among the values in our grid dictionary.

In [None]:
# performing a grid search with Random Forest
rf_grid_search = GridSearchCV(forest_clf, 
                              rf_param_grid, 
                              cv=3)
rf_grid_search.fit(X_train, y_train)
print(f"Testing Accuracy: {rf_grid_search.best_score_ :.2%}")
print("")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

> GridSearch determined that our model could achieve its most accurate score, by changing `max_depth` from 5 to 'None'.
    * Optimal criterion was changed from 'gini' to 'entropy', and `n_estimators` were kept at the original setting of 100.
    * The grid search increased `min_samples_leaf` from 1 to 6, and `min_samples_split` was found to be more effective at 5, rather than our original value of 1.

### Other Algorithms

> In addition to the models we have tested, Boosting models (including __AdaBoost__, __Gradient Boost__, and __XGBoost__) offer additional tuning options. 
* We can create a pipeline that will enable us to plug - in various models.
* By creating a scaled pipeline we can incorporate and compare additional models such as K-Nearest Neighbors(KNN).

## Save and Continue



In [None]:
# importing custom helper functions
from helpers.helper import save_model, save_models

In [None]:
# def save_model(model, directory='./models'):
#     """
#     creates a file name by appending .pickle' to a model's variable name,
#     and saves the model as a pickle file in the working directory's
#     'model' subdirectory
#     """
#     # verify or create the save - path directory
#     if not os.path.exists(directory):
#         os.mkdir(directory)

#     # build the file name
#     filename = [tuple[0] for tuple in filter(
#         lambda x: model is x[1],
#         globals().items())
#              ][0]


#     # pickle the model with the created filename
#     with open(f'{directory}/{filename}.pickle', 'wb') as f:
#         # pickling the dataframe using the highest protocol available
#         pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
#     return print(f'Saved {filename} to {directory} as pickle file: {filename}\n', '-'*50)


# def save_models(models):
#     """
#     passes a list of model names and saves
#     the models as separate pickle files
#     """
#     for m in models:
#         saved_model = save_model(m)
#     return saved_model


### Pickle the models

In [None]:
# list models from the current notebook
notebook_models = [tree_clf, 
                  bagged_tree,
                  forest_clf,
                  rf_grid_search]

In [None]:
from yellowbrick.classifier import ClassificationReport

In [None]:
for mod in notebook_models:
    # Instantiate the classification model and visualizer
    visualizer = ClassificationReport(mod, classes=['Above','Below'])
    visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
    visualizer.score(X_test, y_test) # Evaluate the model on the test data
    g = visualizer.poof() # Draw/show/poof the data

In [None]:
# pass model list to the custom `save_models` function
save_models(notebook_models)

In [None]:
# 72 Char. screen - width reference
########################################################################

In [None]:
with open('data/#.pickle', 'wb') as f:
    # pickling the dataframe using the highest protocol available
    pickle.dump(#, f, pickle.HIGHEST_PROTOCOL)