## Import Packages
Test features

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
%matplotlib inline


## Read in Data

In [5]:
# Load the feature tables for the train / validation / test splits.
feature_csvs = (
    '../output/train_features.csv',
    '../output/val_features.csv',
    '../output/test_features.csv',
)
resume_train, resume_val, resume_test = (pd.read_csv(csv) for csv in feature_csvs)

# Preview the first rows of the training table.
resume_train.head()

Unnamed: 0,line_length,word_count,verb_percentage,adj_percentage,stopword_percentage,punctuation_percentage,number_percentage,proper_noun_percentage,line_length_trans,verb_percentage_trans,stopword_percentage_trans,punctuation_percentage_trans,word_count_trans,adj_percentage_trans,number_percentage_trans,proper_noun_percentage_trans
0,9,1,0.0,0.0,0.0,0.0,0.0,0.0,1.245731,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,71,15,6.2,0.0,5.6,1.4,6.2,25.0,1.531531,2.48998,2.366432,1.183216,1.03422,0.0,1.837091,5.0
2,5,2,0.0,0.0,20.0,0.0,0.0,66.7,1.174619,0.0,4.472136,0.0,0.0,0.0,0.0,8.167007
3,36,4,0.0,50.0,0.0,0.0,0.0,0.0,1.430969,0.0,0.0,0.0,0.0,3.684031,0.0,0.0
4,15,1,0.0,100.0,0.0,6.7,0.0,0.0,1.311019,0.0,0.0,2.588436,1.209504,4.641589,0.0,0.0


In [6]:
# Column subsets used to build each per-feature-set dataset.

# Columns without the `_trans` suffix.
raw_features = [
    'line_length',
    'verb_percentage',
    'adj_percentage',
    'stopword_percentage',
    'punctuation_percentage',
    'number_percentage',
]

# Columns carrying the `_trans` suffix.
transformed_features = [
    'line_length_trans',
    'stopword_percentage_trans',
    'punctuation_percentage_trans',
]

# Smaller subset mixing `_trans` and non-`_trans` columns.
reduced_features = [
    'line_length_trans',
    'verb_percentage',
    'stopword_percentage_trans',
]

In [None]:
# Write one CSV per (feature set, split) pair to ../output/.

# `all_features` was referenced below but never defined, which raised a
# NameError before the "all" and "reduced" files could be written.
# NOTE(review): assumes "all" means every column of the feature table except
# the unnamed index column that came in with the source CSV -- confirm.
all_features = [col for col in resume_train.columns if not col.startswith('Unnamed')]

feature_sets = {
    'raw': raw_features,
    'trans': transformed_features,
    'all': all_features,
    'reduced': reduced_features,
}
splits = {'train': resume_train, 'val': resume_val, 'test': resume_test}

# File names follow the pattern ../output/<split>_features_<set>.csv
for set_name, columns in feature_sets.items():
    for split_name, frame in splits.items():
        frame[columns].to_csv(
            '../output/{}_features_{}.csv'.format(split_name, set_name),
            index=False,
        )

In [None]:
# Load the raw-feature training table and the training labels.
# The per-feature-set CSVs are written to '../output/' by this notebook, so
# they must be read from that same directory; the previous 'output/' prefix
# resolved to a different folder.
# NOTE(review): train_labels.csv is assumed to sit next to the feature files -- confirm.
train_features = pd.read_csv('../output/train_features_raw.csv')
train_labels = pd.read_csv('../output/train_labels.csv')

train_labels.head()

In [None]:
# Correlation heat map of the training features.
# The correlation table is computed once and reused for the mask and the plot.
feature_corr = train_features.corr()
# np.triu keeps the upper triangle (diagonal included); heatmap hides cells where the mask is truthy.
matrix = np.triu(feature_corr)
sns.heatmap(
    feature_corr,
    mask=matrix,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)

## GridSearchCV

This section tunes a random forest classifier (`RandomForestClassifier`) with `GridSearchCV`.

In [None]:
def print_results(results):
    """Print the best parameters and the score of every candidate in a search.

    Args:
        results: Object exposing ``best_params_`` and a ``cv_results_`` mapping
            with the keys 'mean_test_score', 'std_test_score' and 'params'.

    The best parameters are printed first, followed by one line per candidate
    in the form ``<mean> (+/-<2 * std>) for <params>.``, with both numbers
    rounded to three decimals.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    summary = results.cv_results_
    rows = zip(summary['mean_test_score'], summary['std_test_score'], summary['params'])
    for mean_score, score_std, candidate in rows:
        shown_mean = round(mean_score, 3)
        shown_spread = round(score_std*2, 3)
        print('{} (+/-{}) for {}.'.format(shown_mean, shown_spread, candidate))

In [None]:
 # GridSearch
# Random forest whose hyper-parameters are tuned below.
# A fixed random_state makes the search reproducible across kernel restarts;
# without it each run builds different trees and can report different scores.
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid to evaluate.
parameters = {
    'n_estimators': [2**i for i in range(3, 10)],  # 8, 16, ..., 512
    'max_depth': [2, 4, 8, 16, 32, None],
}

# Grid search over the parameters above with 5-fold cross-validation.
cv = GridSearchCV(rf, parameters, cv=5)

# Fit the search; the label frame is flattened to a 1-D array for fit().
cv.fit(train_features, train_labels.values.ravel())

# Print the best parameters and the score of every candidate.
print_results(cv)

### Feature Importance

In [None]:
# Horizontal bar chart of the best estimator's feature importances.
feat_imp = cv.best_estimator_.feature_importances_
indices = np.argsort(feat_imp)  # feature positions in ascending order of importance
bar_positions = range(len(indices))
feature_names = [train_features.columns[i] for i in indices]

plt.yticks(bar_positions, feature_names)
plt.barh(bar_positions, feat_imp[indices], color='r', align='center')
plt.show()

### Write Out Pickled Model 

In [None]:
# GridSearchCV exposes a best_estimator_ attribute: the model refitted on 100% of the data.
# NOTE(review): the data files in this notebook are read from '../output/'; confirm that
# 'models/' is meant to be relative to the same working directory and that it exists.
best_model = cv.best_estimator_
joblib.dump(best_model, 'models/mdl_raw_original_features.pkl')

#### Repeat the process for the other feature sets


In [None]:
# Transformed Features
# The per-feature-set CSVs are written to '../output/' by this notebook, so
# they must be read from that same directory; the previous 'output/' prefix
# resolved to a different folder.
# NOTE(review): train_labels.csv is assumed to sit next to the feature files -- confirm.
train_features = pd.read_csv('../output/train_features_trans.csv')
train_labels = pd.read_csv('../output/train_labels.csv')

train_features.head()

In [None]:
# Correlation heat map of the training features.
# The correlation table is computed once and reused for the mask and the plot.
feature_corr = train_features.corr()
# np.triu keeps the upper triangle (diagonal included); heatmap hides cells where the mask is truthy.
matrix = np.triu(feature_corr)
sns.heatmap(
    feature_corr,
    mask=matrix,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)

In [None]:
 # GridSearch
# Random forest to tune; a fixed random_state makes the search reproducible
# across kernel restarts (without it each run builds different trees).
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid to evaluate.
parameters = {
    'n_estimators': [2**i for i in range(3, 10)],  # 8, 16, ..., 512
    'max_depth': [2, 4, 8, 16, 32, None],
}

# Grid search with 5-fold cross-validation; labels are flattened to 1-D for fit().
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

In [None]:
# Horizontal bar chart of the best estimator's feature importances.
feat_imp = cv.best_estimator_.feature_importances_
indices = np.argsort(feat_imp)  # feature positions in ascending order of importance
bar_positions = range(len(indices))
feature_names = [train_features.columns[i] for i in indices]

plt.yticks(bar_positions, feature_names)
plt.barh(bar_positions, feat_imp[indices], color='r', align='center')
plt.show()

In [None]:
# GridSearchCV exposes a best_estimator_ attribute: the model refitted on 100% of the data.
# NOTE(review): the data files in this notebook are read from '../output/'; confirm that
# 'models/' is meant to be relative to the same working directory and that it exists.
best_model = cv.best_estimator_
joblib.dump(best_model, 'models/mdl_transformed_features.pkl')

In [None]:
# All Features
# The per-feature-set CSVs are written to '../output/' by this notebook, so
# they must be read from that same directory; the previous 'output/' prefix
# resolved to a different folder.
# NOTE(review): train_labels.csv is assumed to sit next to the feature files -- confirm.
train_features = pd.read_csv('../output/train_features_all.csv')
train_labels = pd.read_csv('../output/train_labels.csv')

train_features.head()

In [None]:
# Correlation heat map of the training features.
# The correlation table is computed once and reused for the mask and the plot.
feature_corr = train_features.corr()
# np.triu keeps the upper triangle (diagonal included); heatmap hides cells where the mask is truthy.
matrix = np.triu(feature_corr)
sns.heatmap(
    feature_corr,
    mask=matrix,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)

In [None]:
 # GridSearch
# Random forest to tune; a fixed random_state makes the search reproducible
# across kernel restarts (without it each run builds different trees).
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid to evaluate.
parameters = {
    'n_estimators': [2**i for i in range(3, 10)],  # 8, 16, ..., 512
    'max_depth': [2, 4, 8, 16, 32, None],
}

# Grid search with 5-fold cross-validation; labels are flattened to 1-D for fit().
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

In [None]:
# Horizontal bar chart of the best estimator's feature importances.
feat_imp = cv.best_estimator_.feature_importances_
indices = np.argsort(feat_imp)  # feature positions in ascending order of importance
bar_positions = range(len(indices))
feature_names = [train_features.columns[i] for i in indices]

plt.yticks(bar_positions, feature_names)
plt.barh(bar_positions, feat_imp[indices], color='r', align='center')
plt.show()

In [None]:
# GridSearchCV exposes a best_estimator_ attribute: the model refitted on 100% of the data.
# NOTE(review): the data files in this notebook are read from '../output/'; confirm that
# 'models/' is meant to be relative to the same working directory and that it exists.
best_model = cv.best_estimator_
joblib.dump(best_model, 'models/mdl_all_features.pkl')

In [None]:
# Reduced Features
# The per-feature-set CSVs are written to '../output/' by this notebook, so
# they must be read from that same directory; the previous 'output/' prefix
# resolved to a different folder.
# NOTE(review): train_labels.csv is assumed to sit next to the feature files -- confirm.
train_features = pd.read_csv('../output/train_features_reduced.csv')
train_labels = pd.read_csv('../output/train_labels.csv')

train_features.head()

In [None]:
# Correlation heat map of the training features.
# The correlation table is computed once and reused for the mask and the plot.
feature_corr = train_features.corr()
# np.triu keeps the upper triangle (diagonal included); heatmap hides cells where the mask is truthy.
matrix = np.triu(feature_corr)
sns.heatmap(
    feature_corr,
    mask=matrix,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)

In [None]:
 # GridSearch
# Random forest to tune; a fixed random_state makes the search reproducible
# across kernel restarts (without it each run builds different trees).
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid to evaluate.
parameters = {
    'n_estimators': [2**i for i in range(3, 10)],  # 8, 16, ..., 512
    'max_depth': [2, 4, 8, 16, 32, None],
}

# Grid search with 5-fold cross-validation; labels are flattened to 1-D for fit().
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

In [None]:
# Horizontal bar chart of the best estimator's feature importances.
feat_imp = cv.best_estimator_.feature_importances_
indices = np.argsort(feat_imp)  # feature positions in ascending order of importance
bar_positions = range(len(indices))
feature_names = [train_features.columns[i] for i in indices]

plt.yticks(bar_positions, feature_names)
plt.barh(bar_positions, feat_imp[indices], color='r', align='center')
plt.show()

In [None]:
# GridSearchCV exposes a best_estimator_ attribute: the model refitted on 100% of the data.
# NOTE(review): the data files in this notebook are read from '../output/'; confirm that
# 'models/' is meant to be relative to the same working directory and that it exists.
best_model = cv.best_estimator_
joblib.dump(best_model, 'models/mdl_reduced_features.pkl')