In [None]:
import os

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Execute the project's helper scripts inside this notebook's namespace.
# NOTE(review): this notebook never imports np, pd, plt, ms, Pipeline, Normalizer,
# KFold, cross_val_score, roc_curve, auc, VarianceThreshold, Model or
# betweenCompare itself; presumably they all come from these two scripts —
# confirm, and consider importing the library names explicitly.
%run model.py
%run betweenCompare.py

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed NumPy's global random generator.
np.random.seed(1001)

# Functions 

# Load Data

In [None]:
# Read the prepared dataset from the "data" folder under the working directory,
# using the first CSV column as the index.
cwd = os.getcwd()
datadir = os.sep.join([cwd, 'data', ''])  # the empty last item adds the trailing separator

data = pd.read_csv(os.path.join(datadir, "complete_data.csv"), index_col=0)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

# Feature Selection

In [None]:
# Train/test split with test_size=0.2 and a fixed random_state so it is repeatable.
# `columns='label'` replaces the positional axis argument (`drop('label', 1)`),
# which pandas 2.0 no longer accepts.
X_train, X_test, y_train, y_test = ms.train_test_split(
    data.drop(columns='label'),
    data['label'],
    test_size=0.2,
    random_state=1001,
)

In [None]:
# Baseline decision tree with the entropy split criterion; its feature
# importances are read in the next cell to rank the features.
dt = Model(
    "Decision Tree",
    DecisionTreeClassifier(criterion='entropy'),
    X_train, y_train,
    X_test, y_test,
)

In [None]:
# Pair every training column with the importance assigned by the baseline
# tree, ordered from most to least important.
features_summary = (
    pd.DataFrame({'feature': X_train.columns,
                  'importance': dt.models["Baseline"].model.feature_importances_})
    .set_index('feature')
    .sort_values('importance', ascending=False)
)

# Correlation of each feature with the label. 'label' is dropped by name
# rather than by slicing off the last row, so the result stays correct
# wherever that column sits in `data`.
corr_df = data.corr()['label'].drop('label').to_frame('correlation')

features_summary = features_summary.merge(corr_df, left_index=True, right_index=True)

# Bar chart of importances: red bars mark features negatively correlated with
# the label, green bars everything else.
color_list = ['r' if corr < 0 else 'g' for corr in features_summary.correlation]
ax = features_summary.importance.plot(kind='bar', color=color_list, figsize=(12, 8))
ax.set_title('Feature Importance and Correlation Direction')
ax.set_ylabel('Importance')

# Keep only the features the tree actually used (importance above zero).
keep_features = features_summary[features_summary.importance > 0].index.values

# Train Baseline Model

In [None]:
# Restrict both splits to the columns listed in keep_features.
X_train_filt = X_train.loc[:, keep_features]
X_test_filt = X_test.loc[:, keep_features]
print(keep_features)

In [None]:
# Baseline logistic regression, linear SVM and KNN models.
# NOTE(review): these are built and cross-validated on the unfiltered X_train,
# while the ROC cell further down fits on X_train_filt — confirm which feature
# set the baselines are meant to use.
lr = Model("Logistic Regression", LogisticRegression(C=1e30), X_train, y_train, X_test, y_test)
svm = Model("SVM", SVC(kernel="linear"), X_train, y_train, X_test, y_test)
knn = Model("KNN", KNeighborsClassifier(35), X_train, y_train, X_test, y_test)

# SVM and KNN receive their inputs through a Normalizer step first.
svm_pipeline = Pipeline(steps=[('normalize', Normalizer()),
                               ('estimator', svm.model)])

knn_pipeline = Pipeline(steps=[('normalize', Normalizer()),
                               ('estimator', knn.model)])

# 10-fold shuffled cross-validation scored by ROC AUC.
# KFold's `shuffle` is keyword-only in current scikit-learn, so the positional
# form `KFold(10, True)` fails there. The fixed random_state makes all three
# models score on the same folds instead of a fresh shuffle per call.
kfold = KFold(n_splits=10, shuffle=True, random_state=1001)
lr_cv = cross_val_score(lr.model, X_train, y_train, cv=kfold, scoring="roc_auc")
svm_cv = cross_val_score(svm_pipeline, X_train, y_train, cv=kfold, scoring="roc_auc")
knn_cv = cross_val_score(knn_pipeline, X_train, y_train, cv=kfold, scoring="roc_auc")

In [None]:
# Report the mean CV AUC and its standard error, sqrt(var / n_scores), for
# each baseline. Every model after the first is preceded by a blank line.
cv_results = [("LR", lr_cv), ("SVM", svm_cv), ("KNN", knn_cv)]
for position, (label, scores) in enumerate(cv_results):
    lead = "" if position == 0 else "\n"
    std_err = np.sqrt(np.var(scores) / len(scores))
    print(lead + label + " Mean CV AUC Score: {:0.3}".format(np.mean(scores))
          + "\n" + label + " StdErr CV AUC Score: {:0.3}".format(std_err))

In [None]:
# ROC curves for the baseline models on the single held-out test split.
# The split cell names the labels y_train / y_test; the capitalised
# Y_train / Y_test previously used here are not defined in this notebook.
# NOTE(review): `lr` is a Model wrapper (the CV cell passes `lr.model` to
# cross_val_score) — confirm Model itself exposes fit / classes_ / predict_proba.
lr.fit(X_train_filt, y_train)
svm_pipeline.fit(X_train_filt, y_train)
knn_pipeline.fit(X_train_filt, y_train)

# Select the predict_proba column belonging to class 1. The mask is looked up
# per model rather than reusing the logistic-regression mask for KNN, and the
# resulting single column is flattened to a 1-D score vector.
lr_pos_class = lr.classes_ == 1
knn_pos_class = knn_pipeline.classes_ == 1
preds_lr = lr.predict_proba(X_test_filt)[:, lr_pos_class].ravel()
preds_svm = svm_pipeline.decision_function(X_test_filt)  # signed scores, not probabilities
preds_knn = knn_pipeline.predict_proba(X_test_filt)[:, knn_pos_class].ravel()
preds_zip = list(zip([preds_lr, preds_svm, preds_knn], ["LogisticRegression", "SVM", "KNN"]))

fig, axes = plt.subplots(1, 1, figsize=(8, 6))
for each_preds, each_model in preds_zip:
    fpr, tpr, thresholds = roc_curve(y_test, each_preds)
    roc_auc = auc(fpr, tpr)
    axes.plot(fpr, tpr, label=each_model + " (AUC = {:0.3})".format(roc_auc))

axes.set_title("ROC Curves for Baseline Models")
axes.set_xlabel("fpr")
axes.set_ylabel("tpr")
axes.legend();

# Pipelines

In [None]:
# Pipelines that all start with a default VarianceThreshold step.
# Note that `svm_pipeline` rebinds the name used by the baseline cell above.
lr_pipeline = Pipeline(steps=[
    ('variance_thresh', VarianceThreshold()),
    ('estimator', LogisticRegression()),
])

# The SVM variant additionally passes the data through Normalizer.
svm_pipeline = Pipeline(steps=[
    ('variance_thresh', VarianceThreshold()),
    ('normalize', Normalizer()),
    ('estimator', SVC()),
])

gbm_pipeline = Pipeline(steps=[
    ('variance_thresh', VarianceThreshold()),
    ('estimator', GradientBoostingClassifier()),
])

# GBM Tuning 

In [None]:
# Model wrapper around a default GradientBoostingClassifier, built on the
# filtered feature set.
gbc = Model(
    "GBC",
    GradientBoostingClassifier(),
    X_train_filt, y_train,
    X_test_filt, y_test,
)

## Iteration 1

In [None]:
# Grid over n_estimators = 10, 30, ..., 490 with otherwise default settings.
gbc.addIteration(
    "Estimators",
    GradientBoostingClassifier(),
    {'estimator__n_estimators': list(range(10, 500, 20))},
    plot=True,
)
gbc.withinCompare()

## Iteration 2 

In [None]:
# Grid over max_depth = 1..14 with n_estimators fixed at 50
# (presumably the value picked from the previous iteration — confirm).
gbc.addIteration(
    "Max Depth",
    GradientBoostingClassifier(n_estimators=50),
    {'estimator__max_depth': list(range(1, 15))},
    plot=True,
)
gbc.withinCompare()

## Iteration 3

In [None]:
# Grid over min_samples_leaf = 5, 15, ..., 495 with n_estimators and max_depth fixed.
gbc.addIteration(
    "Min Samples",
    GradientBoostingClassifier(n_estimators=50, max_depth=2),
    {'estimator__min_samples_leaf': list(range(5, 500, 10))},
    plot=True,
)
gbc.withinCompare()

## Iteration 4

In [None]:
# Grid over even max_features values from 2 up to, but not including, the
# number of columns in the filtered training frame.
gbc.addIteration(
    "Max Features",
    GradientBoostingClassifier(n_estimators=50, max_depth=2, min_samples_leaf=215),
    {'estimator__max_features': list(range(2, X_train_filt.shape[1], 2))},
    plot=True,
)
gbc.withinCompare()

## Iteration 5

In [None]:
# Grid over subsample fractions 0.10, 0.15, ..., 1.00.
gbc.addIteration(
    "Subsample",
    GradientBoostingClassifier(n_estimators=50, max_depth=2,
                               min_samples_leaf=215, max_features=14),
    {'estimator__subsample': np.arange(10, 105, 5) / 100},
    plot=True,
)
gbc.withinCompare()

## Iteration 6

In [None]:
# Joint grid: n_estimators = 10, 30, ..., 990 crossed with learning rates
# 10**-3, 10**-2 and 10**-1.
gbc.addIteration(
    "Estimators & Learning Rate",
    GradientBoostingClassifier(
        n_estimators=50,
        max_depth=2,
        min_samples_leaf=215,
        max_features=14,
        subsample=0.85,
    ),
    {
        'estimator__n_estimators': list(range(10, 1000, 20)),
        'estimator__learning_rate': [10**x for x in range(-3, 0)],
    },
    plot=True,
)
gbc.withinCompare()

## Iteration 7 

In [None]:
# Second n_estimators grid (100, 200, ..., 2900), this time with the learning
# rate fixed at 0.01.
gbc.addIteration(
    "Estimators 2",
    GradientBoostingClassifier(
        n_estimators=50,
        max_depth=2,
        min_samples_leaf=215,
        max_features=14,
        subsample=0.85,
        learning_rate=0.01,
    ),
    {'estimator__n_estimators': list(range(100, 3000, 100))},
    plot=True,
)
gbc.withinCompare()

# Random Forest



In [None]:
# Model wrapper around a default RandomForestClassifier, built on the
# filtered feature set.
rdf = Model(
    "RDF",
    RandomForestClassifier(),
    X_train_filt, y_train,
    X_test_filt, y_test,
)

## Iteration 1

In [None]:
# Grid over n_estimators = 1, 6, ..., 496 with otherwise default settings.
rdf.addIteration(
    "Number of Estimators",
    RandomForestClassifier(),
    {'estimator__n_estimators': list(range(1, 500, 5))},
    plot=True,
)
rdf.withinCompare()

## Iteration 2

In [None]:
# Grid over max_features for a 421-tree forest. The upper end (39) is capped at
# the number of columns in the filtered frame: an integer max_features above
# the column count is not a meaningful setting, and scikit-learn versions
# differ on whether they reject it. This mirrors the GBM max-features cell,
# which also bounds its grid by X_train_filt.shape[1].
rdf.addIteration(
    "Max Features",
    RandomForestClassifier(n_estimators=421),
    {'estimator__max_features': list(range(1, min(40, X_train_filt.shape[1] + 1)))},
    plot=True,
)
rdf.withinCompare()



## Iteration 3

In [None]:
# Grid over min_samples_leaf = 2, 4, ..., 18 with n_estimators and max_features fixed.
rdf.addIteration(
    "Min Samples Leaf",
    RandomForestClassifier(n_estimators=421, max_features=7),
    {'estimator__min_samples_leaf': list(range(2, 20, 2))},
    plot=True,
)
rdf.withinCompare()


## Iteration 4

In [None]:
# Grid over min_samples_split = 2, 4, ..., 38 with the other forest settings fixed.
rdf.addIteration(
    "Min Samples Split",
    RandomForestClassifier(n_estimators=421, max_features=7, min_samples_leaf=8),
    {'estimator__min_samples_split': list(range(2, 40, 2))},
    plot=True,
)
rdf.withinCompare()


# KNN

In [None]:
# Model wrapper around a default KNeighborsClassifier on the filtered features.
# This rebinds `knn`, which the baseline cell earlier bound to a 35-neighbour model.
knn = Model(
    "KNN",
    KNeighborsClassifier(),
    X_train_filt, y_train,
    X_test_filt, y_test,
)

In [None]:
# Grid over n_neighbors = 1..49 using the classifier's default weighting.
knn.addIteration(
    "N-Neighbors(U)",
    KNeighborsClassifier(),
    {'estimator__n_neighbors': list(range(1, 50, 1))},
    plot=True,
)
knn.withinCompare()

In [None]:
# Same n_neighbors grid (1..49), this time with weights='distance'.
knn.addIteration(
    "N-Neighbors(D)",
    KNeighborsClassifier(weights='distance'),
    {'estimator__n_neighbors': list(range(1, 50, 1))},
    plot=True,
)
knn.withinCompare()

# Model Comparison


In [None]:
# Pass the three tuned Model objects (GBC, random forest, KNN) to betweenCompare.
# betweenCompare is not defined in this notebook; presumably it comes from the
# `%run betweenCompare.py` line in the first cell — confirm.
betweenCompare([gbc, rdf, knn])