# Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import RFECV
import pickle

In [None]:
# define function to pickle a model after it is fit
def pickle_model(model_name, model):
    """Serialize a fitted model to ``./models/<model_name>.pkl``.

    Parameters
    ----------
    model_name : str
        Base file name (without extension) of the pickle file to write.
    model : object
        Any picklable object, typically a fitted estimator.

    Raises
    ------
    OSError
        If the target file cannot be opened for writing (for example when
        the ``./models`` directory does not exist).
    """
    model_pickle_path = './models/{}.pkl'.format(model_name)
    # the context manager closes the file even when pickle.dump raises
    with open(model_pickle_path, 'wb') as model_pickle:
        pickle.dump(model, model_pickle)

# Import Data

In [None]:
# import dataset into pandas and assign to variable
# NOTE: read_pickle unpickles the file, so only load files from a trusted source
data = pd.read_pickle('./cleaned_data/dummies_user_df.pkl')

In [None]:
# (rows, columns) of the loaded dataframe
data.shape

In [None]:
# preview the first rows of the dataframe (head() defaults to 5 rows)
data.head()

In [None]:
# count the missing values in each column
data.isnull().sum()

In [None]:
# descriptive statistics per column (numeric columns by default)
data.describe()

# Data Preparation

In [None]:
# keep every column except the user identifier; the result still contains
# the 'target' column, which is split off in the next step
features = data.drop(columns='user_id').copy()

In [None]:
# split the frame into the predictor matrix X and the label vector y
X = features.drop(columns='target')
y = features['target']

In [None]:
# create training and testing datasets with 80/20 split
# stratify on y so both splits keep the original class ratio; the target is
# imbalanced, and an unstratified split can skew the minority share of the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34, stratify=y
)

In [None]:
# report the number of rows in each split
print(f'X_train {len(X_train)}')
print(f'X_test {len(X_test)}')
print(f'y_train {len(y_train)}')
print(f'y_test {len(y_test)}')

## Class Imbalance

In [None]:
# glue the training features and labels back together column-wise so the
# rows can be filtered by class
data_train = pd.concat([X_train, y_train], axis='columns')

In [None]:
# split the training rows by class label: target == 0 -> happy, target == 1 -> sad
happy = data_train.loc[data_train['target'] == 0]
sad = data_train.loc[data_train['target'] == 1]

In [None]:
# report how many training rows belong to each class
print(f'# of Happy Users: {len(happy)}')
print(f'# of Sad Users: {len(sad)}')

In [None]:
# visualize class balance
# bar chart of how many training rows fall into each value of 'target'
plt.figure(figsize=(10,8))
sns.set_style('whitegrid')
sns.countplot(x='target',data=data_train, palette='GnBu_d')

In [None]:
# Undersample majority
# NOTE: despite its name, `sad_downsampled` holds the down-sampled *happy* rows
# (the sample is drawn from `happy` and sized to len(sad)).
# Sampling without replacement requires len(sad) <= len(happy).
sad_downsampled = resample(happy,
                                replace = False, # sample without replacement
                                n_samples = len(sad), # match minority n
                                random_state = 23) # reproducible results

In [None]:
# stack the sad rows on top of the down-sampled happy rows to form the
# balanced training frame (rows stay grouped by class, not shuffled)
downsampled = pd.concat([sad, sad_downsampled], axis='index')

In [None]:
# confirm both classes now have the same number of rows
downsampled['target'].value_counts()

In [None]:
# replace the training features and labels with the balanced, down-sampled rows
X_train = downsampled.drop(columns='target')
y_train = downsampled['target']

In [None]:
# report the split sizes again; only the training split changed after down-sampling
print(f'X_train {len(X_train)}')
print(f'X_test {len(X_test)}')
print(f'y_train {len(y_train)}')
print(f'y_test {len(y_test)}')

## Scaling the Data

In [None]:
# learn the per-column mean and standard deviation from the training data only,
# then apply that scaling to the training data
scaler = StandardScaler().fit(X_train)
scaled_train = scaler.transform(X_train)

In [None]:
#Use the scaler that is fit to the training data to transform the testing data
# (transform only, never fit, so no test-set statistics enter the scaling)
scaled_test = scaler.transform(X_test)

In [None]:
# wrap the scaled training array in a dataframe that reuses the original column names
# NOTE: the new frame gets a fresh 0..n-1 index, so it lines up with y_train
# by position only, not by index label
scaled_train = pd.DataFrame(data=scaled_train, columns=X_train.columns)
scaled_train.head(5)

In [None]:
# (rows, columns) of the scaled training dataframe
scaled_train.shape

In [None]:
# wrap the scaled testing array in a dataframe that reuses the original column names
# NOTE: the new frame gets a fresh 0..n-1 index, so it lines up with y_test
# by position only, not by index label
scaled_test = pd.DataFrame(data=scaled_test, columns=X_test.columns)
scaled_test.head(5)

In [None]:
# (rows, columns) of the scaled testing dataframe
scaled_test.shape

# Base Models

## Naive Bayes

In [None]:
# create a Naive Bayes instance and assign to variable
# GaussianNB is constructed with its default parameters
clf = GaussianNB()

In [None]:
# fit Naive Bayes instance to training data and assign to variable
# fit() returns the estimator itself, so nb_base refers to the same object as clf
nb_base = clf.fit(scaled_train, y_train)

In [None]:
# display the fitted estimator's representation
nb_base

In [None]:
# per-class precision / recall / f1 of the Naive Bayes model on the training data
nb_preds_train = nb_base.predict(X=scaled_train)
print(classification_report(y_true=y_train, y_pred=nb_preds_train))

In [None]:
# confusion matrix heatmap for the Naive Bayes model on the training data
cm_nb_train = confusion_matrix(y_true=y_train, y_pred=nb_preds_train)

plt.figure(figsize=(9, 9))
sns.heatmap(
    cm_nb_train,
    annot=True,
    fmt=".3f",
    linewidths=0.5,
    square=True,
    cmap=plt.cm.Blues,
)

In [None]:
# per-class precision / recall / f1 of the Naive Bayes model on the held-out test data
nb_preds_test = nb_base.predict(X=scaled_test)
print(classification_report(y_true=y_test, y_pred=nb_preds_test))

In [None]:
# confusion matrix heatmap for the Naive Bayes model on the test data
cm_nb_test = confusion_matrix(y_true=y_test, y_pred=nb_preds_test)

plt.figure(figsize=(9, 9))
sns.heatmap(
    cm_nb_test,
    annot=True,
    fmt=".3f",
    linewidths=0.5,
    square=True,
    cmap=plt.cm.Blues,
)

In [None]:
# # pickle base model (the first argument is the file name, so pass a string)
# pickle_model('nb_base', nb_base)

## Support Vector Machine

In [None]:
# create a Support Vector Machine instance for linear SVM
# all other SVC parameters are left at their defaults
svm = SVC(kernel='linear')

In [None]:
# fit SVM to training data
# fit() returns the estimator itself, so svm_base refers to the same object as svm
svm_base = svm.fit(scaled_train, y_train)

In [None]:
# per-class precision / recall / f1 of the linear SVM on the training data
svm_preds_train = svm_base.predict(X=scaled_train)
print(classification_report(y_true=y_train, y_pred=svm_preds_train))

In [None]:
# confusion matrix heatmap for the linear SVM on the training data
cm_svm_train = confusion_matrix(y_true=y_train, y_pred=svm_preds_train)

plt.figure(figsize=(9, 9))
sns.heatmap(
    cm_svm_train,
    annot=True,
    fmt=".3f",
    linewidths=0.5,
    square=True,
    cmap=plt.cm.Blues,
)

In [None]:
# per-class precision / recall / f1 of the linear SVM on the held-out test data
svm_preds_test = svm_base.predict(X=scaled_test)
print(classification_report(y_true=y_test, y_pred=svm_preds_test))

In [None]:
# confusion matrix heatmap for the linear SVM on the test data
cm_svm_test = confusion_matrix(y_true=y_test, y_pred=svm_preds_test)

plt.figure(figsize=(9, 9))
sns.heatmap(
    cm_svm_test,
    annot=True,
    fmt=".3f",
    linewidths=0.5,
    square=True,
    cmap=plt.cm.Blues,
)

In [None]:
# # pickle base model (the first argument is the file name, so pass a string)
# pickle_model('svm_base', svm_base)

# Feature Selection

## Recursive Feature Elimination with Cross Validation

In [None]:
# create support vector machine classifier object to use as estimator for recursive feature elimination
# a linear kernel exposes coef_, which the elimination uses to rank the features
svc = SVC(kernel='linear')

In [None]:
# recursive feature elimination with cross-validation: drop one feature per
# iteration (step=1), score each feature count by accuracy, and evaluate with
# 2 stratified folds, using the linear SVC as the ranking estimator
rfecv = RFECV(
    estimator=svc,
    step=1,
    cv=StratifiedKFold(n_splits=2),
    scoring='accuracy',
)

In [None]:
# fit RFE object to training data
# NOTE(review): with step=1 the estimator is refit once per removed feature in
# every fold, so this cell may be slow on wide data
rfecv.fit(scaled_train, y_train)

In [None]:
print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the mean
# cross-validation score per feature count now lives in `cv_results_`.
# Fall back to `grid_scores_` only on older releases that lack `cv_results_`.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
# the x axis starts at 1 because step=1 scores every feature count from 1 upward
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

# Randomized Search

# Final Models