# In [None]:
import os
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, GridSearchCV

# Setup
####################
# 'journeys.csv' is read relative to the current working directory.
plt.style.use("ggplot")

df = pd.read_csv('journeys.csv')
targets = df['Conversion']

# Remove the columns that are not used as model inputs, then one-hot encode
# whatever categorical columns remain.
unused_columns = ['Journey Start Date', 'Journey End Date', 'Events Combo', 'User-Journey']
df.drop(unused_columns, axis=1, inplace=True)
data_final = pd.get_dummies(df)

# Split the encoded frame into predictors (X) and a one-column target frame (y).
is_target = data_final.columns == 'Conversion'
X = data_final.loc[:, ~is_target]
y = data_final.loc[:, is_target]

# Implement SMOTE prior to models
########################
# Hold out 30% of the rows first; only the training portion is oversampled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

# The sampler is named `smote` (not `os`) so it does not shadow the stdlib
# `os` module imported at the top of the file.
smote = SMOTE(random_state=0)
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)

# Re-wrap the resampled output as labelled DataFrames so later steps can
# select by column name regardless of what fit_resample returned.
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['Conversion'])

# From here on X and y are rebound to lists of column NAMES (predictors and
# target respectively), no longer the data frames built during setup.
data_final_vars = data_final.columns.values.tolist()
y = ['Conversion']
X = [name for name in data_final_vars if name not in y]

# RFE
#############
# Recursive feature elimination on the oversampled training data, dropping
# 20 features per iteration (step=20).
logreg = LogisticRegression(max_iter=5000)
rfe = RFE(logreg, step=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

# rfe.support_ is a boolean mask aligned with the columns RFE was fitted on,
# so index those columns directly instead of walking a separate name list.
cols = os_data_X.columns[rfe.support_].tolist()

# X / y now hold the oversampled training data restricted to the kept features.
X = os_data_X[cols]
y = os_data_y['Conversion']

# Logistic
####################
warnings.filterwarnings('ignore')

# Parameter grid
# Split into two sub-grids so every candidate is a penalty/solver pair that
# LogisticRegression supports: 'newton-cg' and 'lbfgs' handle only 'l2',
# while 'liblinear' handles both 'l1' and 'l2'.
c_values = np.logspace(-3, 3, 7)
parameters = [
    {'penalty': ['l1', 'l2'], 'C': c_values, 'solver': ['liblinear']},
    {'penalty': ['l2'], 'C': c_values, 'solver': ['newton-cg', 'lbfgs']},
]

# Build the model
# X / y are the SMOTE-resampled training rows. Re-splitting them would put
# synthetic samples in the "test" set and discard the real hold-out, so train
# on all of them and keep the original hold-out (from the first
# train_test_split) for evaluation, restricted to the RFE-selected columns.
X_train, y_train = X, y
X_test = X_test[cols]
y_test = np.ravel(y_test)  # flatten the one-column target to 1-D

# scikit-learn scorer names are lowercase: 'f1', not 'F1'.
log_reg = GridSearchCV(logreg, param_grid=parameters, scoring='f1', cv=10)
log_reg.fit(X_train, y_train)

print("Tuned Hyperparameters :", log_reg.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric (F1).
print("Best CV F1 :", log_reg.best_score_)

# Plot
################
# Use the fitted grid search (`log_reg`), not the bare `logreg` template:
# RFE and GridSearchCV fit clones, so `logreg` itself is never fitted.
# With the default refit, predict_proba delegates to the best estimator.
test_probs = log_reg.predict_proba(X_test)[:, 1]

# Compute AUC from the same probabilities that drive the curve, so the
# number in the legend matches the plotted line.
logit_roc_auc = roc_auc_score(y_test, test_probs)
fpr, tpr, thresholds = roc_curve(y_test, test_probs)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()