In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tseries.offsets import DateOffset
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, roc_curve
from sklearn.model_selection import train_test_split
%matplotlib inline

# Logistic Regression Model

List the candidate features that may carry some predictive signal.

In [None]:
# Candidate predictor columns for the logistic regression, one per line so
# additions and removals show up as single-line diffs.
features = [
    "avg_dist",
    "avg_surge",
    "surge_pct",
    "trips_in_first_30_days",
    "luxury_car_user",
    "weekday_pct",
    "city__King's Landing",
    "city__Winterfell",
    "phone__iPhone",
    "phone__other",
    "avg_rating_of_driver__low",
    "avg_rating_by_driver__low",
    "time_as_user",
]

In [None]:
# Inspect dtypes and non-null counts of the selected feature columns before modelling.
result_df[features].info()

Fit a simple logistic regression model on the full dataset.

In [None]:
# Read the target without pop(): pop() removes "label" from result_df in place,
# so re-running this cell would raise KeyError on the second execution.
y = result_df["label"]
# Design matrix restricted to the candidate feature columns.
# NOTE(review): no constant column is included here; as far as I know sm.Logit
# does not add an intercept on its own — confirm that a no-intercept model is intended.
X = result_df[features]

In [None]:
# Logistic regression specified on the full dataset (all rows of X / y).
# NOTE(review): `sm` is not imported in the visible import cell — presumably
# `import statsmodels.api as sm`; confirm it is imported before this cell runs.
logit = sm.Logit(y, X)

In [None]:
# Fit the model; `result` is the fitted-results object inspected in the next cell.
result = logit.fit()

In [None]:
# Display the fit summary for the full-data model.
result.summary()

Create a 70/30 train/test split (70% of rows for training, 30% held out for testing)

In [None]:
# Keep 70% of the rows for training and hold out the rest for evaluation.
# random_state is fixed so the split — and every metric and curve derived from
# it — is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

Fit the model on the training set

In [None]:
# Rebinds `logit` (previously the full-data model) to a model specified on the
# training split only.
logit = sm.Logit(y_train, X_train)

In [None]:
# Rebinds `result` to the training-split fit; this is the model scored on
# X_test further down.
result = logit.fit()

In [None]:
# Display the fit summary for the model trained on the training split.
result.summary()

Try again with a subset of features

In [None]:
# Reduced feature set used for the second, smaller logistic model.
featuresIwant = [
    "avg_dist",
    "weekday_pct",
    "avg_rating_by_driver__low",
    "surge_pct",
    "trips_in_first_30_days",
]

In [None]:
# Training rows restricted to the reduced feature subset.
X_train_new = X_train[featuresIwant]

In [None]:
# Logistic regression on the same training target, using only the reduced feature subset.
logit_new = sm.Logit(y_train, X_train_new)

In [None]:
# Fit the reduced model; kept under a separate name so both fits can be compared.
result_new = logit_new.fit()

In [None]:
# Display the fit summary for the reduced-feature model.
result_new.summary()

In [None]:
# Score the held-out test rows with both fitted logistic models; these scores
# are the inputs to roc_curve in the ROC section below.
y_pred = result.predict(X_test)
# The reduced model was fit on the featuresIwant columns only, so it is given
# the same column subset of the test set.
y_pred_new = result_new.predict(X_test[featuresIwant])

# Generate ROC Curve


Create an ROC Curve using the models above, for purposes of comparison

In [None]:
# ROC points for the all-features model and the reduced model, both evaluated
# against the same held-out labels.  The keyword arguments that only restated
# roc_curve's defaults (pos_label=None, sample_weight=None,
# drop_intermediate=True) are left implicit.
fpr, tpr, threshold = roc_curve(y_test, y_pred)
fpr1, tpr1, threshold1 = roc_curve(y_test, y_pred_new)

In [None]:
# Overlay the two logistic-regression ROC curves on one set of axes.
# Uses the explicit Axes API, adds a title so the figure stands on its own,
# and ends with ';' so the Legend repr is not echoed as cell output.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="all_features")
ax.plot(fpr1, tpr1, label="5 features")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curves: logistic regression")
ax.legend();

In [None]:
# Preview the first rows of `df` before building the random-forest inputs.
df.head()

In [None]:
# Work on a copy so the dummy columns added below do not alter `df`.
df_2 = df.copy()

In [None]:
# One-hot encode every column named in `categories` and append the indicator
# columns to df_2.  All dummy frames are built first and concatenated once,
# rather than re-copying df_2 on every loop iteration, and no loop temporaries
# are left behind in the notebook namespace.
# The prefix keeps the original "<column>_" form; get_dummies then inserts its
# own "_" separator, producing the double-underscore names
# (e.g. "city__Winterfell") that later cells select.
# NOTE(review): re-running this cell without re-running `df_2 = df.copy()`
# appends the dummy columns a second time.
dummy_frames = [
    pd.get_dummies(df_2[col], prefix=str(col) + "_") for col in categories
]
df_2 = pd.concat([df_2] + dummy_frames, axis=1)

# Random Forest Classifier Model

Prepare data for Random Forest Classifier

In [None]:
# Separate copy of the dummy-encoded frame for the random-forest workflow.
rf_df = df_2.copy()

In [None]:
# List the available columns to pick the model inputs from.
rf_df.columns

In [None]:
# Input columns for the random forest.  This rebinds `features`, which earlier
# held the logistic-regression column list.
features = [
    "avg_dist",
    "avg_surge",
    "surge_pct",
    "trips_in_first_30_days",
    "luxury_car_user",
    "weekday_pct",
    "city__Astapor",
    "city__King's Landing",
    "city__Winterfell",
    "phone__Android",
    "phone__iPhone",
    "phone__other",
    "avg_rating_of_driver__high",
    "avg_rating_of_driver__low",
    "avg_rating_by_driver__high",
    "avg_rating_by_driver__low",
]

In [None]:
# Inspect dtypes and non-null counts of the random-forest input columns.
rf_df[features].info()

In [None]:
# Read the target without pop(): pop() removes "label" from rf_df in place, so
# re-running this cell would raise KeyError on the second execution.
y_rf = rf_df["label"].values
# Plain NumPy matrix of the model inputs; column order follows `features`.
X_rf = rf_df[features].values

Generate a train/test split

In [None]:
# Keep 70% of the rows for training and hold out the rest for evaluation.
# random_state is fixed so the split is reproducible after a kernel restart.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_rf, y_rf, train_size=0.7, random_state=42
)

In [None]:
# Sanity check: size of the full label array.
y_rf.shape

Fit the initial model

In [None]:
# 100 trees, each split considering 4 candidate features.  random_state is fixed
# so the fitted forest (and its feature importances / ROC curve) is reproducible.
rf = RandomForestClassifier(n_estimators=100, max_features=4, random_state=42)

In [None]:
# Train the forest on the training split only.
rf.fit(Xtrain, ytrain)

In [None]:
# Predicted class for each held-out row (hard labels, not probabilities).
predictions = rf.predict(Xtest)

In [None]:
# Sanity check: number of held-out labels.
ytest.shape

Evaluate Feature Importance

In [None]:
# One importance value per input column, in the column order of X_rf (i.e. the
# order of `features`).
imp_features = rf.feature_importances_

In [None]:
# Label each importance value with its feature name in a one-column frame.
importance_list = pd.DataFrame({"rel_importance": imp_features}, index=features)
# Show the table with the feature names as an ordinary column (display only;
# importance_list itself is not modified).
importance_list.reset_index()

In [None]:
# Rank features from most to least important; rebinding the name instead of
# using inplace=True.
importance_list = importance_list.sort_values("rel_importance", ascending=False)

In [None]:
# Display the ranked importances with feature names as a regular column
# (the result is shown, not assigned).
importance_list.reset_index()

# Generate ROC Curve 2

Incorporate Random Forest classifier, generating a fresh ROC curve

In [None]:
# Logistic-regression curves, recomputed so this cell carries everything the
# comparison plot needs.
fpr, tpr, threshold = roc_curve(y_test, y_pred)
fpr1, tpr1, threshold1 = roc_curve(y_test, y_pred_new)

# The random-forest curve must be built from continuous scores.  Feeding
# rf.predict() hard labels to roc_curve collapses the curve to a single
# operating point, which makes it incomparable with the logistic curves.
# Column 1 of predict_proba is the probability of rf.classes_[1].
# NOTE(review): assumes binary labels whose positive class sorts last
# (e.g. 0/1) — confirm against the label encoding.
rf_scores = rf.predict_proba(Xtest)[:, 1]
fpr2, tpr2, threshold2 = roc_curve(ytest, rf_scores)

In [None]:
# Number of points on the random-forest ROC curve.
fpr2.shape

In [None]:
# Overlay all three ROC curves on one set of axes.
# Uses the explicit Axes API, adds a title so the figure stands on its own,
# and ends with ';' so the Legend repr is not echoed as cell output.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="logistic_all_features")
ax.plot(fpr1, tpr1, label="logistic_5 features")
ax.plot(fpr2, tpr2, label="Random forest")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curves: logistic regression vs. random forest")
ax.legend();

In [None]:
# Sanity check: number of held-out labels used for the random-forest ROC curve.
ytest.shape