### Imports

In [48]:
import matplotlib.pyplot as plt
import math
import pandas_datareader as web
import pandas as pd
import numpy as np
import seaborn as sns
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from sklearn import metrics


%matplotlib inline

### Read in data

In [2]:
pwd

'/Users/hughkohl/Desktop/data-science-projects/Data-Science-Projects/ds_projects-master/QBR_analysis'

In [9]:
df_ent = pd.read_excel('../../../../SnS_QBR_data/Test_Public_List__Renewable_Entitlements__97373__02-07-2020.xlsx', header=9)

In [9]:
df_trans = pd.read_excel('../../../../SnS_QBR_data/Test_Public_List__Transaction_History__97373__02-07-2020.xlsx', sheet_name="Sales Order History", header=1)

### Data cleaning

In [10]:
df_trans.columns = [column.replace(' ', '_') for column in df_trans.columns]

In [16]:
df_trans.groupby(["Original_site_number","Original_part_number"]).sum().reset_index()[['Original_site_number', 'Original_part_number','Billed_quantity', 'Line_item', 'USD_extended_price' ]]

Unnamed: 0,Original_site_number,Original_part_number,Billed_quantity,Line_item,USD_extended_price
0,3000988,D039ULL,1,30,0.00
1,3000988,D0J2FLL,1,10,2170.00
2,3000988,D0J38LL,3,50,3240.00
3,3000988,E04AVLL,2,20,5280.00
4,3000988,E0BXILL,4,120,1873.00
...,...,...,...,...,...
2055,7937174,E0AUULL,0,700,61064.73
2056,7937174,E0AVYLL,0,500,25884.16
2057,7937174,E0AVZLL,0,260,1869.36
2058,7948723,D0GY8LL,56,20,0.00


In [18]:
pd.set_option('display.max_columns', None)

### Feature Engineering & Transformation

In [20]:
df_trans['renewed'] = df_trans.End_date - df_trans.Start_date

In [46]:
df_trans['renew_time'] = df_trans['renew_time'].dt.days

In [44]:
df_trans.loc[df_trans['renew_time'].dt.days > 300, 'renewed'] = 1
df_trans.loc[df_trans['renew_time'].dt.days <= 300, 'renewed'] = 0

In [51]:
df_prods = pd.get_dummies(df_trans['Original_part_number'], prefix_sep='_', drop_first=True)

In [65]:
df_var = df_trans[['renew_time', 'Billed_quantity', 'Line_item', 'USD_extended_price','renewed']]

In [66]:
df = pd.concat([df_prods, df_var], axis=1)

In [74]:
df.drop('renew_time', axis=1, inplace=True)

### Train Test split and model instatiation

In [75]:
X = df.iloc[:,:-1]
y = df['renewed']

In [76]:
# convert selected features do dummies
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# instant
log_mod = LogisticRegression()
knn_mod = KNeighborsClassifier()

#fit
knn_mod.fit(X_train, y_train)
log_mod.fit(X_train,y_train)

#preds logistic
y_preds = log_mod.predict(X_test)
y_probs = log_mod.predict_proba(X_test)

#preds knn
y_preds_knn = knn_mod.predict(X_test)
y_probs_knn = knn_mod.predict_proba(X_test)

In [77]:
y_preds_knn

array([1, 1, 1, ..., 1, 1, 1])

### Model Testing

In [78]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators=1000)

dt.fit(X_train, y_train)
rf.fit(X_train, y_train)

preds_dt = dt.predict(X_test)
preds_rf = rf.predict(X_test)

print("Logistic Regression Results")
print(metrics.confusion_matrix(y_test, y_preds))
print(metrics.precision_score(y_test, y_preds) , metrics.recall_score(y_test, y_preds))

print("Decision Tree Results")
print(metrics.confusion_matrix(y_test, preds_dt))
print(metrics.precision_score(y_test, preds_dt) , metrics.recall_score(y_test, y_preds))

print("Random Forest Results")
print(metrics.confusion_matrix(y_test, preds_rf))
print(metrics.precision_score(y_test, preds_rf) , metrics.recall_score(y_test, y_preds))

Logistic Regression Results
[[   1  607]
 [  10 2883]]
0.8260744985673353 0.9965433805737989
Decision Tree Results
[[ 363  245]
 [ 168 2725]]
0.9175084175084175 0.9965433805737989
Random Forest Results
[[ 297  311]
 [  85 2808]]
0.9002885540237255 0.9965433805737989


In [79]:
gb = GradientBoostingClassifier()

parameters = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.1, 1],
    "max_depth":[3, 8 ,10],
    "max_features":["sqrt"],
    "n_estimators":[10]
    }

clf = GridSearchCV(gb, parameters, cv=3, n_jobs=-1)
clf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.025, 0.1, 1],
                         'loss': ['deviance'], 'max_depth': [3, 8, 10],
                         'max_features': ['sqrt'], 'n_estimators': [10]})

In [80]:
gb_preds = clf.best_estimator_.predict(X_test)

In [81]:
print("Gradient Boosting Results")
print(metrics.confusion_matrix(y_test, gb_preds))
print(metrics.precision_score(y_test, gb_preds) , metrics.recall_score(y_test, gb_preds))

Gradient Boosting Results
[[ 149  459]
 [  92 2801]]
0.8592024539877301 0.9681991012789491
