In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [38]:
# Bring in modeling data
# df = pd.read_csv('modeling_dfs/users_order_by_prev_dept.csv')
df = pd.read_csv('modeling_dfs/orders_by_top50organic_by_aisle_by_dept.csv')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
display(df.head(5))

# Bring in products table with names
df_products = pd.read_csv('products_organic.csv')
df_products.drop(['Unnamed: 0'],axis=1,inplace=True)
display(df_products.head(5))

# Bring in top 50 organic product names
top50_organic = pd.read_csv('top50_organic_items.csv')
top50_organic.columns = ['product_name','num_items']
display(top50_organic.head(5))

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,organic,organic_produce,any_organic,...,"('department', 'household')","('department', 'international')","('department', 'meat seafood')","('department', 'missing')","('department', 'other')","('department', 'pantry')","('department', 'personal care')","('department', 'pets')","('department', 'produce')","('department', 'snacks')"
0,1187899,1,train,11,4,8,14.0,3.0,0.0,True,...,1,0,0,0,0,0,0,0,0,3
1,1492625,2,train,15,1,11,30.0,6.0,1.0,True,...,0,0,0,0,0,0,1,0,7,5
2,2196797,5,train,5,0,11,6.0,5.0,3.0,True,...,1,2,0,0,0,1,0,0,4,0
3,525192,7,train,21,2,11,6.0,1.0,0.0,True,...,0,0,1,0,0,1,0,0,1,0
4,880375,8,train,4,1,14,10.0,9.0,7.0,True,...,0,0,0,0,0,1,0,0,13,0


Unnamed: 0,product_id,product_name,aisle_id,department_id,organic,aisle,department,organic_produce
0,1,Chocolate Sandwich Cookies,61,19,0,cookies cakes,snacks,0
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,0,cookies cakes,snacks,0
2,102,Danish Butter Cookies,61,19,0,cookies cakes,snacks,0
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,0,cookies cakes,snacks,0
4,285,Mini Nilla Wafers Munch Pack,61,19,0,cookies cakes,snacks,0


Unnamed: 0,product_name,num_items
0,Bag of Organic Bananas,15480
1,Organic Strawberries,10894
2,Organic Baby Spinach,9784
3,Organic Avocado,7409
4,Organic Hass Avocado,7293


In [27]:
%%time

# all parameters not specified are set to their defaults
y_col = '21137'

feature_cols = ['order_dow','order_hour_of_day']

feature_cols += [colname for colname in df.columns.values if 'department' in colname] # do not include departments
print('Features:')
print(feature_cols)

logisticRegr = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced')
logisticRegr.fit(df[feature_cols],df[y_col])

predictions = logisticRegr.predict(df[feature_cols])
df['predicted_'+y_col] = predictions
display(df[['predicted_'+predicted_col,y_col]])

print('predicted:')
display(df['predicted_'+y_col].value_counts())
print('actual:')
display(df[y_col].value_counts())
display(df[y_col].value_counts(normalize=True))

Features:
['order_dow', 'order_hour_of_day', "('department', 'alcohol')", "('department', 'babies')", "('department', 'bakery')", "('department', 'beverages')", "('department', 'breakfast')", "('department', 'bulk')", "('department', 'canned goods')", "('department', 'dairy eggs')", "('department', 'deli')", "('department', 'dry goods pasta')", "('department', 'frozen')", "('department', 'household')", "('department', 'international')", "('department', 'meat seafood')", "('department', 'missing')", "('department', 'other')", "('department', 'pantry')", "('department', 'personal care')", "('department', 'pets')", "('department', 'produce')", "('department', 'snacks')"]


Unnamed: 0,predicted_13176,21137
0,0,0
1,1,0
2,0,0
3,0,0
4,1,0
...,...,...
131204,0,0
131205,1,1
131206,0,0
131207,1,1


predicted:


0    95288
1    35921
Name: predicted_21137, dtype: int64

actual:


0    120315
1     10894
Name: 21137, dtype: int64

0    0.916972
1    0.083028
Name: 21137, dtype: float64

In [28]:
# Use score method to get accuracy of model
score = logisticRegr.score(df[feature_cols], df[y_col])
print(score)

cm = metrics.confusion_matrix(df[y_col], df['predicted_'+y_col])
print(cm)
#                       Predicted not-y,0   Predicted y,1
# True not-y,0
# True y,1

0.7539116981304637
[[91657 28658]
 [ 3631  7263]]


In [29]:
coefs = pd.DataFrame(list(zip(logisticRegr.coef_[0], feature_cols)),columns=['coef','feature'])
display(coefs.sort_values('coef'))

# coef_dict = {}
# for coef, feat in zip(logisticRegr.coef_,feature_cols):
#     coef_dict[feat] = coef
# display(coef_dict)

Unnamed: 0,coef,feature
2,-0.498647,"('department', 'alcohol')"
8,-0.241914,"('department', 'canned goods')"
14,-0.206668,"('department', 'international')"
13,-0.150525,"('department', 'household')"
18,-0.114095,"('department', 'pantry')"
17,-0.102732,"('department', 'other')"
15,-0.100236,"('department', 'meat seafood')"
20,-0.090289,"('department', 'pets')"
5,-0.061589,"('department', 'beverages')"
19,-0.052931,"('department', 'personal care')"


In [32]:
# export big organic strawberries model to pickle file
filename = 'saved_models/model_21137.pkl'
pickle.dump(logisticRegr, open(filename, 'wb'))

#check export worked
loaded_model = pickle.load(open(filename, 'rb'))
print(loaded_model.summary2())

AttributeError: 'LogisticRegression' object has no attribute 'summary2'

In [21]:
# simple model for testing REST API

# all parameters not specified are set to their defaults
# feature_cols = ['order_number','order_dow','order_hour_of_day','days_since_prior_order']
# feature_cols += [colname for colname in df.columns.values if 'department' in colname] # do not include departments

feature_cols = ["days_since_prior_order","('department', 'alcohol')","('department', 'bulk')"]
print('Features:')
print(feature_cols)

logisticRegr = LogisticRegression(penalty='l1',solver='liblinear',class_weight='balanced')
logisticRegr.fit(df[feature_cols],df['any_organic'])

Features:
['days_since_prior_order', "('department', 'alcohol')", "('department', 'bulk')"]


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
# export model to pickle file
filename = 'saved_models/model_3predictors.pkl'
pickle.dump(logisticRegr, open(filename, 'wb'))

#check export worked
loaded_model = pickle.load(open(filename, 'rb'))
score = loaded_model.score(df[feature_cols], df['any_organic'])
print(score)
# see predictions work on a single set of numbers
input_vector = np.array([7,2,0])
print(loaded_model.predict(input_vector.reshape(1,-1)))
print(loaded_model.predict_proba(input_vector.reshape(1,-1)))
print(loaded_model.classes_)

0.7133199704288578
[False]
[[0.5391995 0.4608005]]
[False  True]


In [28]:
df[feature_cols].head(1)

Unnamed: 0,days_since_prior_order,"('department', 'alcohol')","('department', 'bulk')"
0,14.0,0,0


In [30]:
if not loaded_model.predict(input_vector.reshape(1,-1)):
    print('hello')

hello
