## Import libraries and config Pandas display

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn 
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error, r2_score

In [2]:
pd.options.display.max_columns = 25
pd.options.display.max_rows = 100

## Import datasets

In [3]:
winery = pd.read_csv("../data/Winery_Data_Clean.csv")
winery.head()

Unnamed: 0,CustomerID,OrderID,CustomerSegment,Date,ZipCode,State,Sales2008,Sales2009,Sales2010,SaleAmount,Orders2008,Orders2009,Orders2010,YearAcquired,EmailSubscr,NewsletterSubscr,WinemakerCallSubscr,Division,Region,Channel
0,1,1532,High Roller,2008-07-08,33467,FL,213.0,30903.1,13340.94,44.0,4,8,4,2008,True,True,True,South Atlantic,South,TastingRoom
1,1,14378,High Roller,2008-10-05,33467,FL,213.0,30903.1,13340.94,47.0,4,8,4,2008,True,True,True,South Atlantic,South,TastingRoom
2,1,17690,High Roller,2008-10-26,33467,FL,213.0,30903.1,13340.94,57.0,4,8,4,2008,True,True,True,South Atlantic,South,Newsletter
3,1,19808,High Roller,2008-11-08,33467,FL,213.0,30903.1,13340.94,65.0,4,8,4,2008,True,True,True,South Atlantic,South,TastingRoom
4,1,25406,High Roller,2009-01-02,33467,FL,213.0,30903.1,13340.94,3889.0,4,8,4,2008,True,True,True,South Atlantic,South,TastingRoom


In [4]:
customer = pd.read_csv("../data/Winery_Customer.csv")
customer.head()

Unnamed: 0,CustomerID,OrderVolume,CustomerSegment,ZipCode,State,SaleAmount,YearAcquired,EmailSubscr,NewsletterSubscr,WinemakerCallSubscr,Division,Region
0,1,16,High Roller,33467,FL,44457.05,2008,True,True,True,South Atlantic,South
1,2,9,High Roller,98683,WA,42201.67,2009,False,True,True,Pacific,West
2,3,8,High Roller,90247,CA,28393.5,2005,True,True,True,Pacific,West
3,4,7,Wine Enthusiast,4572,ME,23571.14,2008,True,True,True,New England,Northeast
4,5,5,Casual Visitor,98042,WA,20631.1,2008,False,False,False,Pacific,West


In [10]:
customer.dtypes

CustomerID               int64
OrderVolume              int64
CustomerSegment         object
ZipCode                  int64
State                   object
SaleAmount             float64
YearAcquired             int64
EmailSubscr               bool
NewsletterSubscr          bool
WinemakerCallSubscr       bool
Division                object
Region                  object
dtype: object

## Predictive Models

### Binary Classification

#### Random Forest

In [33]:
y = customer.loc[:, "EmailSubscr"].values
X = customer.loc[:, ["OrderVolume", "CustomerSegment", "Region", "SaleAmount", "NewsletterSubscr", "WinemakerCallSubscr"]]


pandas.core.frame.DataFrame

In [38]:
### TODO: manually define categorical encoding

label_encoder = LabelEncoder()
x_categorical = X.select_dtypes(include=['object', 'bool']).apply(label_encoder.fit_transform)
# x_categorical

x_numerical = X.select_dtypes(exclude=['object', 'bool']).values
# x_numerical

x = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis='columns').values
# print(type(x))

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)

print(len(X_train), len(X_test), len(Y_train), len(Y_test))



17579 4395 17579 4395


In [53]:
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train, Y_train)

predictions = regressor.predict(X_test)
r2 = r2_score(Y_test, predictions)

feature_importances = regressor.feature_importances_

for feature, importance in zip(["OrderVolume", "CustomerSegment", "Region", "SaleAmount", "NewsletterSubscr", "WinemakerCallSubscr"]
                               , feature_importances):
    print(f"{feature}: {importance:.4f}")

accuracy = accuracy_score(Y_test, np.round(predictions))
precision = precision_score(Y_test, np.round(predictions))
recall = recall_score(Y_test, np.round(predictions))
f1score = f1_score(Y_test, np.round(predictions))

print("Acc", accuracy, "Prec", precision, "Rec", recall, "f1", f1score)

OrderVolume: 0.0398
CustomerSegment: 0.0983
Region: 0.0294
SaleAmount: 0.0198
NewsletterSubscr: 0.1271
WinemakerCallSubscr: 0.6856
Acc 0.9342434584755404 Prec 0.8813892529488859 Rec 0.925671025464556 f1 0.9029875797247399


In [49]:
print(Y_test)
print(np.round(predictions))


[False  True False ... False  True  True]
[0. 1. 0. ... 0. 1. 1.]


### Multinomial Classification