https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
# read csv from github directly
#url_dataset = 'https://raw.githubusercontent.com/madmashup/targeted-marketing-predictive-engine/master/banking.csv'
#df = pd.read_csv(url_dataset, nrows=50000)
#data = pd.read_csv(url_dataset)

# read csv from github directly
url_dataset = 'https://github.com/jjschueder/7331DataMiningNotebooks/blob/master/Live%20Assignment%201/df1hotmerge2.csv?raw=true'
#df = pd.read_csv(url_dataset, nrows=50000)
data = pd.read_csv(url_dataset, nrows=30000)

In [3]:
print(data.shape)
print(list(data.columns))

(30000, 68)
['Unnamed: 0', 'pack', 'bottle_volume_ml', 'state_bottle_cost', 'state_bottle_retail', 'bottles_sold', 'sale_dollars', 'volume_sold_liters', 'volume_sold_gallons', 'counter', 'liquor_category', 'store_parent', 'month', 'year', 'monthyear', 'liquor_category_AMARETTO', 'liquor_category_BRANDY', 'liquor_category_GIN', 'liquor_category_LIQUEUR', 'liquor_category_Other', 'liquor_category_RUM', 'liquor_category_SCHNAPPS', 'liquor_category_TEQUILA', 'liquor_category_VODKA', 'liquor_category_WHISKY', 'store_parent_CVS', 'store_parent_Caseys', 'store_parent_Hy-Vee', 'store_parent_Kum&Go', 'store_parent_Other', 'store_parent_QuikTrip', 'store_parent_SamsClub', 'store_parent_SmokingJoes', 'store_parent_Target', 'store_parent_Wal-Mart', 'store_parent_Walgreens', 'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan', 'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov', 'month_Oct', 'month_Sep', 'year_2019', 'monthyear_Apr-2019', 'monthyear_Aug-2019', 'monthyear_Dec

In [4]:
data = data.drop(columns = ['Unnamed: 0'])

In [5]:
count_not_whiskey = len(data[data['liquor_category_WHISKY']==0])
count_whiskey = len(data[data['liquor_category_WHISKY']==1])
pct_of_no_whiskey = count_not_whiskey/(count_not_whiskey+count_whiskey)
print("percentage of not whiskey is", pct_of_no_whiskey*100)
pct_of_whiskey = count_whiskey/(count_not_whiskey+count_whiskey)
print("percentage of whiskey", pct_of_whiskey*100)

percentage of not whiskey is 78.21333333333334
percentage of whiskey 21.786666666666665


In [6]:
cat_vars=['counter', 'liquor_category', 'store_parent',
 'month', 'year', 'monthyear', 'liquor_category_AMARETTO', 'liquor_category_BRANDY', 'liquor_category_GIN', 
 'liquor_category_LIQUEUR', 'liquor_category_Other', 'liquor_category_RUM', 'liquor_category_SCHNAPPS', 
 'liquor_category_TEQUILA', 'liquor_category_VODKA', 'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb',
 'month_Jan', 'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov', 'month_Oct', 'month_Sep']
data_vars=data.columns.values.tolist()
to_keep=[i for i in data_vars if i not in cat_vars]

In [7]:
to_keep

['pack',
 'bottle_volume_ml',
 'state_bottle_cost',
 'state_bottle_retail',
 'bottles_sold',
 'sale_dollars',
 'volume_sold_liters',
 'volume_sold_gallons',
 'liquor_category_WHISKY',
 'store_parent_CVS',
 'store_parent_Caseys',
 'store_parent_Hy-Vee',
 'store_parent_Kum&Go',
 'store_parent_Other',
 'store_parent_QuikTrip',
 'store_parent_SamsClub',
 'store_parent_SmokingJoes',
 'store_parent_Target',
 'store_parent_Wal-Mart',
 'store_parent_Walgreens',
 'year_2019',
 'monthyear_Apr-2019',
 'monthyear_Aug-2019',
 'monthyear_Dec-2019',
 'monthyear_Feb-2019',
 'monthyear_Jan-2019',
 'monthyear_Jul-2019',
 'monthyear_Jun-2019',
 'monthyear_Mar-2019',
 'monthyear_May-2019',
 'monthyear_Nov-2019',
 'monthyear_Oct-2019',
 'monthyear_Sep-2019',
 'sale_dollars_trans',
 'cost_per_liter',
 'cost_per_liter_trans',
 'state_bottle_cost_trans',
 'bottles_sold_trans',
 'volume_sold_liters_trans',
 'grossmargin']

In [8]:
data_final=data[to_keep]
data_final.columns.values

array(['pack', 'bottle_volume_ml', 'state_bottle_cost',
       'state_bottle_retail', 'bottles_sold', 'sale_dollars',
       'volume_sold_liters', 'volume_sold_gallons',
       'liquor_category_WHISKY', 'store_parent_CVS',
       'store_parent_Caseys', 'store_parent_Hy-Vee',
       'store_parent_Kum&Go', 'store_parent_Other',
       'store_parent_QuikTrip', 'store_parent_SamsClub',
       'store_parent_SmokingJoes', 'store_parent_Target',
       'store_parent_Wal-Mart', 'store_parent_Walgreens', 'year_2019',
       'monthyear_Apr-2019', 'monthyear_Aug-2019', 'monthyear_Dec-2019',
       'monthyear_Feb-2019', 'monthyear_Jan-2019', 'monthyear_Jul-2019',
       'monthyear_Jun-2019', 'monthyear_Mar-2019', 'monthyear_May-2019',
       'monthyear_Nov-2019', 'monthyear_Oct-2019', 'monthyear_Sep-2019',
       'sale_dollars_trans', 'cost_per_liter', 'cost_per_liter_trans',
       'state_bottle_cost_trans', 'bottles_sold_trans',
       'volume_sold_liters_trans', 'grossmargin'], dtype=object)

In [9]:
X = data_final.loc[:, data_final.columns != 'liquor_category_WHISKY']
y = data_final.loc[:, data_final.columns == 'liquor_category_WHISKY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

In [10]:
X

Unnamed: 0,pack,bottle_volume_ml,state_bottle_cost,state_bottle_retail,bottles_sold,sale_dollars,volume_sold_liters,volume_sold_gallons,store_parent_CVS,store_parent_Caseys,...,monthyear_Nov-2019,monthyear_Oct-2019,monthyear_Sep-2019,sale_dollars_trans,cost_per_liter,cost_per_liter_trans,state_bottle_cost_trans,bottles_sold_trans,volume_sold_liters_trans,grossmargin
0,20,375,3.85,5.78,20,115.60,7.50,1.98,0,0,...,1,0,0,4.750136,15.413333,2.735233,1.348073,2.995732,2.014903,0.333910
1,8,50,8.75,13.13,1,13.13,0.05,0.01,0,0,...,1,0,0,2.574900,262.600000,5.570632,2.169054,0.000000,-2.995732,0.333587
2,12,1000,16.50,24.75,6,148.50,6.00,1.58,0,0,...,0,0,0,5.000585,24.750000,3.208825,2.803360,1.791759,1.791759,0.333333
3,6,750,21.17,31.76,24,762.24,18.00,4.75,0,0,...,1,0,0,6.636261,42.346667,3.745890,3.052585,3.178054,2.890372,0.333438
4,6,1750,9.31,13.97,12,167.64,21.00,5.54,0,0,...,1,0,0,5.121819,7.982857,2.077296,2.231089,2.484907,3.044522,0.333572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,24,375,1.93,2.90,24,69.60,9.00,2.37,0,0,...,0,0,0,4.242765,7.733333,2.045540,0.657520,3.178054,2.197225,0.334483
29996,24,375,1.93,2.90,24,69.60,9.00,2.37,0,1,...,0,0,0,4.242765,7.733333,2.045540,0.657520,3.178054,2.197225,0.334483
29997,12,750,5.38,8.07,12,96.84,9.00,2.37,0,0,...,0,0,0,4.573060,10.760000,2.375836,1.682688,2.484907,2.197225,0.333333
29998,12,750,3.57,5.36,12,64.32,9.00,2.37,0,0,...,0,0,0,4.163871,7.146667,1.966646,1.272566,2.484907,2.197225,0.333955


In [11]:
#possibility to scale it?
from sklearn import preprocessing
X_train2 = preprocessing.scale(X_train)
X_test2 = preprocessing.scale(X_test)

# all parameters not specified are set to their defaults
#https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn/
from sklearn.svm import SVC
#svclassifier = SVC(kernel='poly', degree=8)
#svclassifier = SVC(kernel='rbf')
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train2, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
# Returns a NumPy Array
predictions = svclassifier.predict(X_test)

ROC Curves and AUC in Python
https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/

Precision-Recall Curves in Python

When to Use ROC vs. Precision-Recall Curves?
Generally, the use of ROC curves and precision-recall curves are as follows:

ROC curves should be used when there are roughly equal numbers of observations for each class.
Precision-Recall curves should be used when there is a moderate to large class imbalance.

In [15]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[6876  185]
 [1863   76]]
              precision    recall  f1-score   support

           0       0.79      0.97      0.87      7061
           1       0.29      0.04      0.07      1939

    accuracy                           0.77      9000
   macro avg       0.54      0.51      0.47      9000
weighted avg       0.68      0.77      0.70      9000

