In [5]:
# Update sklearn to prevent version mismatches
#!conda install scikit-learn
#!conda update scikit-learn
#!conda install joblib 
#!conda update joblib 

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [7]:
###
features_df = pd.read_csv("../Resources/features.csv")
stores_df = pd.read_csv("../Resources/stores.csv")

###
train_df = pd.read_csv("../Resources/train.csv")
test_df = pd.read_csv("../Resources/test.csv")

###
sampleSubmission_df = pd.read_csv("../Resources/sampleSubmission.csv")

features_df.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [8]:
stores_df.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [9]:
train_df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [10]:
test_df.head()

Unnamed: 0,Store,Dept,Date,IsHoliday
0,1,1,2012-11-02,False
1,1,1,2012-11-09,False
2,1,1,2012-11-16,False
3,1,1,2012-11-23,True
4,1,1,2012-11-30,False


In [11]:
sampleSubmission_df.head()

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,0
1,1_1_2012-11-09,0
2,1_1_2012-11-16,0
3,1_1_2012-11-23,0
4,1_1_2012-11-30,0


In [12]:
###
### convert datatype for column 'Date' from string (default) to datetime
train_df['Date'] =pd.to_datetime(train_df['Date'], format="%Y-%m-%d")
features_df['Date'] =pd.to_datetime(features_df['Date'], format="%Y-%m-%d")
test_df['Date'] = pd.to_datetime(test_df['Date'], format="%Y-%m-%d")


### convert to proper datatype
### convert datatype for column 'Weekly_Sales' from string (default) to float
train_df['Weekly_Sales'] = train_df['Weekly_Sales'].astype(float)
features_df['Temperature'] = features_df['Temperature'].astype(float)
features_df['Fuel_Price'] = features_df['Fuel_Price'].astype(float)
features_df['CPI'] = features_df['CPI'].astype(float)
features_df['Unemployment'] = features_df['Unemployment'].astype(float)
#test_df['Weekly_Sales'] = test_df['Weekly_Sales'].astype(float)



### merge train_df and stores_df, merge test_df and stores_df
combined_train_df = pd.merge(train_df,stores_df,how='left',on='Store')
combined_test_df = pd.merge(test_df,stores_df,how='left',on='Store')


### merge combined_train_df and features_df, merge combined_test_df and features_df
combined_train_df = pd.merge(combined_train_df, features_df, how = "inner", on=["Store","Date",'IsHoliday'])
combined_test_df = pd.merge(combined_test_df, features_df, how = "inner", on=["Store","Date",'IsHoliday'])


### 
combined_train_df.fillna(0,inplace=True)
combined_test_df.fillna(0,inplace=True)

combined_train_df.to_json("../Resources/combined_train_data.json")
combined_test_df.to_json("../Resources/combined_test_data.json")

combined_train_df.head()
#combined_test_df.head()



Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
1,1,2,2010-02-05,50605.27,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
2,1,3,2010-02-05,13740.12,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
3,1,4,2010-02-05,39954.04,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
4,1,5,2010-02-05,32229.38,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106


In [13]:
combined_test_df.head()

Unnamed: 0,Store,Dept,Date,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2012-11-02,False,A,151315,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.462779,6.573
1,1,2,2012-11-02,False,A,151315,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.462779,6.573
2,1,3,2012-11-02,False,A,151315,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.462779,6.573
3,1,4,2012-11-02,False,A,151315,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.462779,6.573
4,1,5,2012-11-02,False,A,151315,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.462779,6.573


In [14]:
combined_train_df.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
count,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0
mean,22.200546,44.260317,15981.258123,136727.915739,60.090059,3.361027,2590.074819,879.974298,468.087665,1083.132268,1662.772385,171.201947,7.960289
std,12.785297,30.492054,22711.183519,60980.583328,18.447931,0.458515,6052.385934,5084.538801,5528.873453,3894.529945,4207.629321,39.159276,1.863296
min,1.0,1.0,-4988.94,34875.0,-2.06,2.472,0.0,-265.76,-29.1,0.0,0.0,126.064,3.879
25%,11.0,18.0,2079.65,93638.0,46.68,2.933,0.0,0.0,0.0,0.0,0.0,132.022667,6.891
50%,22.0,37.0,7612.03,140167.0,62.09,3.452,0.0,0.0,0.0,0.0,0.0,182.31878,7.866
75%,33.0,74.0,20205.8525,202505.0,74.28,3.738,2809.05,2.2,4.54,425.29,2168.04,212.416993,8.572
max,45.0,99.0,693099.36,219622.0,100.14,4.468,88646.76,104519.54,141630.61,67474.85,108519.28,227.232807,14.313


In [15]:
combined_train_df.dtypes

Store                    int64
Dept                     int64
Date            datetime64[ns]
Weekly_Sales           float64
IsHoliday                 bool
Type                    object
Size                     int64
Temperature            float64
Fuel_Price             float64
MarkDown1              float64
MarkDown2              float64
MarkDown3              float64
MarkDown4              float64
MarkDown5              float64
CPI                    float64
Unemployment           float64
dtype: object

In [16]:
combined_train_df['Weekly_Sales'][combined_train_df['Weekly_Sales'] < 0] = 0
combined_train_df['MarkDown2'][combined_train_df['MarkDown2'] < 0] = 0
combined_train_df['MarkDown3'][combined_train_df['MarkDown3'] < 0] = 0

In [12]:
###
### working df

###
df = combined_train_df
# Drop the null columns where all values are null
#df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()
#df.columns


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
1,1,2,2010-02-05,50605.27,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
2,1,3,2010-02-05,13740.12,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
3,1,4,2010-02-05,39954.04,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106
4,1,5,2010-02-05,32229.38,False,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106


In [21]:
#testing = pd.get_dummies(df)
#testing.head()

# Create a Train Test Split

Use `koi_disposition` for the y values

In [63]:
from sklearn.model_selection import train_test_split

#y = df["Weekly_Sales"].values.reshape(-1, 1)
#X = df["Date"].values.reshape(-1, 1)
#X[0][0]

y = df["Weekly_Sales"]
##X[0]
X = df.drop(columns=["Weekly_Sales"])
#y = df["koi_disposition"]
#X = df.drop(columns=["koi_disposition"])
#y
X


#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

Unnamed: 0,Store,Dept,Date,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
1,1,2,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
2,1,3,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
3,1,4,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
4,1,5,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
5,1,6,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
6,1,7,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
7,1,8,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
8,1,9,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106
9,1,10,2010-02-05,False,A,151315,42.31,2.572,0.00,0.00,0.0,0.00,0.00,211.096358,8.106


In [39]:
X_train.head()

NameError: name 'X_train' is not defined

# Pre-processing

Scale the data using the MinMaxScaler

In [9]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


# Train the Model

In [10]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.6649893260140287
Testing Data Score: 0.6514181152790485


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [12]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'penalty': ["l1", "l2"]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [13]:
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.8820301783264746, total=   1.4s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ........ C=1, penalty=l1, score=0.8824336688014639, total=   2.0s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.5s remaining:    0.0s


[CV] ........ C=1, penalty=l1, score=0.8782608695652174, total=   1.5s
[CV] C=1, penalty=l2 .................................................




[CV] ........ C=1, penalty=l2, score=0.6753543667123914, total=   0.8s
[CV] C=1, penalty=l2 .................................................




[CV] ........ C=1, penalty=l2, score=0.6765782250686185, total=   1.2s
[CV] C=1, penalty=l2 .................................................




[CV] ........ C=1, penalty=l2, score=0.6324942791762014, total=   1.0s
[CV] C=5, penalty=l1 .................................................
[CV] ......... C=5, penalty=l1, score=0.877914951989026, total=   1.4s
[CV] C=5, penalty=l1 .................................................




[CV] ........ C=5, penalty=l1, score=0.8847209515096066, total=   2.8s
[CV] C=5, penalty=l1 .................................................




[CV] ........ C=5, penalty=l1, score=0.8814645308924485, total=   3.7s
[CV] C=5, penalty=l2 .................................................




[CV] ......... C=5, penalty=l2, score=0.691358024691358, total=   1.1s
[CV] C=5, penalty=l2 .................................................




[CV] ......... C=5, penalty=l2, score=0.672003659652333, total=   1.1s
[CV] C=5, penalty=l2 .................................................




[CV] ........ C=5, penalty=l2, score=0.6398169336384439, total=   1.1s
[CV] C=10, penalty=l1 ................................................
[CV] ....... C=10, penalty=l1, score=0.8788294467306813, total=   1.4s
[CV] C=10, penalty=l1 ................................................




[CV] ....... C=10, penalty=l1, score=0.8851784080512352, total=   2.8s
[CV] C=10, penalty=l1 ................................................




[CV] ....... C=10, penalty=l1, score=0.8823798627002288, total=   3.7s
[CV] C=10, penalty=l2 ................................................




[CV] ....... C=10, penalty=l2, score=0.6767261088248743, total=   1.1s
[CV] C=10, penalty=l2 ................................................




[CV] ....... C=10, penalty=l2, score=0.6678865507776761, total=   1.9s
[CV] C=10, penalty=l2 ................................................


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   33.4s finished


[CV] ........ C=10, penalty=l2, score=0.631121281464531, total=   1.6s


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [14]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'penalty': 'l1'}
0.8821286977737115


# Save the Model

In [8]:
# save fitted model to file
import joblib
filename = '../output/logistic_for_walmart.sav'
joblib.dump(grid, filename)

NameError: name 'grid' is not defined