In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
pd.set_option('display.max_columns', None)
%matplotlib inline

In [2]:
# Read the combined sales/weather CSV and preview its first five rows.
data_total = pd.read_csv('data_sales_weather.csv')
data_total.head(n=5)

Unnamed: 0,StockCode,Description,Quantity,Price,dt_iso,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,6.95,2009-12-01 07:45:00,2.38,0.96,-0.25,2.5,1013,85,1.5,0,20,801,Clouds,few clouds,02n
1,79323P,PINK CHERRY LIGHTS,12,6.75,2009-12-01 07:45:00,2.38,0.96,-0.25,2.5,1013,85,1.5,0,20,801,Clouds,few clouds,02n
2,79323W,WHITE CHERRY LIGHTS,12,6.75,2009-12-01 07:45:00,2.38,0.96,-0.25,2.5,1013,85,1.5,0,20,801,Clouds,few clouds,02n
3,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2.1,2009-12-01 07:45:00,2.38,0.96,-0.25,2.5,1013,85,1.5,0,20,801,Clouds,few clouds,02n
4,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.25,2009-12-01 07:45:00,2.38,0.96,-0.25,2.5,1013,85,1.5,0,20,801,Clouds,few clouds,02n


In [3]:
# (rows, columns) of the full sales/weather table.
data_total.shape

(948321, 18)

In [4]:
# Number of distinct StockCode values in the full table.
data_total['StockCode'].nunique()

5303

In [5]:
# The StockCode column contains 5303 distinct categories (one per product).
# I need to focus on the most frequently sold products to reduce the number of categories.

In [6]:
#sns.countplot(x='StockCode', data=data_total)

In [7]:
# Keep only the rows whose StockCode occurs in more than 1500 rows of the full table.
counts_total = data_total['StockCode'].value_counts()
frequent_codes = counts_total[counts_total > 1500].index
is_frequent_product = data_total['StockCode'].isin(frequent_codes)
data = data_total.loc[is_frequent_product]

# Report the size of the filtered table, then show how many products remain.
print(data.shape)
data['StockCode'].nunique()

(112107, 18)


54

In [8]:
# After filtering, 54 categories remain (a manageable number) across 112107 sales rows (enough to train a model), so the data can now be processed.

# Let's treat the numerical data for modelling

In [9]:
# Column dtypes of the filtered table; used below to split numeric from object (string) features.
data.dtypes

StockCode               object
Description             object
Quantity                 int64
Price                  float64
dt_iso                  object
temp                   float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
humidity                 int64
wind_speed             float64
wind_deg                 int64
clouds_all               int64
weather_id               int64
weather_main            object
weather_description     object
weather_icon            object
dtype: object

In [10]:
# I'll first build a model whose target is temperature (I will later build one for wind speed and another for weather_description).

In [11]:
# Columns excluded from the feature matrix: the target itself, its min/max
# variants, and the description / timestamp / icon columns.
# NOTE(review): 'feels_like' stays in X while 'temp' is the target. It is
# presumably derived from temperature, which would explain the near-perfect
# scores further down -- confirm whether it should be dropped as well.
dropped_columns = ['temp', 'temp_min', 'temp_max', 'Description', 'dt_iso', 'weather_icon']
X = data.drop(columns=dropped_columns)
y = data['temp']

In [12]:
# Hold out 20% of the rows for testing. The split is seeded so that the scores
# reported below are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
# Separate the numeric columns (to be scaled) from the object columns
# (to be one-hot encoded), for both the train and the test split.
numericalX_train, numericalX_test = (
    frame.select_dtypes(include=np.number) for frame in (X_train, X_test)
)

categoricalX_train, categoricalX_test = (
    frame.select_dtypes(include=object) for frame in (X_train, X_test)
)

In [14]:
# Learn the min/max of each numeric column from the training split only,
# then apply the same scaling to both splits.
transformer = MinMaxScaler()
transformer.fit(numericalX_train)

# transform() returns plain arrays; wrap them back into frames that carry
# the numeric column names.
numeric_columns = numericalX_train.columns
X_train_normalized = pd.DataFrame(transformer.transform(numericalX_train), columns=numeric_columns)
X_test_normalized = pd.DataFrame(transformer.transform(numericalX_test), columns=numeric_columns)

In [15]:
# Fit the one-hot encoder on the training categoricals only. The first level of
# each column is dropped, and categories unseen during fit are ignored at
# transform time.
encoder = OneHotEncoder(drop='first', handle_unknown = 'ignore').fit(categoricalX_train)

# Give the dummy columns string names. Without them the frames get integer
# labels (0, 1, ...), which end up mixed with the string-named numeric columns
# after concatenation; scikit-learn only supports feature names when they are
# all strings.
encoded_columns = encoder.get_feature_names_out()

encoded_categorical_train = pd.DataFrame(
    encoder.transform(categoricalX_train).toarray(), columns=encoded_columns
)
encoded_categorical_test = pd.DataFrame(
    encoder.transform(categoricalX_test).toarray(), columns=encoded_columns
)

In [16]:
# Put the scaled numeric features and the one-hot encoded categoricals side by
# side to form the final design matrices.
train_parts = [X_train_normalized, encoded_categorical_train]
test_parts = [X_test_normalized, encoded_categorical_test]
X_train_treated = pd.concat(train_parts, axis='columns')
X_test_treated = pd.concat(test_parts, axis='columns')

In [17]:
%%time

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
model1 = DecisionTreeRegressor()
from sklearn.linear_model import LinearRegression
model2 = LinearRegression()
from sklearn.neighbors import KNeighborsRegressor
model3 = KNeighborsRegressor()

import numpy as np

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN']
scores = {}
for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_treated, y_train, cv=10))
    scores[model_name] = mean_score
print(scores)





{'Decision Tree Regressor': 0.9999962066152017, 'Linear Regression': 0.9921827355794939, 'KNN': 0.9151846012709557}
CPU times: user 3min 53s, sys: 1min 28s, total: 5min 22s
Wall time: 2min 9s


In [18]:
# Let's also try the random forest regressor before deciding on the best model; it might give more reliable results than the decision tree.

In [21]:
%%time

>>> from sklearn.ensemble import RandomForestRegressor
>>> from sklearn.datasets import make_regression
>>> X_train_treated, y_train = make_regression(n_features=87,
...                        random_state=0, shuffle=False)
>>> regr = RandomForestRegressor(max_depth=10, random_state=0)
>>> regr.fit(X_train_treated, y_train)
y_pred = regr.predict(X_test_treated)
print(regr.score(X_train_treated, y_train))
print(regr.score(X_test_treated, y_test))

0.9148578026798507
-5.004094866717746
CPU times: user 341 ms, sys: 7.67 ms, total: 348 ms
Wall time: 346 ms




In [22]:
%%time

regr = DecisionTreeRegressor(max_depth=10,
                             criterion = 'mse',
                             min_samples_split=2,
                             min_samples_leaf = 1, 
                             max_features = 10)
model = regr.fit(X_train_treated, y_train)
print("test data accuracy was: ",regr.score(X_test_treated, y_test))
print("train data accuracy was: ",regr.score(X_train_treated, y_train))

test data accuracy was:  -9.525940048822461
train data accuracy was:  0.9943557381473264
CPU times: user 13.5 ms, sys: 5.4 ms, total: 18.9 ms
Wall time: 17.3 ms


