# Introduction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the raw dataset; the 'week' column is parsed as datetimes.
df = pd.read_csv("./data_raw.csv", parse_dates=['week'])
# Preview the first rows.
df.head()

In [None]:
# Create a pie chart of featured products on the main page
featured_count = df['feat_main_page'].value_counts()

# value_counts() orders its result by frequency, not by value, so each slice's
# label and colour are looked up from the index value instead of assuming that
# "Not Featured" always comes first.
slice_labels = {False: 'Not Featured', True: 'Featured'}
slice_colors = {False: 'lightcoral', True: 'lightskyblue'}

plt.figure(figsize=(6, 6))
plt.pie(featured_count,
        labels=[slice_labels[bool(flag)] for flag in featured_count.index],
        autopct='%1.1f%%',
        colors=[slice_colors[bool(flag)] for flag in featured_count.index])
plt.title('Featured Products on Main Page')
plt.show()


In [None]:
# Tally how often each functionality category appears
functionality_counts = df['functionality'].value_counts()
colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightsalmon']

# Draw the tallies as a bar plot through the explicit Axes interface
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(functionality_counts.index, functionality_counts.values, color=colors)
ax.set_xlabel('Functionality Features')
ax.set_ylabel('Count')
ax.set_title('Functionality Features Distribution')
# Slant the category names so long labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Create a cross-tabulation of feat_main_page vs functionality
# (rows: feat_main_page values, columns: functionality categories, cells: row counts)
cross_tab = pd.crosstab(df['feat_main_page'], df['functionality'])

# Create a stacked bar chart (stacked=True): one bar per feat_main_page value,
# split into one segment per functionality category
cross_tab.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='Set3')

# Set labels and title
plt.xlabel('Featured on Main Page')
plt.ylabel('Count')
plt.title('Relation between "feat_main_page" and "functionality feature"')

# Adjust the legend: anchor it outside the axes, to the right of the plot
plt.legend(title='Functionality', loc='upper left', bbox_to_anchor=(1, 1))

# Show the chart
plt.show()


In [None]:
# Keep only the rows that were featured on the main page
featured_mask = df['feat_main_page'] == True
filtered_df = df.loc[featured_mask]

# Tally the functionality categories among the featured rows
functionality_counts = filtered_df['functionality'].value_counts()

# Draw the tallies as a bar chart
plt.figure(figsize=(10, 6))
sns.barplot(
    x=functionality_counts.index,
    y=functionality_counts.values,
    palette='Set2',
)

# Axis labels and title
plt.xlabel('Functionality')
plt.ylabel('Count')
plt.title('Functionality Features on "feat_main_page"')

# Slant the category names so long labels stay readable
plt.xticks(rotation=45, ha='right')

# Render the figure
plt.show()


In [None]:
# Aggregate sales per calendar week without modifying `df`.
# set_index() returns a new frame here, so 'week' remains a regular column of
# `df` for the later cells that read df['week'], the row labels of `df` stay
# as they were, and re-running this cell gives the same result.
weekly_sales = df.set_index('week')['weekly_sales'].resample('W').sum()

# Plotting the sales trend using line plot
weekly_sales.plot(kind='line', marker='o', figsize=(10, 6))

# Set labels and title
plt.xlabel('Week')
plt.ylabel('Sales')
plt.title('Weekly Sales Trend')

# Show the chart
plt.show()

In [None]:
# df['week'][0]

# Data Pre-Processing
It is crucial to process the raw data in order to extract as much predictive power as possible from the features available in the data.

### Dealing with missing data

In [None]:
# For each column, report whether it contains at least one missing value.
df.isna().any()

In [None]:
# Show the rows whose 'color' value is missing.
df[df['color'].isnull()]

There are 4 SKUs with missing values for the feature `color`.

In [None]:
# Color distribution for SKU 44; dropna=False counts missing values as their own entry.
df[df.sku == 44]['color'].value_counts(dropna=False)

Method 1: Delete rows with missing values

In [None]:
# Preview the frame with incomplete rows removed. The result is only displayed,
# not assigned, so `df` itself is unchanged.
df.dropna()

This is the fastest method, but it reduces the size of the dataset

Method 2: Imputation libraries

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Imputer that replaces NaN entries with the most frequent value of each column
# seen during fit().
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
# For each SKU with missing data, we first fit the imputer and then apply it to the
# rows that contain missing data:
# (this cell fits on the rows of SKU 44 only, using the 'sku' and 'color' columns)
imputer.fit(df[df.sku==44][['sku', 'color']])

In [None]:
# Rows of SKU 43 whose 'color' is missing.
df[(df.sku == 43) & (df.color.isna())]

In [None]:
# Preview the imputed ['sku', 'color'] values for the SKU 44 rows with a missing color.
imputer.transform(df[(df.sku == 44) & (df.color.isna())][['sku', 'color']])

In [None]:
# Index labels of the SKU 44 rows with a missing color.
df[(df.color.isna()) & (df.sku==44)].index.values

In [None]:
# SKUs that have rows with a missing 'color'
sku_number = [9, 42, 43, 44]
for sku in sku_number:
    sku_rows = df.sku == sku
    missing_rows = sku_rows & df.color.isna()
    if not missing_rows.any():
        continue  # nothing to impute for this SKU

    # Refit on this SKU's own rows so the fill value is the SKU's own most
    # frequent color, rather than that of whichever SKU the imputer happened
    # to be fitted on before.
    imputer.fit(df.loc[sku_rows, ['sku', 'color']])
    filled = imputer.transform(df.loc[missing_rows, ['sku', 'color']])

    # Assigning through a boolean mask targets exactly the selected rows, so
    # it stays correct even when index labels are not unique. Column 1 of the
    # transformed array is 'color'.
    df.loc[missing_rows, 'color'] = filled[:, 1]

In [None]:
# Sanity check after imputation: sum the numeric columns over the rows that
# still have a missing color.
df[df['color'].isnull()].sum(numeric_only=True)

### Test for outlier

In [None]:
def check_outlier(df, features, k=5):
    """Flag per-SKU outliers for the given feature columns.

    A value is flagged when it lies more than ``k`` standard deviations above
    or below the mean of its own SKU group.

    Parameters
    ----------
    df : DataFrame with a 'sku' column and every column named in ``features``.
    features : iterable of column names to test.
    k : width of the accepted band, in standard deviations (default 5).

    Returns
    -------
    A copy of ``df`` with one extra boolean column ``'outlier_<feature>'`` per
    tested feature; the input frame is left unmodified.
    """
    def _outside_band(values):
        # Band limits are computed from the values of a single SKU group.
        centre = values.mean()
        spread = values.std()
        too_high = values > (centre + k * spread)
        too_low = values < (centre - k * spread)
        return too_high | too_low

    flagged = df.copy()
    for name in features:
        per_sku = flagged.groupby('sku')[name]
        flagged['outlier_' + name] = per_sku.transform(_outside_band)
    return flagged

In [None]:
# Add the boolean columns 'outlier_price' and 'outlier_weekly_sales' (k=5), then
# show the rows flagged as price outliers.
df = check_outlier(df, ['price', 'weekly_sales'], 5)
df[df.outlier_price]

In [None]:
# Rows flagged as weekly-sales outliers.
df[df.outlier_weekly_sales]

In [None]:
# Box plot of weekly sales across all rows.
df['weekly_sales'].plot.box()

In [None]:
# Box plot of price across all rows.
df['price'].plot.box()

### Time Effect
Generally, there are four types of time series components: trend,
seasonal variations, cyclical fluctuations, and irregular variations

In the context of retail demand prediction, it is common to focus only on the two variables: Trend and Seasonality


In [None]:
# Preview the frame before adding the time features.
df.head()

In [None]:
# Both features need 'week' to be a datetime column of df (not the index).
# trend: calendar year offset from 2016.
df['trend'] = df['week'].dt.year - 2016
# month: calendar month number, used as the seasonality feature.
df['month'] = df['week'].dt.month

In [None]:
# Distinct trend values present in the data.
df['trend'].unique()

In [None]:
# One-hot encode 'month'; drop_first=True omits the first category's indicator column.
df = pd.get_dummies(data=df, columns=['month'], drop_first=True)
df.head()

In [None]:
# Display the frame after one-hot encoding the month.
df

### Price and lag-Prices

In [None]:
# To create lag-price we use shift function
# shift() works on the current row order inside each SKU group.
# NOTE(review): this assumes the rows of each SKU are already in chronological
# order — confirm against the raw data.
df['price-1'] = df.groupby(['sku'])['price'].shift(1)
df['price-2'] = df.groupby(['sku'])['price'].shift(2)
# The first rows of each SKU have no lagged price; drop them (modifies df in place).
df.dropna(subset=['price-1', 'price-2'], inplace=True)
df.head()

In [None]:
## Put lag-prices next to the price column

# Move 'price' to column position 3 and remember that position for the lag columns.
col = df.pop('price') #pop removes the column from df and returns it
df.insert(3, col.name, col) #insert a column at a specific position
pos_price=df.columns.get_loc('price') #get position of column


In [None]:
#p-1: move 'price-1' to the position right after 'price'
col = df.pop('price-1')
df.insert(pos_price+1, col.name, col)

In [None]:
#p-2: move 'price-2' to the position two after 'price'
col = df.pop('price-2')
df.insert(pos_price+2, col.name, col)


In [None]:

# Preview the frame after the price columns have been reordered.
df.head()

### Feature on main page
the company can decide to boost the visibility of specific
products by featuring them on the main website’s homepage (typically for 1 full
week).

we cannot directly use such variables with sklearn. Thus,
we decide to make this variable numerical by assigning a value of 1 to the
SKU-week pairs that are featured on the main page and 0 to others:

In [None]:
# Cast the flag to integers so it can be used as a numeric feature.
df['feat_main_page'] = df.feat_main_page.astype('int')

### Item descriptive feature

In [None]:
# One-hot encode the descriptive item features; drop_first=True omits the
# first category's indicator column of each feature.
df = pd.get_dummies(data=df, 
                    columns=['functionality', 'color', 'vendor'],
                   drop_first=True)
df.head()

### Scaling

When dealing with features that have different ranges of values, it can often be desirable to scale (or normalize) the features in the dataset, so they all lie in a similar range.

To write Math in markdown
http://www.evanlray.com/stat242_f2019/resources/R/MathinRmd.html

Standard scaling will scale a feature x to a normalized version z with mean 0 and standard deviation 1.
$z=\frac{x - \mu}{\sigma}$

```
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data)
scaler.transform(data)
```

Min Max scaling will scale a feature x to a normalized version z that takes values between 0 and 1.

$z=\frac{x - min(x)}{max(x) - min(x)}$

where the minimum and maximum functions can either be taken for each SKU
separately or jointly for all the SKUs. This scaling can be performed using the following code:

```
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(data)
scaler.transform(data)
```

### Sorting and exporting the dataset

In [None]:
# Order the rows by SKU, then by week within each SKU.
df = df.sort_values(by=['sku', 'week'])
# Export step (left disabled). NOTE(review): `sales` is not defined in the
# visible cells — presumably `df` was intended. The next section reads
# data_processed.csv, so confirm where that file is produced.
# sales.to_csv('data_processed.csv', index=False)

At this stage, we have a fully processed dataset, and we are ready to proceed with the step of predicting demand.

# Demand Prediction Methods

In [None]:
import pandas as pd
import numpy as np
# Load the processed dataset; 'week' is read without date parsing here.
df=pd.read_csv("./data_processed.csv")
df   

In [None]:
# Print every column except the date, the target and the SKU identifier
non_feature_columns = ['week', 'weekly_sales','sku']
for column_name in df.columns:
    if column_name in non_feature_columns:
        continue
    print(column_name)

In [None]:
# Basic linear regression for one sku
# Rows of SKU 11 in week order, so that a later unshuffled split is chronological.
data = df[df.sku==11].sort_values(by=['week'])
# Feature columns: everything except the date, the target and the SKU identifier.
colnames = [i for i in data.columns if i not in
            ['week','weekly_sales','sku']]

In [None]:
# Feature matrix and target vector for the single-SKU model.
X_primer = data[colnames]
y_primer = data.weekly_sales

We need to perform a time-based split to create a training set and a test set.
* The training set contains the data from November 2016 to February 2018 (i.e., 68 weeks, 70% of the data).
* The test set contains the data from March 2018 to September 2018 (i.e., 30 weeks, 30% of the data).

In [None]:
from sklearn.model_selection import train_test_split

# Unshuffled hold-out: the leading 70% of the rows become the training set and
# the remainder the test set. Splitting X and y in a single call keeps both
# partitions aligned.
X_train, X_test, y_train, y_test = train_test_split(
    X_primer,
    y_primer,
    shuffle=False,
    train_size=0.70,
)

print(X_test.shape, y_test.shape)

In [None]:
from statsmodels.regression.linear_model import OLS

# Fit ordinary least squares on the training rows, then predict the test rows
model = OLS(y_train, X_train).fit()
y_pred = [prediction for prediction in model.predict(X_test)]

In [None]:
## Evaluate our model
from sklearn.metrics import r2_score, mean_squared_error

# Out-of-sample scores on the test rows, rounded to three decimals
y_pred_array = np.array(y_pred)
for label, metric in (('OOS R2:', r2_score), ('OOS MSE:', mean_squared_error)):
    print(label, round(metric(y_test, y_pred_array), 3))

In [None]:
# Display the fitted model's summary report.
model.summary()

### Considering all 44 sku

In [None]:
skuSet = list(df.sku.unique())

# removing dates, target variable, sku number
excluded_columns = ['week', 'weekly_sales','sku']
colnames = [name for name in df.columns if name not in excluded_columns]

# One entry per SKU: feature matrix 'X' and target vector 'y' as NumPy arrays
skuData = {
    sku: {
        'X': df.loc[df.sku == sku, colnames].values,
        'y': df.loc[df.sku == sku, 'weekly_sales'].values,
    }
    for sku in skuSet
}

$ \text{weekly_sales}_i = \beta_{\text{price},i} \cdot X_{\text{price},i} + \beta_{\text{price-1},i} \cdot X_{\text{price-1},i} + \ldots + \beta_{\text{vendor10},i} \cdot X_{\text{vendor10},i} + \varepsilon_i $


In [None]:
X_dict = {}
y_dict = {}

y_test = []
y_train = []

# For every SKU the first 68 rows are the training part, the rest the test part
for sku in skuSet:
    features = skuData[sku]['X']
    target = skuData[sku]['y']

    X_dict[sku] = {'train': features[:68], 'test': features[68:]}
    y_dict[sku] = {'train': target[:68], 'test': target[68:]}

    # pooled targets, concatenated in skuSet order
    y_test.extend(y_dict[sku]['test'])
    y_train.extend(y_dict[sku]['train'])

## Centralized Approach: Training a single linear regression model

In [None]:
# Stack the per-SKU matrices row-wise, in skuSet order (the same order in which
# the pooled y_train / y_test lists are built). A single concatenate call per
# matrix avoids re-copying the accumulated array once for every SKU.
X_cen_train = np.concatenate([X_dict[i]['train'] for i in skuSet], axis=0)
X_cen_test = np.concatenate([X_dict[i]['test'] for i in skuSet], axis=0)

In [None]:
from sklearn.linear_model import LinearRegression

# One pooled regression fitted on the stacked training rows of all SKUs
model_cen = LinearRegression().fit(X_cen_train, y_train)

# Predict the stacked test rows once and reuse the result for both metrics
y_pred_cen = model_cen.predict(X_cen_test)
print('OOS R2:', round(r2_score(y_test, y_pred_cen),3))
print('OOS MSE:',round(mean_squared_error(y_test, y_pred_cen),3))

## Decentralized Approach: estimating a separate linear regression model for each SKU

In [None]:
import time
tzero = time.time()

y_pred = []
skumodels = {}

for sku in skuSet:
    # one OLS model per SKU, fitted on that SKU's training rows only
    model_i = OLS(y_dict[sku]['train'], X_dict[sku]['train'], hasconst=False)
    skumodels[sku] = model_i.fit()

    # append this SKU's test predictions, keeping skuSet order
    y_pred.extend(skumodels[sku].predict(X_dict[sku]['test']))

# overall performance metrics on the pooled predictions
y_pred_all = np.array(y_pred)
print('OOS R2:',round(r2_score(y_test, y_pred_all),3))
print('OOS MSE:', round(mean_squared_error(y_test, y_pred_all),3))

t = time.time()-tzero
print('Time to compute:',round(t,3),' sec')

### Feature Selection and Regularization in the Decentralized model

# Tree based method

In [1]:
import numpy as np
import pandas as pd
# Reload the processed dataset for the tree-based models.
df=pd.read_csv('data_processed.csv')

# Preview the first rows.
df.head()

Unnamed: 0,week,sku,weekly_sales,price,price-1,price-2,feat_main_page,trend,month_2,month_3,...,color_white,vendor_2,vendor_3,vendor_4,vendor_5,vendor_6,vendor_7,vendor_8,vendor_9,vendor_10
0,2016-11-14,1,110.0,10.24,9.86,10.16,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2016-11-21,1,127.0,8.27,10.24,9.86,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2016-11-28,1,84.0,8.83,8.27,10.24,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,2016-12-05,1,87.0,8.98,8.83,8.27,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2016-12-12,1,64.0,10.4,8.98,8.83,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [2]:
# NOTE(review): the next cell recomputes skuSet, colnames and skuData in the
# same way, so this cell looks redundant — confirm before removing.
skuSet = list(df.sku.unique())

# removing dates, target variable, sku number
excluded_columns = ['week', 'weekly_sales','sku']
colnames = [name for name in df.columns if name not in excluded_columns]

# One entry per SKU: feature matrix 'X' and target vector 'y' as NumPy arrays
skuData = {}
for sku in skuSet:
    sku_rows = df.loc[df.sku == sku]
    skuData[sku] = {
        'X': sku_rows[colnames].values,
        'y': sku_rows['weekly_sales'].values,
    }

In [3]:
## Structure by SKU ##
# One entry per SKU: feature matrix 'X' and target vector 'y' as NumPy arrays.
skuSet = list(df.sku.unique())
skuData = {}
colnames = [i for i in df.columns if i not in ["week","weekly_sales","sku"] ]
for i in skuSet:
    df_i = df[df.sku == i]
    skuData[i] = {'X': df_i[colnames].values,
                'y': df_i.weekly_sales.values}

## Decentralized Structure ##
X_dict = {}
y_dict = {}

skuModels = {}
y_pred = []
y_test = []
y_train = []

for i in skuSet:
  
    # The first 68 rows of each SKU form the training part, the rest the test part.
    X_train_i,X_test_i = np.split(skuData[i]["X"], [68]) #split for X
    y_train_i,y_test_i = np.split(skuData[i]["y"], [68]) #split for y 

    X_dict[i] = {'train': X_train_i, 'test': X_test_i} #filling dictionary
    y_dict[i] = {'train': y_train_i, 'test': y_test_i}

    y_test += list(y_test_i) #creating the complete testing array
    y_train += list(y_train_i) #creating the complete training array


In [4]:
## Build Subset
# Split each SKU's training part again: the first 48 rows become the
# sub-training set and the remaining training rows the validation set.
X_dict_subsplit = {} 
y_dict_subsplit = {} 

y_validation = [] 
y_subtrain = [] 

for i in skuSet:

    # Here *_train_i holds the sub-training rows and *_test_i the validation rows.
    X_train_i,X_test_i = np.split(X_dict[i]["train"], [48]) #split for X
    y_train_i,y_test_i = np.split(y_dict[i]["train"], [48]) #split for y 

    X_dict_subsplit[i] = {'train': X_train_i, 'test': X_test_i} #filling dictionary
    y_dict_subsplit[i] = {'train': y_train_i, 'test': y_test_i}

    y_validation += list(y_test_i) #creating the complete validation array
    y_subtrain += list(y_train_i) #creating the complete sub-training array
     

#### Decision Tree

In [5]:
import random
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import r2_score

# Candidate values for the random hyper-parameter search
max_features_ = list(range(2,45)) 
max_depth_ = list(range(2,10))
params=[]
# Start below any attainable score: R2 can be negative, and starting from 0
# would leave `params` empty (and make the unpacking further down fail) if no
# candidate reached a positive validation R2.
maximum_score=float('-inf')

#selection of parameters to test (fixed seed => same 50 candidates on every run)
random.seed(5)
mf_ = random.choices(max_features_, k=50)
md_ = random.choices(max_depth_, k=50)

## Selection of the best model
for trial in range (50):
    print('Model number:',trial+1)
    #selection of parameters to test
    mf = mf_[trial]
    md = md_[trial]
    print('  Parameters:',[mf,md])
    #model: one tree per SKU, fitted on the sub-training rows and scored on the
    #validation rows (the SKU loop uses its own variable so it does not shadow
    #the trial counter)
    y_pred = []
    for sku in skuSet:
        model_i = DecisionTreeRegressor(max_features=mf,max_depth=md,random_state=0).fit(X_dict_subsplit[sku]['train'] , y_dict_subsplit[sku]['train'])
        y_pred += list(model_i.predict(X_dict_subsplit[sku]['test']))
    score=r2_score(y_validation, np.array(y_pred))
    #compare performances on validation data
    if score>maximum_score:
        params = [mf,md]
        maximum_score = score

## Test on fresh data: refit with the selected parameters on the full training
## rows and score on the test rows
mf,md = params
y_pred = []
for sku in skuSet:
    model_i = DecisionTreeRegressor(max_features=mf,max_depth=md,random_state=0).fit(X_dict[sku]['train'] , y_dict[sku]['train'])
    y_pred += list(model_i.predict(X_dict[sku]['test']))
oos_r2=r2_score(y_test, np.array(y_pred))

print('\nBest Model:')
print('Parameters:',params)
print('Validation R2:',maximum_score)
print('OOS R2:', oos_r2)

Model number: 1
  Parameters: [28, 4]
Model number: 2
  Parameters: [33, 4]
Model number: 3
  Parameters: [36, 8]
Model number: 4
  Parameters: [42, 5]
Model number: 5
  Parameters: [33, 4]
Model number: 6
  Parameters: [41, 5]
Model number: 7
  Parameters: [3, 7]
Model number: 8
  Parameters: [22, 2]
Model number: 9
  Parameters: [42, 9]
Model number: 10
  Parameters: [29, 2]
Model number: 11
  Parameters: [40, 7]
Model number: 12
  Parameters: [6, 8]
Model number: 13
  Parameters: [22, 2]
Model number: 14
  Parameters: [12, 8]
Model number: 15
  Parameters: [25, 4]
Model number: 16
  Parameters: [26, 6]
Model number: 17
  Parameters: [2, 2]
Model number: 18
  Parameters: [11, 2]
Model number: 19
  Parameters: [14, 3]
Model number: 20
  Parameters: [41, 9]
Model number: 21
  Parameters: [34, 3]
Model number: 22
  Parameters: [8, 8]
Model number: 23
  Parameters: [36, 9]
Model number: 24
  Parameters: [7, 9]
Model number: 25
  Parameters: [28, 4]
Model number: 26
  Parameters: [7, 4]
M

In [11]:
# Empty results table with a single 'R2' row; one column is added per model below.
res=pd.DataFrame(index=['R2'])
res

R2


In [12]:
import time 
tZero=time.time()

# One decision tree per SKU with fixed hyper-parameters, fitted on the training
# rows and evaluated on the test rows
y_pred = []
for sku in skuSet:
    model_i = DecisionTreeRegressor(max_features=14, max_depth=4, random_state=0)
    model_i.fit(X_dict[sku]['train'] , y_dict[sku]['train'])
    y_pred.extend(model_i.predict(X_dict[sku]['test']))

oos_r2_dt = r2_score(y_test, np.array(y_pred))
print('OOS R2:',round(oos_r2_dt,3))

t = time.time()-tZero
print("Time to compute:",round(t,3)," sec")
# keep the unrounded score in the results table
res['decentralized-DT']=[oos_r2_dt]

OOS R2: 0.399
Time to compute: 0.025  sec


#### Random Forest

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Candidate values for the random hyper-parameter search
max_features_ = list(range(2,45)) 
max_depth_ = list(range(2,10))
params=[]
# Start below any attainable score: R2 can be negative, and starting from 0
# would leave `params` empty (and make the unpacking further down fail) if no
# candidate reached a positive validation R2.
maximum_score=float('-inf')

#selection of parameters to test (fixed seed => same 50 candidates on every run)
random.seed(5)
mf_ = random.choices(max_features_, k=50)
md_ = random.choices(max_depth_, k=50)

## Iterations to select best model
for trial in range (50):
    print('Model number:',trial+1)
    #selection of parameters to test
    mf = mf_[trial]
    md = md_[trial]
    print('  Parameters:',[mf,md])
    #model: one forest per SKU, fitted on the sub-training rows and scored on
    #the validation rows (the SKU loop uses its own variable so it does not
    #shadow the trial counter)
    y_pred = []
    for sku in skuSet:
        model_i = RandomForestRegressor(max_features=mf, max_depth=md, random_state=42).fit(X_dict_subsplit[sku]['train'] , y_dict_subsplit[sku]['train'])
        y_pred += list(model_i.predict(X_dict_subsplit[sku]['test']))
    score=r2_score(y_validation, np.array(y_pred))
    #compare performances on validation data
    if score > maximum_score:
        params = [mf,md]
        maximum_score = score

## Test on fresh data: refit with the selected parameters on the full training
## rows and score on the test rows
mf,md=params
y_pred = []

for sku in skuSet:
    model_i = RandomForestRegressor(max_features=mf, max_depth=md, random_state=42).fit(X_dict[sku]['train'] , y_dict[sku]['train'])
    y_pred += list(model_i.predict(X_dict[sku]['test']))

oos_r2=r2_score(y_test, np.array(y_pred))

print('\nBest Model:')
print('Parameters:',params)
print('Validation R2:',maximum_score)
print('OOS R2:', oos_r2)

Model number: 1
  Parameters: [28, 4]
Model number: 2
  Parameters: [33, 4]
Model number: 3
  Parameters: [36, 8]
Model number: 4
  Parameters: [42, 5]
Model number: 5
  Parameters: [33, 4]
Model number: 6
  Parameters: [41, 5]
Model number: 7
  Parameters: [3, 7]
Model number: 8
  Parameters: [22, 2]
Model number: 9
  Parameters: [42, 9]
Model number: 10
  Parameters: [29, 2]
Model number: 11
  Parameters: [40, 7]
Model number: 12
  Parameters: [6, 8]
Model number: 13
  Parameters: [22, 2]
Model number: 14
  Parameters: [12, 8]
Model number: 15
  Parameters: [25, 4]
Model number: 16
  Parameters: [26, 6]
Model number: 17
  Parameters: [2, 2]
Model number: 18
  Parameters: [11, 2]
Model number: 19
  Parameters: [14, 3]
Model number: 20
  Parameters: [41, 9]
Model number: 21
  Parameters: [34, 3]
Model number: 22
  Parameters: [8, 8]
Model number: 23
  Parameters: [36, 9]
Model number: 24
  Parameters: [7, 9]
Model number: 25
  Parameters: [28, 4]
Model number: 26
  Parameters: [7, 4]
M

In [13]:

tZero=time.time()

# One random forest per SKU with fixed hyper-parameters, fitted on the training
# rows and evaluated on the test rows.
# NOTE(review): random_state is 0 here while the search cell uses 42 — confirm
# this difference is intended.
y_pred = []
for sku in skuSet:
    model_i = RandomForestRegressor(max_features=44, max_depth=8, random_state=0)
    model_i.fit(X_dict[sku]['train'] , y_dict[sku]['train'])
    y_pred.extend(model_i.predict(X_dict[sku]['test']))

oos_r2_rf = r2_score(y_test, np.array(y_pred))
print('OOS R2:',round(oos_r2_rf,3))

t = time.time()-tZero
print("Time to compute:",round(t,3)," sec")

# keep the unrounded score in the results table
res['decentralized-RF']=[oos_r2_rf]

OOS R2: 0.559
Time to compute: 5.632  sec


#### Gradient Boosting

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
# Candidate values for the random hyper-parameter search
max_features_ = list(range(2,45)) 
max_depth_ = list(range(2,10))
learning_rate_ = [0.01, 0.05, 0.1, 0.5]
params=[]
# Start below any attainable score: R2 can be negative, and starting from 0
# would leave `params` empty (and make the unpacking further down fail) if no
# candidate reached a positive validation R2.
maximum_score=float('-inf')

#selection of parameters to test (fixed seed => same 50 candidates on every run)
random.seed(5)
mf_ = random.choices(max_features_, k=50)
md_ = random.choices(max_depth_, k=50)
lr_ = random.choices(learning_rate_, k=50)

## Iterations to select best model
for trial in range (50):
    print('Model number:',trial+1)
    
    #selection of parameters to test
    mf = mf_[trial]
    md = md_[trial]
    lr = lr_[trial]
    print('  Parameters:',[mf,md,lr])
    
    #model: one booster per SKU, fitted on the sub-training rows and scored on
    #the validation rows (the SKU loop uses its own variable so it does not
    #shadow the trial counter)
    y_pred = []
    
    for sku in skuSet:
        model_i = GradientBoostingRegressor(max_features=mf,
                                            max_depth=md,
                                            learning_rate=lr,
                                            random_state=0).fit(X_dict_subsplit[sku]['train'] , y_dict_subsplit[sku]['train'])
        y_pred += list(model_i.predict(X_dict_subsplit[sku]['test']))
    score=r2_score(y_validation, np.array(y_pred))
    print('  R2:',score)
 
    #compare performances on validation data
    if score > maximum_score:
        params = [mf,md,lr]
        maximum_score = score

## Test on fresh data: refit with the selected parameters on the full training
## rows and score on the test rows
mf,md,lr=params
y_pred = []

for sku in skuSet:
    model_i = GradientBoostingRegressor(max_features=mf,
                                      max_depth=md,
                                      learning_rate=lr,
                                      random_state=0)
    
    model_i.fit(X_dict[sku]['train'] , y_dict[sku]['train'])
    
    y_pred += list(model_i.predict(X_dict[sku]['test']))

oos_r2=r2_score(y_test, np.array(y_pred))

print('\nBest Model:')
print('Parameters:',params)
print('Validation R2:',maximum_score)
print('OOS R2:', oos_r2) 

Model number: 1
  Parameters: [28, 4, 0.5]
  R2: 0.4166304539569655
Model number: 2
  Parameters: [33, 4, 0.1]
  R2: 0.44220877874153786
Model number: 3
  Parameters: [36, 8, 0.05]
  R2: 0.43610639782904914
Model number: 4
  Parameters: [42, 5, 0.01]
  R2: 0.36258621458042184
Model number: 5
  Parameters: [33, 4, 0.1]
  R2: 0.44220877874153786
Model number: 6
  Parameters: [41, 5, 0.5]
  R2: 0.4618685984700921
Model number: 7
  Parameters: [3, 7, 0.05]
  R2: 0.23275046781581632
Model number: 8
  Parameters: [22, 2, 0.05]
  R2: 0.3554789148712222
Model number: 9
  Parameters: [42, 9, 0.01]
  R2: 0.34681921684993167
Model number: 10
  Parameters: [29, 2, 0.5]
  R2: 0.49737574680748886
Model number: 11
  Parameters: [40, 7, 0.01]
  R2: 0.33912278640721794
Model number: 12
  Parameters: [6, 8, 0.05]
  R2: 0.28029920671548325
Model number: 13
  Parameters: [22, 2, 0.5]
  R2: 0.5663441850269925
Model number: 14
  Parameters: [12, 8, 0.01]
  R2: 0.2619961812033985
Model number: 15
  Parameter

In [18]:
tZero=time.time()

# One gradient-boosting model per SKU with fixed hyper-parameters, fitted on
# the training rows and evaluated on the test rows
y_pred = []
for sku in skuSet:
    model_i = GradientBoostingRegressor(
        max_features=31,
        max_depth=4,
        learning_rate=0.5,
        random_state=0,
    )
    model_i.fit(X_dict[sku]['train'] , y_dict[sku]['train'])
    y_pred.extend(model_i.predict(X_dict[sku]['test']))

oos_r2_gb = r2_score(y_test, np.array(y_pred))
print('OOS R2:',round(oos_r2_gb,3))

t = time.time()-tZero
print("Time to compute:",round(t,3)," sec")

# keep the unrounded score in the results table
res['decentralized-GB']=[oos_r2_gb]

OOS R2: 0.497
Time to compute: 1.877  sec


In [19]:
# Show the out-of-sample R2 collected for the tree-based models.
res

Unnamed: 0,decentralized-DT,decentralized-RF,decentralized-GB
R2,0.398779,0.558817,0.496603


# Clustering

In [20]:
import pandas as pd
# Reload the processed dataset for the clustering section.
df=pd.read_csv("data_processed.csv")
df 

Unnamed: 0,week,sku,weekly_sales,price,price-1,price-2,feat_main_page,trend,month_2,month_3,...,color_white,vendor_2,vendor_3,vendor_4,vendor_5,vendor_6,vendor_7,vendor_8,vendor_9,vendor_10
0,2016-11-14,1,110.0,10.24,9.86,10.16,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2016-11-21,1,127.0,8.27,10.24,9.86,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2016-11-28,1,84.0,8.83,8.27,10.24,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,2016-12-05,1,87.0,8.98,8.83,8.27,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2016-12-12,1,64.0,10.40,8.98,8.83,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4307,2018-08-27,44,20.0,53.99,42.38,43.99,0,2,0,0,...,0,0,0,0,0,1,0,0,0,0
4308,2018-09-03,44,14.0,52.99,53.99,42.38,0,2,0,0,...,0,0,0,0,0,1,0,0,0,0
4309,2018-09-10,44,22.0,44.99,52.99,53.99,1,2,0,0,...,0,0,0,0,0,1,0,0,0,0
4310,2018-09-17,44,28.0,42.99,44.99,52.99,1,2,0,0,...,0,0,0,0,0,1,0,0,0,0


In [21]:

# Start a fresh results table with a single 'R2' row; this replaces the `res`
# table filled in the tree-based section.
res=pd.DataFrame(index=['R2']) #useful to gather results and visualize them
res
     

R2


## K-Means

### Just with sales and prices

In [22]:
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

In [23]:
scaler = MinMaxScaler()