In [305]:
import numpy as np
import pandas as pd

import datetime
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


from scipy import stats
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
import statsmodels
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm
from statsmodels.formula.api import ols

import matplotlib.pyplot as plt 
import seaborn as sns
from pandas import Timestamp

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR

import xgboost as xgb
from xgboost import XGBRegressor 
from xgboost.sklearn import XGBClassifier # sklearn’s Grid Search with parallel processing
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from catboost import CatBoostRegressor

# import warnings
# warnings.filterwarnings('ignore')

In [306]:
# Load the sample extract, concatenate the `DemandCustomer` and `SKU10`
# columns into a single `SKU_Customer` column, then discard the two sources.
data = (
    pd.read_csv('DF_File_sample.csv')
    .assign(SKU_Customer=lambda frame: frame['DemandCustomer'] + frame['SKU10'])
    .drop(columns=['DemandCustomer', 'SKU10'])
)

In [25]:
# Display the dtype of every column in the working frame.
# NOTE(review): the captured output lists `category` columns, yet this cell's
# execution count is lower than that of the cell that converts columns to
# Categorical — the output presumably comes from an out-of-order run; re-run on
# a fresh kernel to confirm.
data.dtypes

Brand           category
Year               int64
Quarter            int64
Month_No           int64
Week_No            int64
Sales            float64
W_Nielsen        float64
SKU_Customer    category
dtype: object

In [268]:
# Positional indices (within `data.columns`) of the categorical columns.
# Both `object` and `category` dtypes are matched so that re-running this cell
# after the columns have been converted to pandas Categoricals still finds
# them; matching on `object` alone would then return an empty list.
cat_var = [
    data.columns.get_loc(name)
    for name in data.select_dtypes(include=['object', 'category']).columns
]

In [307]:
# Integer-encode every categorical column.
# One encoder is fitted per column and kept in `label_encoders` (keyed by
# column name) so the codes can be mapped back with `inverse_transform`;
# refitting a single shared encoder would retain only the last column's mapping.
# Columns are assigned by label instead of `data.iloc[:, i] = ...` because
# whole-column `iloc` assignment may write into the existing column in place on
# newer pandas and leave its dtype unchanged.
le = preprocessing.LabelEncoder()  # kept so the name exists even if `cat_var` is empty
label_encoders = {}

for i in cat_var:
    column = data.columns[i]
    le = preprocessing.LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

In [283]:
# Turn the (already encoded) categorical columns into pandas `category` dtype.
# Assignment goes through the column label: setting a whole column with
# `data.iloc[:, c] = ...` may be applied in place on newer pandas, in which case
# the column would silently keep its previous dtype.
for c in cat_var:
    column = data.columns[c]
    data[column] = data[column].astype('category')

In [308]:
# Build a "<Year>-<Week_No>" label for every row ...
data['year_week'] = data['Year'].astype(str).str.cat(data['Week_No'].astype(str), sep='-')

# ... and parse it as an ISO-8601 year/week (%G / %V). The appended "-4" is read
# by %w as weekday 4, i.e. the Thursday of that week.
data['pre_date'] = data['year_week'].map(
    lambda label: datetime.datetime.strptime(f"{label}-4", "%G-%V-%w")
)

In [309]:
# Earliest `pre_date` whose summed Sales is exactly zero.
# The minimum of the matching dates is taken explicitly: ordering the matches
# by value is meaningless when every value is 0, so it gives no guarantee that
# the first entry is the earliest week.
first_null = (
    data.groupby('pre_date').Sales.sum()
    .loc[lambda weekly_sales: weekly_sales == 0]
    .index.min()
)

# Keep only the rows strictly before that week. When no zero-sales week exists
# `first_null` is NaT and the frame is left untouched (instead of raising
# IndexError). `.copy()` detaches the result from the original frame so later
# column edits do not emit SettingWithCopyWarning.
# NOTE(review): presumably the first all-zero week marks where actuals end —
# confirm against the data source.
if pd.notna(first_null):
    data = data[data.pre_date < first_null].copy()

In [279]:
# 26th distinct `pre_date` counting back from the most recent one
# (raises IndexError when fewer than 26 distinct weeks are present).
first_26_week = data['pre_date'].drop_duplicates().sort_values().iloc[-26]

In [290]:
# 34th distinct `pre_date` counting back from the most recent one
# (raises IndexError when fewer than 34 distinct weeks are present).
first_34_week = pd.Series(data['pre_date'].unique()).sort_values().iloc[-34]

In [310]:
# Helper columns that exist only to build the date-based split.
split_helper_cols = ['pre_date', 'year_week']

# Date-based split. `drop(columns=...)` returns a new frame, so nothing is
# mutated through a filtered slice and no SettingWithCopyWarning is raised
# (in-place drops on the slices did trigger it).
# NOTE(review): `train` ends before `first_34_week` while `test` starts at
# `first_26_week`, so the weeks in between belong to neither frame — confirm
# that this gap is intended.
train = data[data['pre_date'] < first_34_week].drop(columns=split_helper_cols)
test = data[data['pre_date'] >= first_26_week].drop(columns=split_helper_cols)

# The helper columns are removed from the full frame as well. This cell can
# therefore not be re-run without rebuilding `pre_date` first.
data = data.drop(columns=split_helper_cols)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [311]:
# Shuffled 67/33 split of the training window: every column except `Sales`
# is a feature, `Sales` is the target. The seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='Sales'),
    train['Sales'],
    test_size=0.33,
    random_state=42,
)

In [293]:
# Second split with the same arguments and seed as the X_train/X_test split;
# the `_c` frames are separate objects that can be scaled column by column.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    train.drop(columns='Sales'),
    train['Sales'],
    test_size=0.33,
    random_state=42,
)

In [312]:
# Candidate feature scalers; only `norm_scaler` is used in this cell.
norm_scaler = preprocessing.StandardScaler()
min_max_scaler = preprocessing.MinMaxScaler()  # [0, 1]
max_abs_scaler = preprocessing.MaxAbsScaler()  # [-1, 1]

# Learn the scaling statistics on the training features only, then apply the
# very same transformation to both splits.
norm_scaler.fit(X_train)
X_train_scaled = norm_scaler.transform(X_train)
X_test_scaled = norm_scaler.transform(X_test)

In [255]:
def scale_by_col(df, scaler):
    """Scale each numeric column of ``df`` separately, in place.

    For every column selected by ``select_dtypes(['number'])`` the column is
    reshaped to a single-feature matrix, passed to ``scaler.fit_transform`` and
    written back under the same name. Other columns are left untouched.

    Note that the scaler is refitted for every column, so after the call it
    only holds the statistics of the last numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; it is modified in place.
    scaler : object
        Any object exposing ``fit_transform(X)`` for a 2-D input.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numeric_columns = df.select_dtypes(['number']).columns
    for column in numeric_columns:
        column_matrix = df[column].values.reshape(-1, 1)
        df[column] = scaler.fit_transform(column_matrix)
    return df

In [294]:
# Standardise the numeric columns of the `_c` split, in place.
# The scaler is fitted on the training columns only and then reused, unchanged,
# on the test columns — the same fit/transform pattern used for
# X_train_scaled / X_test_scaled. `scale_by_col` is not called here because it
# refits the scaler on every call, which would scale the test split with its
# own statistics.
numeric_cols_c = X_train_c.select_dtypes(['number']).columns
if len(numeric_cols_c) > 0:
    X_train_c[numeric_cols_c] = norm_scaler.fit_transform(X_train_c[numeric_cols_c])
    X_test_c[numeric_cols_c] = norm_scaler.transform(X_test_c[numeric_cols_c])

# Plain-ASCII result names (the training name previously contained a
# look-alike non-Latin character, so `X_train_c_scaled` was never defined).
X_train_c_scaled = X_train_c
X_test_c_scaled = X_test_c

In [257]:
# Display the column dtypes of the scaled `_c` training features, to check
# which columns were scaled (the captured output shows the numeric columns as
# float64 and two columns as category).
X_train_c_scaled.dtypes

Brand           category
Year             float64
Quarter          float64
Month_No         float64
Week_No          float64
W_Nielsen        float64
SKU_Customer    category
dtype: object

In [258]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute error as a percentage of the mean absolute actual.

    Computes ``100 * mean(|y_true - y_pred|) / mean(|y_true|)``. This is a
    ratio of two means, not the mean of per-observation percentage errors, so
    individual zeros in ``y_true`` are harmless; the result is only undefined
    (nan/inf) when the mean of ``|y_true|`` is zero.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        Error expressed in percent.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    mean_abs_error = np.abs(actual - predicted).mean()
    mean_abs_actual = np.abs(actual).mean()
    return mean_abs_error * 100 / mean_abs_actual

# Modeling:

## LightGBM (MAPE):

In [318]:
# Coarse grid over ensemble size, tree shape and learning rate.
param_grid = {
    'n_estimators': list(range(140, 160, 5)),
    'num_leaves': list(range(7, 10, 1)),
    # 'min_data_in_leaf': [10, 20, 40, 60, 100],
    'max_depth': [2, 4, 6, 7, 8],
    'learning_rate': [0.14, .15, 0.16],
    # 'bagging_freq': [3, 4, 5, 6, 7],
    # 'bagging_fraction': np.linspace(0.6, 0.95, 10),
    # 'reg_alpha': np.linspace(0.1, 0.95, 10),
    # 'reg_lambda': np.linspace(0.1, 0.95, 10)
}

# The metric is an error, so `greater_is_better=False` makes the search
# minimise it.
scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# The regressor is built without a `cat_features` argument. That keyword
# belongs to CatBoost and does not appear among LightGBM's documented
# parameters. In addition, `cat_var` stores column positions within `data`
# (computed while `Sales` was still a column), whereas the model is fitted on
# the standardised array derived from `X_train`, so the positions would not
# line up with the features anyway.
# NOTE(review): for native categorical handling, fit on un-scaled
# integer/category columns and pass `categorical_feature` with positions
# relative to the training features.
gbm = GridSearchCV(lgb.LGBMRegressor(),
                   param_grid,
                   scoring=scorer,
                   n_jobs=-2)

gbm.fit(X_train_scaled, y_train)

# Error of the refitted best model on the held-out part of the random split.
print("lightgbm mape:", mean_absolute_percentage_error(y_test, gbm.predict(X_test_scaled)), gbm.best_params_)



lightgbm mape: 49.66984845372074 {'learning_rate': 0.15, 'max_depth': 7, 'n_estimators': 155, 'num_leaves': 9}


### Tuning other parameters:

In [321]:
# Second pass: structure and learning-rate values are pinned to single values
# (the ones reported as best by the earlier search output), and the search runs
# over leaf size and L1/L2 regularisation.
param_grid = {
    'n_estimators': [155],
    'num_leaves': [9],
    'min_data_in_leaf': [9, 10, 11, 12, 13],
    'max_depth': [7],
    'learning_rate': [.15],
    # 'bagging_freq': [3, 4, 5, 6, 7],
    # 'bagging_fraction': np.linspace(0.6, 0.95, 10),
    'reg_alpha': np.linspace(0.1, 1, 10),
    'reg_lambda': np.linspace(0.1, 0.95, 10),
}

# The metric is an error, so `greater_is_better=False` makes the search
# minimise it.
scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# The regressor is built without a `cat_features` argument. That keyword
# belongs to CatBoost and does not appear among LightGBM's documented
# parameters. In addition, `cat_var` stores column positions within `data`
# (computed while `Sales` was still a column), whereas the model is fitted on
# the standardised array derived from `X_train`, so the positions would not
# line up with the features anyway.
# NOTE(review): for native categorical handling, fit on un-scaled
# integer/category columns and pass `categorical_feature` with positions
# relative to the training features.
gbm = GridSearchCV(lgb.LGBMRegressor(),
                   param_grid,
                   scoring=scorer,
                   n_jobs=-2)

gbm.fit(X_train_scaled, y_train)

# Error of the refitted best model on the held-out part of the random split.
print("lightgbm mape:", mean_absolute_percentage_error(y_test, gbm.predict(X_test_scaled)), gbm.best_params_)



lightgbm mape: 49.59330549097811 {'learning_rate': 0.15, 'max_depth': 7, 'min_data_in_leaf': 10, 'n_estimators': 155, 'num_leaves': 9, 'reg_alpha': 1.0, 'reg_lambda': 0.6666666666666666}
