In [1]:
import itertools
import time
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")
import logging
logging.getLogger('prophet').setLevel(logging.WARNING)
from millify import millify

In [2]:
df = pd.read_csv("item_orders.csv")

In [3]:
def get_time_series(df_in, filter_in, frequency = "Daily", fill_na = True):
    
    """
    Returns the time series dataframe for given filter, frequency,  and imput dataframe
    
    df_in = Pandas dataframe
    filter_in = filter that will be applied on df_in dataframe. Example: df_in.group1 == "Running shoes"
    frequency = Daily, Weekly or Monthly
    """        
    
    df_temp = df_in[filter_in]
    
    if(df_temp.shape[0] == 0): #Checking if df is empty and returning it to prevent errors in retrieving the serie
        return(df_temp["quantity"])
    
    start_date = min(df_temp.date)[0:10]
    end_date = max(df_in.date)[0:10]  
    
    if frequency == "Daily":
        idx = pd.date_range(start_date, end_date, freq="D")
        df_temp['date']=pd.to_datetime(df_temp['date'], format = "%Y-%m-%d")
        df_out = df_temp.groupby("date").sum()
    
    if frequency == "Weekly":
        idx = pd.date_range(start_date, end_date, freq='W-SUN')
        date=pd.to_datetime('2019-06-01')
        df_temp['date']=pd.to_datetime(df_temp['date'], format = "%Y-%m-%d")
        df_out = df_temp.resample('W-{:%a}'.format(date), on='date').sum()
        df_out = df_out.iloc[1: , :] # dropping the first row as it is an incomplete week
    
    if frequency == "Monthly":
        idx = pd.date_range(start_date, end_date, freq="M")
        date = pd.to_datetime('2019-06-01')
        df_temp['date'] = pd.to_datetime(df_temp['date'], format = "%Y-%m-%d")
        df_out = df_temp.resample('M'.format(date), on='date').sum()
        
    
    df_out = df_out["quantity"]
    if(fill_na):
        df_out = df_out.reindex(idx, fill_value=0)
    
    return(df_out)
    

In [4]:
df.head()

Unnamed: 0,order_id,date,item_code,unit_price_vat_excl,quantity,department,item_name,name,group1,country
0,2000093387,2020-04-24 00:00:00,S101,3.506048,1,E-COMMER,Dopravné,- žádný výrobce -,,Hungary
1,2000093391,2020-04-24 00:00:00,S101,3.737403,1,E-COMMER,Dopravné,- žádný výrobce -,,Slovakia
2,2000093394,2020-04-24 00:00:00,S101,3.171318,1,E-COMMER,Dopravné,- žádný výrobce -,,Czech Republic
3,2000093395,2020-04-24 00:00:00,S101,2.850775,1,E-COMMER,Dopravné,- žádný výrobce -,,Czech Republic
4,2000093400,2020-04-24 00:00:00,S101,3.336224,1,E-COMMER,Dopravné,- žádný výrobce -,,Romania


In [5]:
sales_counts = df["group1"].value_counts()
interest_items = list(sales_counts.index[sales_counts.index != "nan"][0:10])

In [6]:
interest_items.append("All items")

In [7]:
sales_counts[sales_counts.index.isin(interest_items)].sum()/df.shape[0]

0.6585047821726183

65% of sales fall into one of these categories

In [8]:
interest_items

['Running shoes',
 'Pants',
 'T-Shirts',
 'Football shoes',
 'Socks',
 'Other Footwear',
 'Sweatshirts',
 'Jackets',
 'Jerseys',
 'Fitness Shoes',
 'All items']

In [9]:
countries = list(df["country"].value_counts().index)
countries.append("All countries")
#countries.remove("Other") # some items under Other countries have so little observations that they are causing errors in the model
countries.remove("Croatia")
countries

['Czech Republic',
 'Slovakia',
 'Romania',
 'Hungary',
 'Germany',
 'Spain',
 'France',
 'Italy',
 'Austria',
 'Other',
 'All countries']

## Model fitting

In [10]:
Fill_na = True #Fill na's in the series? If set to True will fill all the missing with 0s. Assuming if there isn't an order we did not sell any items

# Dates for filtering predictions and cut
today = datetime.today()
tomorrow = today + timedelta(days=1)
tomorrow = tomorrow.date()
last_day_week = today + timedelta(days=5 - today.weekday())
first_day_week = last_day_week - timedelta(days=6)
fist_d_next_week = last_day_week + timedelta(days=1)
last_d_next_week = last_day_week + timedelta(days=7)
first_next_month = datetime(year=(today.year + int(today.month % 12 == 0)), month=(today.month + 1) % 12, day=1)
last_next_month = datetime(year=(today.year + int(today.month % 12 == 0)), month=(today.month + 2) % 12, day=1) - timedelta(days=1)
first_this_month = datetime(year=(today.year + int(today.month % 12 == 0)), month=(today.month) % 12, day=1)
last_this_month = first_next_month -timedelta(days=1)
today = today.date()

cut = [max(pd.to_datetime(df.date))- timedelta(days=30)]

# Metrics I will calculate
product_list = []
country_list = []
mape_baseline = []
mape_best = []
forecast_tomorrow = []
forecast_next_week = []
forecast_next_month = []
day_increase = []
week_increase = []
month_increase = []
df_output_predictions = pd.DataFrame()

# Counting iterations 
i=1
total_it = int(len(interest_items)*len(countries))

#number of days to keep on serie to plot. Ex: 60 will plot the previous 60 days + days_ahead below
n_days = 60
plot_start_date = today - timedelta(days=n_days)

# Parameters to hypertune
param_grid = {'changepoint_prior_scale': [0.01, 0.05, 0.1],
              'seasonality_prior_scale': [1.0, 10.0, 100.0],
              'seasonality_mode': ['additive', 'multiplicative']}
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]


start_time = time.time()

for country in countries:
    for item in interest_items: 
        
        print(f"Iteration {i} of {total_it}.") 
        
        if(country == "All countries" and item == "All items"):
            filter_in = np.ones(df.shape[0], dtype=bool)
        elif(country == "All countries"):
            filter_in = df["group1"] == item
        elif(item == "All items"):
            filter_in = df["country"] == country      
        else:
            filter_in = (df["country"] == country) & (df["group1"] == item)
        
        #creating temporary time series
        df_temp = get_time_series(df, filter_in = filter_in, frequency = "Daily", fill_na = False)
        df_temp.bfill(axis ='rows')
        df_temp = df_temp.reset_index(drop=False)
        df_temp.columns = ["ds", "y"]

        if(df_temp.shape[0] < 50):
            break
        
                
        country_list.append(country)
        product_list.append(item)
        
        try:
            #hyperparameter tunning
            mapes_temp = [] #select models based on mape

            for params in all_params:
                model = Prophet(**params).fit(df_temp)
                df_cv = cross_validation(model, horizon='30 days', cutoffs = cut, parallel="processes")
                df_p = performance_metrics(df_cv, rolling_window=1)
                mapes_temp.append(df_p['mape'].values[0])

            # Find the best parameters
            tuning_results = pd.DataFrame(all_params)
            tuning_results['mape'] = mapes_temp
            best_params = all_params[np.argmin(mapes_temp)]

            # Updating metrics list
            mape_baseline.append(str(round(tuning_results.at[8,"mape"],2)))
            mape_best.append(str(round(tuning_results.at[np.argmin(mapes_temp),"mape"],2)))

            #fit model with best parameters again
            model = Prophet(**best_params).fit(df_temp)
        
        except:
            #handling the case where the data does not provide enough points to k-fold
            print("Excepted an error")
            model = Prophet().fit(df_temp)
            future_except = model.make_future_dataframe(periods=0, include_history=True)
            forecast_except = model.predict(future_except)
            y_hat = forecast_except.tail(30).yhat
            mape_except = mean_absolute_percentage_error(df_temp.tail(30).y, y_hat)
            mape_baseline.append(round(mape_except,2))
            mape_best.append(round(mape_except,2))
            
        #Forecasting
        future = model.make_future_dataframe(periods=70, include_history=True)
        forecast = model.predict(future)
        
        try:
            tomorrow_for_temp = forecast[forecast["ds"] == tomorrow.strftime('%Y-%m-%d')].yhat.values[0]
            forecast_tomorrow.append(tomorrow_for_temp)
            day_increase.append(str(int((1 - int(forecast[forecast["ds"] == today.strftime('%Y-%m-%d')].yhat)/tomorrow_for_temp)*100))+"%")
        except:
            tomorrow_for_temp = "nan"
            forecast_tomorrow.append("nan")
            day_increase.append("nan")
            
        
        next_week_temp = forecast[(forecast["ds"]>=fist_d_next_week) & (forecast["ds"]<=last_d_next_week)].yhat.sum()
        forecast_next_week.append(round(next_week_temp,0))
        
        this_week_temp = forecast[(forecast["ds"]>=first_day_week) & (forecast["ds"]<=last_day_week)].yhat.sum()
        week_increase.append(str(int((1 - this_week_temp/next_week_temp)*100))+"%")
        
        next_month_temp = forecast[(forecast["ds"]>=first_next_month) & (forecast["ds"]<=last_next_month)].yhat.sum()
        forecast_next_month.append(round(next_month_temp,0))
        
        this_month_temp = forecast[(forecast["ds"]>=first_this_month) & (forecast["ds"]<=last_this_month)].yhat.sum()
        month_increase.append(str(int((1 - this_month_temp/next_month_temp)*100))+"%")
        
        #merging to get observed values
        forecast = pd.concat([forecast, df_temp["y"]], axis=1)
        
        #keeping only the columns I need
        forecast = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper', "y"]]

        #How many days of the forecast to keep for the plot?
        #forecast = forecast[(forecast["ds"] >= plot_start_date) & (forecast["ds"] <= last_day_week)]
        
        n_rows = forecast.shape[0]
        forecast["item"] = [item]*n_rows
        forecast["country"] = [country]*n_rows
        
        df_output_predictions = df_output_predictions.append(forecast, ignore_index=True)
        
        i += 1
        
elapsed_time = time.time() - start_time
print(f"Elapsed time = {elapsed_time}")

Iteration 1 of 121.
Iteration 2 of 121.
Iteration 3 of 121.
Iteration 4 of 121.
Iteration 5 of 121.
Iteration 6 of 121.
Iteration 7 of 121.
Iteration 8 of 121.
Iteration 9 of 121.
Iteration 10 of 121.
Iteration 11 of 121.
Iteration 12 of 121.
Iteration 13 of 121.
Iteration 14 of 121.
Iteration 15 of 121.
Iteration 16 of 121.
Iteration 17 of 121.
Iteration 18 of 121.
Iteration 19 of 121.
Iteration 20 of 121.
Iteration 21 of 121.
Iteration 22 of 121.
Iteration 23 of 121.
Iteration 24 of 121.
Iteration 25 of 121.
Iteration 26 of 121.
Iteration 27 of 121.
Iteration 28 of 121.
Iteration 29 of 121.
Iteration 30 of 121.
Iteration 31 of 121.
Iteration 32 of 121.
Iteration 33 of 121.
Iteration 34 of 121.
Iteration 35 of 121.
Iteration 36 of 121.
Iteration 37 of 121.
Iteration 38 of 121.
Iteration 39 of 121.
Iteration 40 of 121.
Iteration 41 of 121.
Iteration 42 of 121.
Iteration 43 of 121.
Iteration 44 of 121.
Iteration 45 of 121.
Iteration 46 of 121.
Iteration 47 of 121.
Iteration 48 of 121.
I

In [11]:
df_metrics = pd.DataFrame({
    "product":product_list,
    "country":country_list,
    "mape_baseline":mape_baseline,
    "mape_best":mape_best,
    "tomorrow":forecast_tomorrow,
    "next_week":forecast_next_week,
    "next_month":forecast_next_month,
    "day_increase":day_increase,
    "week_increase":week_increase,
    "month_increase":month_increase
})

In [12]:
df_metrics

Unnamed: 0,product,country,mape_baseline,mape_best,tomorrow,next_week,next_month,day_increase,week_increase,month_increase
0,Running shoes,Czech Republic,0.27,0.27,55.611640,290.0,1596.0,-6%,-7%,-9%
1,Pants,Czech Republic,0.33,0.32,65.589659,319.0,1600.0,-5%,-15%,-28%
2,T-Shirts,Czech Republic,0.36,0.33,42.355545,210.0,1111.0,-6%,-14%,-29%
3,Football shoes,Czech Republic,0.26,0.26,44.544905,214.0,1123.0,-5%,-12%,-17%
4,Socks,Czech Republic,1.32,0.6,117.098194,530.0,2595.0,-3%,-24%,-23%
...,...,...,...,...,...,...,...,...,...,...
97,Sweatshirts,All countries,0.33,0.22,125.239738,655.0,2968.0,-1%,-10%,-25%
98,Jackets,All countries,0.32,0.21,97.310906,484.0,2148.0,1%,-16%,-38%
99,Jerseys,All countries,0.64,0.39,110.544557,527.0,2311.0,4%,-14%,-16%
100,Fitness Shoes,All countries,0.28,0.26,52.913037,285.0,1386.0,0%,-7%,-20%


In [13]:
df_metrics.to_csv("df_metrics_tuned_2.csv")

In [14]:
df_output_predictions

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper,y,item,country
0,2019-06-01,30.444337,5.746685,55.579954,3.0,Running shoes,Czech Republic
1,2019-06-03,50.872299,26.486453,74.424940,2.0,Running shoes,Czech Republic
2,2019-06-04,44.570893,20.131205,68.238773,71.0,Running shoes,Czech Republic
3,2019-06-05,40.891288,16.370951,63.220576,69.0,Running shoes,Czech Republic
4,2019-06-06,34.193648,11.116683,56.796921,21.0,Running shoes,Czech Republic
...,...,...,...,...,...,...,...
92035,2022-03-20,3015.910113,2529.023504,3534.412118,,All items,All countries
92036,2022-03-21,3169.808566,2664.989145,3659.771964,,All items,All countries
92037,2022-03-22,2925.319795,2427.578590,3453.274768,,All items,All countries
92038,2022-03-23,2848.160745,2297.297027,3357.129668,,All items,All countries


In [15]:
df_output_predictions.to_csv("df_output_predictions_tuned_2.csv")

In [16]:
df_temp[df_temp.y == 0]

Unnamed: 0,ds,y
