In [135]:
#basic essential libraries
import pandas as pd
import numpy as np
import string
from math import sqrt
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error

from statsmodels.tsa.arima_model import ARIMA

#to ignore warning messages
import warnings
warnings.filterwarnings('ignore')
from subprocess import check_output

In [2]:
#Data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

#set visual parameters
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 10,6

In [3]:
PATH = "C:/Users/kpunyakoti/Desktop/Future/Flix/Data/traintest/"
PATH2 = "C:/Users/kpunyakoti/Desktop/Future/Flix/Data/"

In [4]:
train = pd.read_csv(f'{PATH}train.csv')
test = pd.read_csv(f'{PATH}test.csv')
dp = pd.read_csv(f'{PATH2}datapoint.csv')

In [6]:
cols_str = ['country_1', 'channel_id']
for x in cols_str:
    train[x] = train[x].astype(str)
    test[x] = test[x].astype(str)
    dp[x] = dp[x].astype(str)

In [7]:
train['cc'] = train.country_1+'_'+train.channel_id
test['cc'] = test.country_1+'_'+test.channel_id
dp['cc'] = dp.country_1+'_'+dp.channel_id

In [8]:
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [None]:
actual_test = test.copy()

In [None]:
test.tickets = np.nan

In [9]:
cc_minor = dp.loc[dp['rowcount']<25, 'cc']

In [10]:
for i in cc_minor.values:
    test.loc[test['cc'] == i, 'tickets'] = 0

In [11]:
cc_major = dp.loc[~dp.cc.isin(cc_minor.values), 'cc']

In [12]:
print(len(cc_minor.values))
print(len(cc_major.values))

605
207


#### Run from here safely

In [25]:
new_train = train.loc[train.cc.isin(cc_major.values)]
new_test = test.loc[test.cc.isin(cc_major.values)]

In [14]:
forecast_dates = new_test.date.unique()

In [16]:
cc_new = ['20_26', '20_28', '20_29']

## Time Series

In [18]:
def get_RSS(series, fitted_values):
    fitted_values_copy = fitted_values  # original fit is left untouched
    missing_index = list(set(series.index).difference(set(fitted_values_copy.index)))
    if missing_index:
        nan_series = pd.Series(index = pd.to_datetime(missing_index))
        fitted_values_copy = fitted_values_copy.append(nan_series)
        fitted_values_copy.sort_index(inplace = True)
        fitted_values_copy.fillna(method = 'bfill', inplace = True)  # fill holes
        fitted_values_copy.fillna(method = 'ffill', inplace = True)
    return sum((fitted_values_copy - series)**2)

In [19]:
def blackmamba(cc, train):
    tf = train.loc[train.cc == cc, ['date','tickets']]
    tf.sort_values('date', inplace=True)
    tf.set_index('date', inplace=True)
    
    df_logscale = np.log1p(tf)
    
    dflogshift = df_logscale - df_logscale.shift()
    dflogshift.dropna(inplace=True)
    
    results = arima_iter(df_logscale, dflogshift.tickets)
    
    best_pqd = min(results, key = results.get)
    best_rss, best_model = results.get(best_pqd)
    #print(best_rss)
    #print(best_model)
    
    preds_array = best_model.forecast(steps = 22, alpha = 0.05)[0]
    final_preds = np.round(np.expm1(preds_array))
    
    return final_preds

In [20]:
def arima_iter(logscale, logshift):
    results = {}
    for AR in range(0,8):
        for MA in range(0,8):
            model = ARIMA(logscale, order = (AR,1,MA))
            fit_is_available = False
            results_ARIMA = None
            try:
                results_ARIMA = model.fit(disp = -1, method = 'css')
                fit_is_available = True
            except:
                continue
            if fit_is_available:
                RSS = get_RSS(logshift, results_ARIMA.fittedvalues)
                results['%d-1-%d' % (AR,MA)]=[RSS, results_ARIMA]
    return results

In [None]:
counter = 207
for cc in cc_major.values:
    print('Forecasting cc: {}'.format(cc))
    print(counter, ' left')
    preds = blackmamba(cc,new_train)
    
    i = 0
    for d in forecast_dates:
        new_test.loc[((new_test.date == d)&(new_test.cc==cc)), 'tickets'] = preds[i]
        i+=1    
    counter -=1

### Post processing

In [99]:
new_test['tickets'] =new_test['tickets'].astype(str)

In [83]:
test.tickets.fillna(0, inplace=True)
test['tickets'] = test['tickets'].astype(int)

In [100]:
final_test = pd.merge(test, new_test, how = 'left', on = ['date', 'country_1', 'channel_id', 'cc'])

In [101]:
final_test.tickets_y.fillna(0, inplace=True)

In [104]:
final_test['tickets_y'] = final_test['tickets_y'].astype(int)
final_test['tickets'] = final_test.tickets_x+final_test.tickets_y

In [105]:
final_test.drop(['tickets_x','tickets_y'], axis = 1, inplace= True)

**Actual test set**

In [111]:
test_actual = pd.read_csv(f'{PATH}test.csv')

In [113]:
test_actual['date'] = pd.to_datetime(test_actual['date'])

In [115]:
cols_str = ['country_1', 'channel_id']
for x in cols_str:
    test_actual[x] = test_actual[x].astype(str)

In [116]:
test_actual['cc'] = test_actual.country_1+'_'+test_actual.channel_id
cc_all = test_actual.cc.unique()
len(cc_all)

In [133]:
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))
def smape(y, yhat):
    return 100 * np.mean(2 * np.abs(y - yhat)/(np.abs(y) + np.abs(yhat)))


In [132]:
ticks = test_actual.tickets
preds = final_test.tickets

In [136]:
error = rmspe(ticks, preds)
smap = smape(ticks, preds)
rmse = sqrt(mse(ticks, preds))
print('RMSPE: {:.4f}'.format(error))
print('SMAPE: {:.4f}'.format(smap))
print('RMSE: {:4f}'.format(rmse))
print('MAE: {:.4f} '.format(mean_absolute_error(ticks, preds)))

RMSPE: inf
SMAPE: 83.9071
RMSE: 8.999527
MAE: 1.0607 


### CC_Metrics

In [151]:
def cc_met(cc, train):
    tf = train.loc[train.cc == cc, ['date','tickets']]
    tf.sort_values('date', inplace=True)
    tf.set_index('date', inplace=True)
    
    df_logscale = np.log1p(tf)
    
    dflogshift = df_logscale - df_logscale.shift()
    dflogshift.dropna(inplace=True)
    
    best_pdq, best_rss = cc_arima(df_logscale, dflogshift.tickets)
    
    return best_pdq, best_rss

In [150]:
def cc_arima(logscale, logshift):
    results = {}
    for AR in range(0,8):
        for MA in range(0,8):
            model = ARIMA(logscale, order = (AR,1,MA))
            fit_is_available = False
            results_ARIMA = None
            try:
                results_ARIMA = model.fit(disp = -1, method = 'css')
                fit_is_available = True
            except:
                continue
            if fit_is_available:
                RSS = get_RSS(logshift, results_ARIMA.fittedvalues)
                results['%d-1-%d' % (AR,MA)]=[RSS, results_ARIMA]
    best_pdq = min(results, key = results.get)
    best_rss, best_model = results.get(best_pdq)
    
    return best_pdq, best_rss

In [None]:
df_cols = ['country', 'channel', 'best_p-d-q', 'best_RSS', 'rmse', 'mae', 'smape']
cc_metrics = pd.DataFrame(columns=df_cols)
row_index=0
for cc in cc_major.values:
    country, channel = cc.split('_')
    pdq, rss = cc_met(cc,new_train)
    
    cc_metrics.loc[row_index, 'country'] = country
    cc_metrics.loc[row_index, 'channel'] = channel
    cc_metrics.loc[row_index, 'best_p-d-q'] = pdq
    cc_metrics.loc[row_index, 'best_RSS'] = rss
    
    actuals = test_actual.loc[test_actual.cc == cc].tickets
    predicted = final_test.loc[final_test.cc == cc].tickets
    
    smap = smape(actuals, predicted)
    rmse = sqrt(mse(actuals, predicted))
    mae = mean_absolute_error(actuals, predicted)
    
    cc_metrics.loc[row_index, 'rmse'] = rmse
    cc_metrics.loc[row_index, 'smap'] = smap
    cc_metrics.loc[row_index, 'mae'] = mae
    
    row_index +=1
    print(row_index,'|', country,'|', channel,'|', pdq,'|', rss,'|', rmse,'|', smap,'|', mae)

In [171]:
cc_metrics.loc[((cc_metrics.country =='5') & (cc_metrics.channel=='35'))]

Unnamed: 0,country,channel,best_p-d-q,best_RSS,rmse,mae,smape
135,5,35,0-1-1,144.376,3.80789,3.04545,51.855196


In [158]:
cc_metrics.drop('smape', axis=1, inplace=True)

In [159]:
cc_metrics.rename(columns = {"smap":"smape"}, inplace=True)

In [161]:
cc_metrics.to_csv(f'{PATH}cc_metrics.csv')

In [154]:
df_cols2 = ['country', 'channel']
cc_m = pd.DataFrame(columns=df_cols2)
row_index=0
for cc in cc_major.values:
    country, channel = cc.split('_')    
    cc_m.loc[row_index, 'country'] = country
    cc_m.loc[row_index, 'channel'] = channel
    row_index +=1