In [7]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

  from pandas.core import datetools


In [8]:
import datetime as dt
import psycopg2 as pg
import pandas.io.sql as pd_sql

from sqlalchemy import create_engine
import psycopg2 
import io


In [13]:
def get_sarima_prediction(deptID):
    """Fit one SARIMA model per store for a department and pickle the predictions.

    For each store ID from 1 to 45 the store's ``Weekly_Sales`` values are
    standardised with ``StandardScaler``, a ``SARIMAX`` model is fitted with
    the ``(order, seasonal_order)`` pair stored for that store in
    ``data/sarima_best_params_dept_<deptID>.pkl``, and the predictions from
    observation 0 onwards (in scaled units) are collected. A store for which
    any step fails is reported and skipped.

    The combined frame, with columns ``Date``,
    ``sarima_predicted_scaled_sales``, ``Store`` and ``Dept``, is written to
    ``data/sarima_prediction_df_dept_<deptID>.pkl``. If no store succeeds an
    empty DataFrame is written.

    Parameters
    ----------
    deptID : int
        Department identifier used to build the input and output file names.

    Returns
    -------
    None
    """
    store_ids = range(1, 46)

    sales_data_file = 'data/clean_sales_dept_' + str(deptID) + '.pkl'
    all_stores_sales_df = pd.read_pickle(sales_data_file)
    # NOTE(review): every store's predictions are labelled with this
    # department-wide date list; a store whose series length differs from it
    # raises on index assignment and is skipped below -- confirm that is intended.
    idx = sorted(all_stores_sales_df['Date'].unique())

    sarima_best_prams_file = 'data/sarima_best_params_dept_' + str(deptID) + '.pkl'
    # NOTE: pickle can execute arbitrary code on load; only read trusted files.
    with open(sarima_best_prams_file, "rb") as params_file:
        best_params_dict = pickle.load(params_file)

    store_frames = []
    for storeID in store_ids:
        print('dept= ',deptID,' store= ',storeID)

        try:
            sales_df = all_stores_sales_df[all_stores_sales_df['Store']==storeID]
            sales = sales_df['Weekly_Sales']
            best_params = best_params_dict[storeID]

            # SARIMAX is fitted on standardised sales, so predictions are in scaled units.
            scaler = StandardScaler()
            X = scaler.fit_transform(sales.values.reshape(-1, 1))

            mod = sm.tsa.statespace.SARIMAX(
                X,
                order = best_params[0],
                seasonal_order = best_params[1],
                enforce_stationarity=False,
                enforce_invertibility=False)
            res = mod.fit()

            predicted_scaled_sales = pd.Series(res.predict(0))
            predicted_scaled_sales.index = idx
            predicted_scaled_sales_df = predicted_scaled_sales.to_frame().reset_index()
            predicted_scaled_sales_df.columns = ['Date','sarima_predicted_scaled_sales']

            n_rows = predicted_scaled_sales_df.shape[0]
            predicted_scaled_sales_df['Store'] = [storeID]*n_rows
            predicted_scaled_sales_df['Dept'] = [deptID]*n_rows
        except Exception as exc:
            # Catch Exception rather than everything so that interrupting the
            # kernel (KeyboardInterrupt) still stops this long-running loop.
            print('failed to get prediction for dept= ',deptID,' store= ',storeID,' error= ',repr(exc))
            continue
        store_frames.append(predicted_scaled_sales_df)

    # Concatenate once instead of growing the frame inside the loop.
    result = pd.concat(store_frames) if store_frames else pd.DataFrame()

    with open('data/sarima_prediction_df_dept_' + str(deptID) + '.pkl', "wb") as out_file:
        pickle.dump(result, out_file)
    return

In [9]:
# Load the department-1 predictions as the starting frame.
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open('data/sarima_prediction_df_dept_1.pkl', "rb") as dept1_file:
    df1 = pickle.load(dept1_file)

In [None]:
# Generate and pickle SARIMA predictions for departments 2..81. A failure in
# one department is reported and the loop moves on to the next one.
for deptID in range(2, 82):
    try:
        get_sarima_prediction(deptID)
    except Exception as exc:
        # Exception (not a bare except) so a kernel interrupt still stops the run.
        print('failed to process dept ',deptID,' error= ',repr(exc))

dept=  2  store=  1
dept=  2  store=  2
dept=  2  store=  3
dept=  2  store=  4
dept=  2  store=  5
dept=  2  store=  6
dept=  2  store=  7
dept=  2  store=  8
dept=  2  store=  9
dept=  2  store=  10
dept=  2  store=  11
dept=  2  store=  12
dept=  2  store=  13
dept=  2  store=  14
dept=  2  store=  15
dept=  2  store=  16


In [10]:
# Append the pickled per-department prediction frames (departments 2..81) to df1.
# NOTE: this rebinds df1, so re-running the cell without first reloading df1
# appends the same departments again.
dept_frames = [df1]
skipped_depts = []
for deptID in range(2, 82):
    dept_file = 'data/sarima_prediction_df_dept_'+str(deptID)+'.pkl'
    try:
        with open(dept_file, "rb") as dept_handle:
            df2 = pickle.load(dept_handle)
    except Exception:
        # Best effort: a department without a readable prediction file is
        # skipped, but recorded so the gap is visible.
        skipped_depts.append(deptID)
        continue
    dept_frames.append(df2)

# Concatenate once instead of growing df1 inside the loop.
df1 = pd.concat(dept_frames)
if skipped_depts:
    print('skipped depts without a readable prediction file: ', skipped_depts)

In [11]:
# Preview the first rows of the combined predictions frame.
df1.head()

Unnamed: 0,Date,sarima_predicted_scaled_sales,Store,Dept
0,2010-01-04,0.0,1,1
1,2010-01-11,-0.006817,1,1
2,2010-01-18,-0.00695,1,1
3,2010-01-25,-0.006952,1,1
4,2010-02-01,-0.006952,1,1


In [64]:
def date_to_string(d):
    """Format ``d`` as a ``YYYY-MM-DD`` string via its ``strftime`` method."""
    iso_day_format = '%Y-%m-%d'
    return d.strftime(iso_day_format)

In [12]:
def get_year(d):
    """Return the ``year`` attribute of ``d``."""
    year_value = d.year
    return year_value
def get_month(d):
    """Return the ``month`` attribute of ``d``."""
    month_value = d.month
    return month_value
def get_day(d):
    """Return the ``day`` attribute of ``d``."""
    day_value = d.day
    return day_value

In [13]:
# Work on an independent copy of df1 and split Date into year/month/day columns.
df = df1.copy(deep=True)
for part_name, part_getter in (('year', get_year), ('month', get_month), ('day', get_day)):
    df[part_name] = df['Date'].apply(part_getter)
df.head()

Unnamed: 0,Date,sarima_predicted_scaled_sales,Store,Dept,year,month,day
0,2010-01-04,0.0,1,1,2010,1,4
1,2010-01-11,-0.006817,1,1,2010,1,11
2,2010-01-18,-0.00695,1,1,2010,1,18
3,2010-01-25,-0.006952,1,1,2010,1,25
4,2010-02-01,-0.006952,1,1,2010,2,1


In [14]:
# Date is now redundant with year/month/day. Reassign instead of mutating in
# place, and ignore a missing column, so re-running this cell does not raise.
df = df.drop(['Date'], axis=1, errors='ignore')
df.head()

Unnamed: 0,sarima_predicted_scaled_sales,Store,Dept,year,month,day
0,0.0,1,1,2010,1,4
1,-0.006817,1,1,2010,1,11
2,-0.00695,1,1,2010,1,18
3,-0.006952,1,1,2010,1,25
4,-0.006952,1,1,2010,2,1


In [15]:
# Persist the combined frame; the context manager closes (and flushes) the file.
with open('data/sarima_prediction_df_all_dept.pkl', "wb") as all_dept_file:
    pickle.dump(df, all_dept_file)

In [16]:
# Show the column order of df.
df.columns

Index(['sarima_predicted_scaled_sales', 'Store', 'Dept', 'year', 'month',
       'day'],
      dtype='object')

In [17]:
# Keyword arguments for the PostgreSQL connection.
connection_args = dict(
    host='34.220.165.163',  # current aws instance public IP
    user='ubuntu',
    dbname='ubuntu',
    port=5432,
)

In [18]:
# Open a psycopg2 connection using the settings in connection_args.
connection = pg.connect(**connection_args)

In [19]:
# Cursor used for the statements issued on this connection.
cur = connection.cursor()

In [21]:
# Create the destination table for the predictions.
# NOTE(review): Dept is declared before Store here, whereas the DataFrame's
# columns are ordered Store, Dept. A COPY without an explicit column list
# maps fields by position -- confirm the load names its target columns.
cur.execute("""CREATE TABLE sarima_prediction (
    sarima_predicted_scaled_sales float, 
    Dept int,
    Store int,
    year int,
    month int,
    day int
  
);""")

In [22]:
# Bulk-load df into sarima_prediction through an in-memory tab-separated buffer.
output = io.StringIO()
df.to_csv(output, sep='\t', header=False, index=False)
output.seek(0)
# Name the target columns explicitly, in the DataFrame's own order. The table
# declares Dept before Store while df has Store before Dept, so a positional
# COPY would write Store values into dept and Dept values into store.
# Lower-cased because the unquoted identifiers in CREATE TABLE are folded to
# lower case by PostgreSQL.
copy_columns = [column_name.lower() for column_name in df.columns]
cur.copy_from(output, 'sarima_prediction', null="", columns=copy_columns)  # empty fields are loaded as NULL
connection.commit()

In [23]:
# Spot-check a single row of the loaded table.
spot_check_sql = 'select * from sarima_prediction where store=1 and dept=1 and year=2012 and month=11 and day= 19 '
test = pd.read_sql_query(spot_check_sql, con=connection)
test

Unnamed: 0,sarima_predicted_scaled_sales,dept,store,year,month,day
0,-0.347483,1,1,2012,11,19
