### Import

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import holidays
import itertools

from prophet import Prophet
from prophet.plot import add_changepoints_to_plot
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics

  from .autonotebook import tqdm as notebook_tqdm


- Train 데이터의 기간이 22년 1월 1일 ~ 23년 4월 24일까지의 일별 판매량 데이터
- Test 데이터의 기간이 23년 4월 25일 ~ 23년 5월 15일까지로, 향후 21일의 일별 판매량을 예측

In [2]:
DATA_PATH = "../data" # 학습에 사용할 csv 파일이 저장된 폴더입니다.
TRAIN_FILE = "train.csv" # 학습 및 예측에 사용할 파일입니다.
TRAIN_PATH = os.path.join(DATA_PATH, TRAIN_FILE)
data = pd.read_csv(TRAIN_PATH)

df_brand = pd.read_csv(os.path.join(DATA_PATH, 'brand_keyword_cnt.csv'))
df_info = pd.read_csv(os.path.join(DATA_PATH, 'product_info.csv'))
df_sales = pd.read_csv(os.path.join(DATA_PATH, 'sales.csv'))
df_smp = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
df_data = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))

In [3]:
m_data = df_data.melt(id_vars=['ID', '제품', '대분류', '중분류', '소분류', '브랜드', '쇼핑몰'],
                var_name='ds', value_name='y', ignore_index=True)

In [4]:
m_data.head(3)

Unnamed: 0,ID,제품,대분류,중분류,소분류,브랜드,쇼핑몰,ds,y
0,SAMPLE_00000,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,S001-00001,2022-01-01,0
1,SAMPLE_00001,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,2022-01-01,0
2,SAMPLE_00002,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,2022-01-01,0


### 함수 정의

In [5]:
def train_test_split(train_s, date :str):

    val_s = train_s[train_s.ds >= date]
    train_s = train_s[train_s.ds < date]
    
    train_s['ds'] = pd.to_datetime(train_s['ds'])
    val_s['ds'] = pd.to_datetime(val_s['ds'])

    return train_s, val_s

In [6]:
def make_model():
    
    m = Prophet()
    
    m.add_seasonality(name='yearly', period=365.25, fourier_order=10, prior_scale=10, mode='additive')
    m.add_seasonality(name='monthly', period=30.5, fourier_order=5, prior_scale=10, mode='additive')
    m.add_seasonality(name='weekly', period=7, fourier_order=3, prior_scale=10, mode='additive')
    
    return m

In [7]:
def mae():
    
    t = (forecast[forecast.ds >= '2023-01-01']
         .loc[:, ['ds', 'yhat']]
         .merge(val_s)
    )

    t['yhat'] = t.apply(lambda x: x.yhat -2, axis=1)
    t['se'] = np.abs(t.yhat - t.y)
    
    return t['se'].mean()

In [8]:
def plot_forecast():
    
    t = (forecast
        .loc[:, ['ds', 'yhat']]
        .merge(train_s)
    )
    
    t['yhat'] = t.apply(lambda x: x.yhat -2, axis=1)
    t['se'] = np.abs(t.yhat - t.y)

    plt.scatter(t['ds'], t['y'], s=10)
    plt.scatter(t['ds'], t['yhat'], s=10)

### Prophet simple model - test

In [9]:
def ph_train(df):
    pred_list = []  
    for code in tqdm(df['ID'].unique()):
        d = df[df['ID'] == code].loc[:, ['ds', 'y']].reset_index().drop(['index'], axis=1).sort_values('ds')
        
        # 모델 생성, 학습
        m = make_model()
        # m.fit(train_s)
        if d.shape[0] > 2:
            m.fit(d)
        else:
            pass

        future = pd.DataFrame()
        future['ds'] = pd.date_range(start='2023-04-25', periods=21)
        forecast = m.predict(future)
        forecast.yhat = np.where(forecast.yhat < 0, 0, forecast.yhat)

        # 데이터프레임 형식으로 concat하기
        df = forecast[forecast.ds >= '2023-04-25'].loc[:, ['ds', 'yhat']].T
        df = df.drop(df.index[0])
        df = df.round(0).astype(int)
        pred = pred_list.append(df)
    
    predict_df = pd.concat(pred, ignore_index=True)
    return predict_df

predict = ph_train(m_data)
predict.to_csv('../submit/submission.csv', header=True, index=False)
    

  0%|          | 0/28894 [00:00<?, ?it/s]00:20:33 - cmdstanpy - INFO - Chain [1] start processing
00:20:33 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 1/28894 [00:00<4:41:02,  1.71it/s]


KeyError: 'ID'

In [None]:
# ID에 해당하는 데이터프레임에서 필요한 컬럼만 가져오기
train_s = train_m.loc[train_m['ID'] == i].loc[:, ['ds', 'y']]

In [10]:

pred_list = []  
for code in tqdm(m_data['ID'].unique()):
    d = m_data[m_data['ID'] == code].loc[:, ['ds', 'y']].reset_index().drop(['index'], axis=1).sort_values('ds')
        
    # 모델 생성, 학습
    m = make_model()
    # m.fit(train_s)
    if d.shape[0] > 2:
        m.fit(d)
    else:
        pass

    future = pd.DataFrame()
    future['ds'] = pd.date_range(start='2023-04-25', periods=21)
    forecast = m.predict(future)
    forecast.yhat = np.where(forecast.yhat < 0, 0, forecast.yhat)

    # 데이터프레임 형식으로 concat하기
    df = forecast[forecast.ds >= '2023-04-25'].loc[:, ['ds', 'yhat']].T
    df = df.drop(df.index[0])
    df = df.round(0).astype(int)
    pred_list.append(df)

predict_df = pd.concat(pred_list, ignore_index=True)
predict_df
    
        

  0%|          | 0/28894 [00:00<?, ?it/s]00:21:56 - cmdstanpy - INFO - Chain [1] start processing
00:21:56 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 1/28894 [00:00<4:26:22,  1.81it/s]00:21:56 - cmdstanpy - INFO - Chain [1] start processing
00:21:56 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 2/28894 [00:01<4:20:25,  1.85it/s]00:21:57 - cmdstanpy - INFO - Chain [1] start processing
00:21:57 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 3/28894 [00:01<4:11:51,  1.91it/s]00:21:57 - cmdstanpy - INFO - Chain [1] start processing
00:21:57 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 4/28894 [00:02<4:10:16,  1.92it/s]00:21:58 - cmdstanpy - INFO - Chain [1] start processing
00:21:58 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 5/28894 [00:02<4:08:24,  1.94it/s]00:21:58 - cmdstanpy - INFO - Chain [1] start processing
00:21:58 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 6/28894 [

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,2,2,2,3,2,3,3,3,3,3,...,3,3,3,3,3,3,3,2,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,32,41,33,35,31,27,39,34,40,31,...,44,52,69,64,63,46,46,48,52,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28889,0,0,0,0,0,0,0,0,0,0,...,1,2,2,4,5,7,9,11,12,12
28890,0,0,0,3,17,25,13,0,2,16,...,41,45,30,14,16,29,45,61,74,69
28891,0,4,7,7,7,4,3,3,6,10,...,13,11,8,6,7,9,10,11,7,3
28892,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# concat 데이터 csv파일로 저장하기
predict_df.to_csv('../submit/submission.csv', header=True, index=False)

In [None]:
EXAMPLE_FILE = "sample_submission.csv"
EXAMPLE_PATH = os.path.join(DATA_PATH, EXAMPLE_FILE)

submission = pd.read_csv(EXAMPLE_PATH)
submit = pd.DataFrame(predict_df)
submit["ID"] = submit.index
submit = submit["ID", 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
submit.columns = submission.columns

In [None]:
SAVE_PATH = "../submit" 
SUBMISSION_FILE = "submission1.csv"
SUBMIT_PATH = os.path.join(SAVE_PATH, SUBMISSION_FILE)
submit.to_csv(SUBMIT_PATH, index=False)

In [9]:
# concat 데이터 csv파일로 저장하기
predict_df.to_csv('/Users/haylee/Desktop/Desktop/LG_Aimers/해커톤/제출파일/submission.csv', header=True, index=False)