In [1]:
import pandas as pd

In [2]:
# Read the training set and the sample-submission template from the data folder.
train_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../open/train.csv", "../open/sample_submission.csv")
)

In [3]:
# Peek at the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
0,1960-01-01,2.2,-5.2,7.4,,68.3,1.7,6.7,,,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,,,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,,,4.0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,,,7.5
4,1960-01-05,1.3,-8.2,9.5,,44.0,5.1,8.2,,,-4.6


Q: Null 데이터를 어떻게 처리할 것인가?
    Q_1: 컬럼별 Null 분포에 따라 처리 방법을 다르게 할 수 있지 않을까?
    강수량 - 전 기간에 걸쳐 곳곳에 Null이 존재함 (23,011행 중 13,861행).
    일사합 - 오래된 연도에는 데이터가 존재하지 않음. 일조율보다 결측이 더 많음 (4,862행).
    일조율 - 오래된 연도에는 데이터가 존재하지 않음. 다만 일사합보다는 결측이 적음 (366행).

In [4]:
# Show 10 random rows to get a feel for values across the whole period.
# A fixed random_state makes the preview reproducible on Restart & Run All.
train_df.sample(n=10, random_state=42)

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
8880,1984-04-24,23.0,8.8,14.2,,50.8,2.5,10.5,18.46,77.8,15.7
3369,1969-03-23,10.1,-1.9,12.0,,51.0,2.3,10.6,,86.9,3.0
2064,1965-08-26,30.4,20.8,9.6,,79.3,1.6,6.6,,50.0,24.6
4114,1971-04-07,21.6,7.8,13.8,,40.3,2.1,9.0,,70.3,14.5
11516,1991-07-13,29.1,20.4,8.7,0.0,73.0,2.3,7.7,14.75,52.7,24.5
18571,2010-11-05,15.1,7.0,8.1,,80.1,2.0,2.2,7.15,21.0,11.2
21333,2018-05-29,25.9,19.1,6.8,1.0,69.3,1.8,3.4,13.34,23.4,21.4
19526,2013-06-17,25.8,19.9,5.9,0.5,76.8,2.2,1.1,7.97,7.4,22.3
105,1960-04-15,14.6,6.2,8.4,0.1,67.3,3.2,9.6,,,10.0
3720,1970-03-09,2.2,-3.6,5.8,4.5,76.0,3.3,7.1,,60.7,-0.8


In [5]:
# Print column dtypes and non-null counts to see which columns have missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23011 entries, 0 to 23010
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   일시      23011 non-null  object 
 1   최고기온    23008 non-null  float64
 2   최저기온    23008 non-null  float64
 3   일교차     23007 non-null  float64
 4   강수량     9150 non-null   float64
 5   평균습도    23011 non-null  float64
 6   평균풍속    23007 non-null  float64
 7   일조합     22893 non-null  float64
 8   일사합     18149 non-null  float64
 9   일조율     22645 non-null  float64
 10  평균기온    23011 non-null  float64
dtypes: float64(10), object(1)
memory usage: 1.9+ MB


In [8]:
# Number of missing values per column.
train_df.isna().sum()

일시          0
최고기온        3
최저기온        3
일교차         4
강수량     13861
평균습도        0
평균풍속        4
일조합       118
일사합      4862
일조율       366
평균기온        0
dtype: int64

### 2. 데이터 전처리

In [9]:
# Parse the date strings and promote the date column to the frame's index.
parsed_dates = pd.to_datetime(train_df['일시'])
train_df = train_df.assign(**{'일시': parsed_dates}).set_index('일시')
train_df.tail()


Unnamed: 0_level_0,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-12-27,3.3,-7.3,10.6,,69.8,1.8,8.8,10.25,91.7,-2.6
2022-12-28,0.1,-6.0,6.1,0.1,58.1,2.5,8.7,10.86,90.6,-3.3
2022-12-29,2.1,-7.8,9.9,0.0,56.3,1.7,9.0,10.88,93.8,-2.9
2022-12-30,2.3,-4.4,6.7,0.0,65.6,1.9,7.9,10.84,82.3,-1.8
2022-12-31,2.1,-5.1,7.2,0.0,65.5,1.4,1.1,4.16,11.5,-1.2


In [10]:
# Declare the time step of the data: mark the datetime index as daily ("D").
# NOTE(review): presumably needed so the time-series model fitted below can map
# forecast dates onto the index — confirm; the original author flagged this line with "?".
train_df.index.freq = "D"
train_df.tail()

Unnamed: 0_level_0,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-12-27,3.3,-7.3,10.6,,69.8,1.8,8.8,10.25,91.7,-2.6
2022-12-28,0.1,-6.0,6.1,0.1,58.1,2.5,8.7,10.86,90.6,-3.3
2022-12-29,2.1,-7.8,9.9,0.0,56.3,1.7,9.0,10.88,93.8,-2.9
2022-12-30,2.3,-4.4,6.7,0.0,65.6,1.9,7.9,10.84,82.3,-1.8
2022-12-31,2.1,-5.1,7.2,0.0,65.5,1.4,1.1,4.16,11.5,-1.2


### 3. ARIMA 모델 설정 및 학습

In [13]:
from statsmodels.tsa.arima.model import ARIMA

In [15]:
# Fit an ARIMA model on the daily mean-temperature series.
ARIMA_ORDER = (2, 1, 3)  # (p, d, q)
model = ARIMA(train_df['평균기온'], order=ARIMA_ORDER)
model_fit = model.fit()



### 4. 예측 수행

In [17]:
# Convert the submission dates to timestamps so they can bound the forecast window.
submission_df['일시'] = pd.to_datetime(submission_df['일시'])

# Forecast window: first through last date requested in the submission file.
start_date = submission_df['일시'].min()
end_date = submission_df['일시'].max()

# Predict the mean temperature over that window with the fitted ARIMA model.
# The former `type='levels'` keyword was removed: it is not a documented argument of
# this `predict` method (it looks like a misspelling of the legacy API's `typ=`).
# NOTE(review): the saved output shows the call ran with the stray keyword, so
# dropping it should not change the forecast — confirm against the installed version.
forecast = model_fit.predict(start=start_date, end=end_date)



In [18]:
# Copy the ARIMA forecast into the submission frame by position, then preview it.
arima_pred = forecast.values
submission_df = submission_df.assign(**{'평균기온': arima_pred})
display(submission_df.head())

Unnamed: 0,일시,평균기온
0,2023-01-01,-2.922955
1,2023-01-02,-4.701429
2,2023-01-03,-5.781571
3,2023-01-04,-6.478227
4,2023-01-05,-6.963622


In [20]:
# Persist the ARIMA baseline predictions as a CSV without the row index.
baseline_path = "./baseline_submit.csv"
submission_df.to_csv(baseline_path, index=False)

### prophet

In [13]:
pip install prophet

Collecting prophet
  Downloading prophet-1.1.5-py3-none-win_amd64.whl.metadata (3.6 kB)
Collecting cmdstanpy>=1.0.4 (from prophet)
  Downloading cmdstanpy-1.2.0-py3-none-any.whl.metadata (3.9 kB)
Collecting matplotlib>=2.0.0 (from prophet)
  Downloading matplotlib-3.8.2-cp311-cp311-win_amd64.whl.metadata (5.9 kB)
Collecting holidays>=0.25 (from prophet)
  Downloading holidays-0.38-py3-none-any.whl.metadata (21 kB)
Collecting tqdm>=4.36.1 (from prophet)
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB 3.2 MB/s eta 0:00:00
Collecting importlib-resources (from prophet)
  Downloading importlib_resources-6.1.1-py3-none-any.whl.metadata (4.1 kB)
Collecting stanio~=0.3.0 (from cmdstanpy>=1.0.4->prophet)
  Downloading stanio-0.3.0-py3-none-any.whl.metadata (963 bytes)
Collecting contourpy>=1.0.1 (from matplotlib>=2.0.0->prophet)
  Downloading contourpy-1

In [14]:
from prophet import Prophet

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [11]:
# Prophet looks for the timestamp in a column named `ds` and the target in `y`,
# so move the date index back into a column and rename both accordingly.
prophet_column_names = {'일시': 'ds', '평균기온': 'y'}
train_df = train_df.reset_index().rename(columns=prophet_column_names)

In [15]:
# Train a Prophet model with default settings on the renamed training frame.
prophet = Prophet()
prophet.fit(df=train_df)

17:00:34 - cmdstanpy - INFO - Chain [1] start processing
17:00:38 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x2856b13ae10>

In [16]:
# Forecast with Prophet.
# The horizon is taken from the submission file instead of a hard-coded 358, so it
# stays correct if the requested period changes. This relies on the submission
# starting the day after the training data ends, as the displayed dates show.
n_forecast_days = len(submission_df)

# `freq='D'` (daily) uses the same alias as the index frequency set on the training data.
future_data = prophet.make_future_dataframe(periods=n_forecast_days, freq='D')
forecast_data = prophet.predict(future_data)

# The last rows of the forecast frame are the future dates.
forecast_data[['ds', 'yhat']].tail()

Unnamed: 0,ds,yhat
23364,2023-12-20,0.404827
23365,2023-12-21,0.297787
23366,2023-12-22,0.16552
23367,2023-12-23,0.10042
23368,2023-12-24,-0.087561


In [17]:
# Take as many trailing forecast rows as the submission has rows, instead of a
# hard-coded 358.
n_forecast_days = len(submission_df)
prophet_pred = forecast_data[['ds', 'yhat']].tail(n_forecast_days)

# The assignment below is positional, so fail loudly if the forecast dates do not
# line up with the submission dates rather than writing misaligned values.
submission_dates = pd.to_datetime(submission_df['일시'])
assert (prophet_pred['ds'].to_numpy() == submission_dates.to_numpy()).all(), \
    "Prophet forecast dates do not match the submission dates"

submission_df['평균기온'] = prophet_pred['yhat'].to_numpy()
submission_df

Unnamed: 0,일시,평균기온
0,2023-01-01,-1.057749
1,2023-01-02,-1.159354
2,2023-01-03,-1.275385
3,2023-01-04,-1.340379
4,2023-01-05,-1.378852
...,...,...
353,2023-12-20,0.404827
354,2023-12-21,0.297787
355,2023-12-22,0.165520
356,2023-12-23,0.100420


In [18]:
# Persist the Prophet baseline predictions as a CSV without the row index.
prophet_submission_path = "prophet_baseline_submission.csv"
submission_df.to_csv(prophet_submission_path, index=False)