In [12]:
import random
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA

In [5]:
# Import data
df_train = pd.read_csv('scaled_train.csv', sep=';')
df_test = pd.read_csv('scaled_test.csv', sep=';')

In [10]:
# Check for correct import
pd.set_option('display.max_colwidth', 2000)

df_train.head()

# Check for rows and columns size
# print(df_train.shape)

Unnamed: 0,YYYY,MM,DD,DOY,2m_temp_max,2m_temp_mean,2m_temp_min,2m_dp_temp_max,2m_dp_temp_mean,2m_dp_temp_min,...,surf_net_therm_rad_max,surf_net_therm_rad_mean,surf_press,total_et,prec,volsw_123,volsw_4,elev,lon,lat
0,1981,1,1,1,-1.694344,-1.590316,-1.527666,-1.637383,-1.767661,-1.858007,...,-1.267167,-1.200392,-1.49436,-1.161267,0.281353,-2.133464,-1.708602,0.094791,-0.69264,-0.91951
1,1981,1,2,2,-2.039424,-1.895327,-1.624605,-1.904712,-1.755301,-1.485195,...,-1.529381,-1.336408,-1.500933,-1.078582,1.401365,-2.133464,-1.708602,0.094791,-0.69264,-0.91951
2,1981,1,3,3,-1.538502,-1.437811,-1.28532,-1.306405,-1.310363,-1.147333,...,-1.192249,-1.563102,-1.629353,-1.078582,1.085645,-2.133464,-1.708602,0.094791,-0.69264,-0.91951
3,1981,1,4,4,-1.493976,-1.379155,-1.454962,-1.166376,-1.16205,-1.368691,...,-0.592905,-1.155054,-1.741932,-1.078582,2.424368,-2.133464,-1.708602,0.094791,-0.69264,-0.91951
4,1981,1,5,5,-2.206398,-2.129951,-1.830599,-2.09566,-2.051927,-1.834707,...,-0.855118,-1.109715,-1.662385,-0.905697,2.101592,-2.133464,-1.708602,0.094791,-0.69264,-0.91951


In [26]:
# Extract the relevant columns for time series analysis
# Selecting 'prec' column for prediction

y_train = df_train[['YYYY', 'MM', 'DD', 'prec']]
x_train = df_train.drop(['YYYY', 'MM', 'DD', 'DOY', 'prec'], axis=1)


# Convert date columns to a datetime format

y_train['Date'] = pd.to_datetime(dict(year= y_train['YYYY'], month= y_train['MM'], day= y_train['DD']))
x_train['Date'] = pd.to_datetime(dict(year= y_train['YYYY'], month= y_train['MM'], day= y_train['DD']))

y_train = y_train.set_index('Date')
x_train = x_train.set_index('Date')

y_train.index = pd.DatetimeIndex(y_train.index).to_period('D')
x_train.index = pd.DatetimeIndex(x_train.index).to_period('D')

# Drop the now redundant columns
y_train.drop(['YYYY', 'MM', 'DD'], axis=1, inplace=True)

# Check for correct dataframe
y_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['Date'] = pd.to_datetime(dict(year= y_train['YYYY'], month= y_train['MM'], day= y_train['DD']))


Unnamed: 0_level_0,prec
Date,Unnamed: 1_level_1
1981-01-01,0.281353
1981-01-02,1.401365
1981-01-03,1.085645
1981-01-04,2.424368
1981-01-05,2.101592


In [27]:
# Check for correct x dataframe
x_train.head()

Unnamed: 0_level_0,2m_temp_max,2m_temp_mean,2m_temp_min,2m_dp_temp_max,2m_dp_temp_mean,2m_dp_temp_min,10m_wind_u,10m_wind_v,fcst_alb,lai_high_veg,...,surf_net_solar_rad_mean,surf_net_therm_rad_max,surf_net_therm_rad_mean,surf_press,total_et,volsw_123,volsw_4,elev,lon,lat
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1981-01-01,-1.694344,-1.590316,-1.527666,-1.637383,-1.767661,-1.858007,0.321671,-0.522413,1.314577,-0.065803,...,-1.208113,-1.267167,-1.200392,-1.49436,-1.161267,-2.133464,-1.708602,0.094791,-0.69264,-0.91951
1981-01-02,-2.039424,-1.895327,-1.624605,-1.904712,-1.755301,-1.485195,0.392013,-0.346318,1.379199,-0.065803,...,-1.180225,-1.529381,-1.336408,-1.500933,-1.078582,-2.133464,-1.708602,0.094791,-0.69264,-0.91951
1981-01-03,-1.538502,-1.437811,-1.28532,-1.306405,-1.310363,-1.147333,0.251329,0.358061,1.379199,-0.065803,...,-1.138393,-1.192249,-1.563102,-1.629353,-1.078582,-2.133464,-1.708602,0.094791,-0.69264,-0.91951
1981-01-04,-1.493976,-1.379155,-1.454962,-1.166376,-1.16205,-1.368691,0.110645,-0.61046,1.379199,-0.065803,...,-1.333611,-0.592905,-1.155054,-1.741932,-1.078582,-2.133464,-1.708602,0.094791,-0.69264,-0.91951
1981-01-05,-2.206398,-2.129951,-1.830599,-2.09566,-2.051927,-1.834707,0.392013,-1.138745,1.379199,-0.065803,...,-1.236002,-0.855118,-1.109715,-1.662385,-0.905697,-2.133464,-1.708602,0.094791,-0.69264,-0.91951


In [28]:
# Fit the ARIMA model
# Example order (p,d,q) - adjust as needed
model = ARIMA(y_train, exog=x_train, order=(5, 1, 0))
model_fit = model.fit()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [1]:
# Forecasting example (adjust the number of periods)
# Example: forecast for the next 10 periods
#forecast = model_fit.forecast(steps=10, exog=df_test.iloc[-10:])
df_test.head()

NameError: name 'df_test' is not defined