In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# Setup notebook
from pathlib import Path
from learntools.time_series.style import *  # plot style settings

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
data_dir = Path('../input/delhiclimate')

In [5]:
dtype = {
    'meantemp': 'float32',
    'humidity': 'float32',
    'wind_speed': 'float32',
    'meanpressure': 'float32',
}

climate_data = pd.read_csv(
    data_dir / 'DailyDelhiClimateTrain.csv',
#    index_col='date',
    parse_dates=['date'],
)
#Create a time dummy: Time-step features
#There are two kinds of features unique to time series: time-step features and lag features.
#Time-step features are features we can derive directly from the time index. 
#The most basic time-step feature is the time dummy, which counts off time steps in the series from beginning to end.

climate_data['Date_index'] = np.arange(len(climate_data.index))

#Lag features
#To make a lag feature we shift the observations of the target series so that they appear to have occured later in time.
#Here we've created a 1-step lag feature, though shifting by multiple steps is possible too.

climate_data['Lag_1'] = climate_data['meantemp'].shift(1)
climate_data = climate_data.reindex(columns=['date', 'meantemp', 'Date_index', 'humidity', 'wind_speed', 'meanpressure','Lag_1'])
climate_data = climate_data.set_index('date').to_period('D')

#climate_data = climate_data.set_index('date').to_period('D')

In [6]:
display(climate_data)

Unnamed: 0_level_0,meantemp,Date_index,humidity,wind_speed,meanpressure,Lag_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-01-01,10.000000,0,84.500000,0.000000,1015.666667,
2013-01-02,7.400000,1,92.000000,2.980000,1017.800000,10.000000
2013-01-03,7.166667,2,87.000000,4.633333,1018.666667,7.400000
2013-01-04,8.666667,3,71.333333,1.233333,1017.166667,7.166667
2013-01-05,6.000000,4,86.833333,3.700000,1016.500000,8.666667
...,...,...,...,...,...,...
2016-12-28,17.217391,1457,68.043478,3.547826,1015.565217,16.850000
2016-12-29,15.238095,1458,87.857143,6.000000,1016.904762,17.217391
2016-12-30,14.095238,1459,89.666667,6.266667,1017.904762,15.238095
2016-12-31,15.052632,1460,87.000000,7.325000,1016.100000,14.095238


In [11]:
#Create multistep dataset for Store Sales
from learntools.time_series.style import *  # plot style settings
from learntools.time_series.utils import (create_multistep_example,
                                          load_multistep_data,
                                          make_lags,
                                          make_multistep_target,
                                          plot_multistep)


# YOUR CODE HERE
y = climate_data.loc[:, 'meantemp']

# YOUR CODE HERE: Make 4 lag features
X = make_lags(y, lags=4).dropna()

# YOUR CODE HERE: Make multistep target
y = make_multistep_target(y, steps=16).dropna()

y, X = y.align(X, join='inner', axis=0)

In [14]:
display(X)

Unnamed: 0_level_0,y_lag_1,y_lag_2,y_lag_3,y_lag_4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-05,8.666667,7.166667,7.400000,10.000000
2013-01-06,6.000000,8.666667,7.166667,7.400000
2013-01-07,7.000000,6.000000,8.666667,7.166667
2013-01-08,7.000000,7.000000,6.000000,8.666667
2013-01-09,8.857143,7.000000,7.000000,6.000000
...,...,...,...,...
2016-12-13,19.909091,20.041667,16.444444,19.416667
2016-12-14,19.050000,19.909091,20.041667,16.444444
2016-12-15,18.555556,19.050000,19.909091,20.041667
2016-12-16,18.166667,18.555556,19.050000,19.909091


In [15]:
display(y)

Unnamed: 0_level_0,y_step_1,y_step_2,y_step_3,y_step_4,y_step_5,y_step_6,y_step_7,y_step_8,y_step_9,y_step_10,y_step_11,y_step_12,y_step_13,y_step_14,y_step_15,y_step_16
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-01-05,6.000000,7.000000,7.000000,8.857143,14.000000,11.000000,15.714286,14.000000,15.833333,12.833333,14.714286,13.833333,16.500000,13.833333,12.500000,11.285714
2013-01-06,7.000000,7.000000,8.857143,14.000000,11.000000,15.714286,14.000000,15.833333,12.833333,14.714286,13.833333,16.500000,13.833333,12.500000,11.285714,11.200000
2013-01-07,7.000000,8.857143,14.000000,11.000000,15.714286,14.000000,15.833333,12.833333,14.714286,13.833333,16.500000,13.833333,12.500000,11.285714,11.200000,9.500000
2013-01-08,8.857143,14.000000,11.000000,15.714286,14.000000,15.833333,12.833333,14.714286,13.833333,16.500000,13.833333,12.500000,11.285714,11.200000,9.500000,14.000000
2013-01-09,14.000000,11.000000,15.714286,14.000000,15.833333,12.833333,14.714286,13.833333,16.500000,13.833333,12.500000,11.285714,11.200000,9.500000,14.000000,13.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-13,19.050000,18.555556,18.166667,15.833333,17.500000,16.083333,17.857143,19.800000,18.050000,17.285714,15.550000,17.318182,14.000000,17.142857,16.850000,17.217391
2016-12-14,18.555556,18.166667,15.833333,17.500000,16.083333,17.857143,19.800000,18.050000,17.285714,15.550000,17.318182,14.000000,17.142857,16.850000,17.217391,15.238095
2016-12-15,18.166667,15.833333,17.500000,16.083333,17.857143,19.800000,18.050000,17.285714,15.550000,17.318182,14.000000,17.142857,16.850000,17.217391,15.238095,14.095238
2016-12-16,15.833333,17.500000,16.083333,17.857143,19.800000,18.050000,17.285714,15.550000,17.318182,14.000000,17.142857,16.850000,17.217391,15.238095,14.095238,15.052632


In [17]:
#Forecast with the DirRec strategy
from sklearn.multioutput import RegressorChain
from xgboost import XGBRegressor

model = RegressorChain(XGBRegressor())

model.fit(X, y)

y_pred = pd.DataFrame(
    model.predict(X),
    index=y.index,
    columns=y.columns,
).clip(0.0)

In [18]:
print(y_pred)

             y_step_1   y_step_2   y_step_3   y_step_4   y_step_5   y_step_6  \
date                                                                           
2013-01-05   6.333965   7.022217   7.096872   8.995094  13.956589  11.032288   
2013-01-06   7.104981   7.086579   8.822689  13.893842  11.498199  16.095760   
2013-01-07   7.089465   8.772837  13.868030  11.065819  15.496002  14.051347   
2013-01-08   8.605704  13.889863  11.126852  15.532516  14.336104  15.810493   
2013-01-09  13.946286  11.133954  15.567716  14.013792  15.851743  13.558587   
...               ...        ...        ...        ...        ...        ...   
2016-12-13  19.002363  18.601749  17.954351  16.077421  17.223200  16.352995   
2016-12-14  18.665792  18.394573  17.103827  18.005766  16.995346  18.306992   
2016-12-15  18.447393  17.307863  17.676020  16.816730  18.129059  18.606752   
2016-12-16  17.413511  17.590801  16.900627  18.585745  18.756262  18.498613   
2016-12-17  17.431412  16.109188  17.669