# Imports

In [1]:
import numpy  as np
import pandas as pd

import matplotlib.pyplot as plt

from dstoolkit.feature_engine import GroupedLagTimeFeatureCreator

## Loading Dataset

In [2]:
# Source of the CSV: https://datahub.io/core/global-temp
dataset_path = "../data/global_temperature_time_series.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Peek at the first five raw rows to see the column names and value formats.
df.head(5)

Unnamed: 0,Source,Year,Mean
0,gcag,1850-01,-0.6746
1,gcag,1850-02,-0.3334
2,gcag,1850-03,-0.5913
3,gcag,1850-04,-0.5887
4,gcag,1850-05,-0.5088


In [4]:
# Switch to lowercase, descriptive column names, then turn the 'time' strings
# into real datetimes so they can be used as a time axis.
df = (
    df
    .rename(columns={'Source': 'source', 'Year': 'time', 'Mean': 'temperature'})
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
)

## Class GroupedLagTimeFeatureCreator

In [6]:
# Shuffle all rows (frac=1 draws the whole frame without replacement).
# A fixed random_state keeps the shuffled order, and therefore the preview
# below, reproducible under Restart & Run All.
# NOTE(review): the shuffle presumably demonstrates that the feature creator
# does not need pre-sorted input; confirm against the dstoolkit documentation.
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,source,time,temperature
3320,GISTEMP,2003-05-01,0.6
249,gcag,1870-10-01,-0.4429
1789,gcag,1939-07-01,-0.0377
845,gcag,1900-03-01,-0.1997
30,gcag,1852-07-01,0.005


In [7]:
# Window sizes and aggregation names handed to the feature creator.
lag_windows = [2, 3, 4, 5, 6]
lag_functions = ['mean', 'median', 'max', 'min', 'sum', 'std', 'diff', 'slope']

s = GroupedLagTimeFeatureCreator(
    windows=lag_windows,
    functions=lag_functions,
)

# 'time' is passed as the time column, 'source' as the grouping column and
# 'temperature' as the target; the returned frame replaces df.
df = s.create(
    df,
    time='time',
    groupby_col='source',
    target='temperature',
)

In [8]:
# Distinct values of the grouping column.
df.loc[:, 'source'].unique()

array(['GISTEMP', 'gcag'], dtype=object)

In [9]:
# Last ten rows of the 'gcag' source, to inspect the generated lag columns.
is_gcag = df['source'].eq('gcag')
df[is_gcag].tail(10)

Unnamed: 0,source,time,temperature,temperature_lag_1,temperature_lag_2,temperature_lag_3,temperature_lag_4,temperature_lag_5,temperature_lag_6,temperature_mean_2_lag,...,temperature_diff_lag_2_vs_3,temperature_diff_lag_2_vs_4,temperature_diff_lag_2_vs_5,temperature_diff_lag_2_vs_6,temperature_diff_lag_3_vs_4,temperature_diff_lag_3_vs_5,temperature_diff_lag_3_vs_6,temperature_diff_lag_4_vs_5,temperature_diff_lag_4_vs_6,temperature_diff_lag_5_vs_6
3811,gcag,2023-10-01,1.2866,1.3522,1.1993,1.15,1.0518,0.8714,0.9275,1.27575,...,0.0493,0.1475,0.3279,0.2718,0.0982,0.2786,0.2225,0.1804,0.1243,-0.0561
3813,gcag,2023-11-01,1.3338,1.2866,1.3522,1.1993,1.15,1.0518,0.8714,1.3194,...,0.1529,0.2022,0.3004,0.4808,0.0493,0.1475,0.3279,0.0982,0.2786,0.1804
3815,gcag,2023-12-01,1.2586,1.3338,1.2866,1.3522,1.1993,1.15,1.0518,1.3102,...,-0.0656,0.0873,0.1366,0.2348,0.1529,0.2022,0.3004,0.0493,0.1475,0.0982
3816,gcag,2024-01-01,1.1516,1.2586,1.3338,1.2866,1.3522,1.1993,1.15,1.2962,...,0.0472,-0.0184,0.1345,0.1838,-0.0656,0.0873,0.1366,0.1529,0.2022,0.0493
3817,gcag,2024-02-01,1.2902,1.1516,1.2586,1.3338,1.2866,1.3522,1.1993,1.2051,...,-0.0752,-0.028,-0.0936,0.0593,0.0472,-0.0184,0.1345,-0.0656,0.0873,0.1529
3818,gcag,2024-03-01,1.2515,1.2902,1.1516,1.2586,1.3338,1.2866,1.3522,1.2209,...,-0.107,-0.1822,-0.135,-0.2006,-0.0752,-0.028,-0.0936,0.0472,-0.0184,-0.0656
3819,gcag,2024-04-01,1.2053,1.2515,1.2902,1.1516,1.2586,1.3338,1.2866,1.27085,...,0.1386,0.0316,-0.0436,0.0036,-0.107,-0.1822,-0.135,-0.0752,-0.028,0.0472
3820,gcag,2024-05-01,1.0745,1.2053,1.2515,1.2902,1.1516,1.2586,1.3338,1.2284,...,-0.0387,0.0999,-0.0071,-0.0823,0.1386,0.0316,-0.0436,-0.107,-0.1822,-0.0752
3821,gcag,2024-06-01,1.1154,1.0745,1.2053,1.2515,1.2902,1.1516,1.2586,1.1399,...,-0.0462,-0.0849,0.0537,-0.0533,-0.0387,0.0999,-0.0071,0.1386,0.0316,-0.107
3822,gcag,2024-07-01,1.1398,1.1154,1.0745,1.2053,1.2515,1.2902,1.1516,1.09495,...,-0.1308,-0.177,-0.2157,-0.0771,-0.0462,-0.0849,0.0537,-0.0387,0.0999,0.1386


In [11]:
# Hand check against the last gcag row shown above: its temperature_lag_1
# (1.1154) and temperature_lag_2 (1.0745) should average to the reported
# temperature_mean_2_lag (1.09495).
np.array([1.1154, 1.0745]).mean()

np.float64(1.0949499999999999)

In [10]:
# Fraction of missing values per column.
missing_share = df.isna().mean()
missing_share

source                         0.000000
time                           0.000000
temperature                    0.000000
temperature_sum_1_lag          0.000523
temperature_mean_2_lag         0.001046
                                 ...   
temperature_diff_lag_3_vs_5    0.002616
temperature_diff_lag_3_vs_6    0.003139
temperature_diff_lag_4_vs_5    0.002616
temperature_diff_lag_4_vs_6    0.003139
temperature_diff_lag_5_vs_6    0.003139
Length: 69, dtype: float64