In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib qt5

In [2]:
"""index stuff"""

df = pd.read_csv("./data/daily-min-temperatures.csv")
df.set_index(pd.DatetimeIndex(df['Date']), inplace=True)
df.head()

Unnamed: 0_level_0,Date,Temp
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1981-01-01,1981-01-01,20.7
1981-01-02,1981-01-02,17.9
1981-01-03,1981-01-03,18.8
1981-01-04,1981-01-04,14.6
1981-01-05,1981-01-05,15.8


In [3]:
"""filling nans"""

from scipy.interpolate import interp1d

df = df.resample('D').mean()
df_nona = df.dropna(subset=['Temp'])
choice = 6

if choice == 1:
    df = df.ffill()
elif choice == 2:
    df = df.bfill()
elif choice == 3:
    f = interp1d(pd.to_numeric(df_nona.index), df_nona['Temp'])
    df['Temp'] = f(pd.to_numeric(df.index))
elif choice == 4:
    f = interp1d(pd.to_numeric(df_nona.index), df_nona['Temp'], kind='cubic')
    df['Temp'] = f(pd.to_numeric(df.index))
elif choice == 5:
    def knn_mean(ts, n):
        out = np.copy(ts)
        for i, val in enumerate(ts):
            if np.isnan(val):
                n_by_2 = np.ceil(n/2)
                lower = np.max([0, int(i-n_by_2)])
                upper = np.min([len(ts)+1, int(i+n_by_2)])
                ts_near = np.concatenate([ts[lower:i], ts[i:upper]])
                out[i] = np.nanmean(ts_near)
        return out
    df['Temp'] = knn_mean(df['Temp'].values, 8)
elif choice == 6:
    def seasonal_mean(ts, n, lr=0.7):
        out = np.copy(ts)
        for i, val in enumerate(ts):
            if np.isnan(val):
                ts_seas = ts[i-1::-n]  # previous seasons only
                if np.isnan(np.nanmean(ts_seas)):
                    ts_seas = np.concatenate([ts[i-1::-n], ts[i::n]])  # previous and forward
                out[i] = np.nanmean(ts_seas) * lr
        return out
    df['Temp'] = seasonal_mean(df['Temp'].values, n=365, lr=1.25)
    
print(df.isna().sum())

Temp    0
dtype: int64


In [4]:
"""fft"""

from scipy.fft import fft

yf = fft(df['Temp'].to_numpy())
fig, fax = plt.subplots()
fax.plot(np.abs(yf))

[<matplotlib.lines.Line2D at 0x7f006cf4f2e0>]

In [5]:
"""resampling"""

fig, fax = plt.subplots()
fax.plot(df['Temp'], marker='.', ms=0.1)
fax.plot(df['Temp'].resample('M').mean())
fax.plot(df['Temp'].resample('Y').mean())

[<matplotlib.lines.Line2D at 0x7f006c5ed070>]

In [6]:
"""rolling"""

fig, fax = plt.subplots()
fax.plot(df['Temp'].rolling(window='30D').mean())
fax.plot(df['Temp'].rolling(window='30D').std())

[<matplotlib.lines.Line2D at 0x7f006c428430>]

In [7]:
"""getting static manually"""

y = df['Temp']
y_mean_30d = y.rolling(window='30D').mean()
y_std_30d = y.rolling(window='30D').std()
y_stat = (y - y_mean_30d) / y_std_30d
y_stat = y_stat.bfill()
fig, fax = plt.subplots()
fax.plot(y_stat)

[<matplotlib.lines.Line2D at 0x7f006c3f9550>]

In [14]:
"""getting static with seasonal_decompose"""

from statsmodels.tsa.seasonal import seasonal_decompose

dec = seasonal_decompose(y, model='additive', extrapolate_trend='freq', period=365)
fig, faxes = plt.subplots(3, 1)
faxes[0].plot(dec.trend)
faxes[1].plot(dec.seasonal)
faxes[2].plot(dec.resid)
plt.tight_layout()

y_stat = y - dec.trend - dec.seasonal
#fig, fax = plt.subplots()
#fax.plot(y_stat)

In [9]:
"""checking how static it is"""

from statsmodels.tsa.stattools import adfuller, kpss

result = adfuller(y_stat, autolag='AIC')
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')

result = kpss(y_stat, regression='c')
print('KPSS Statistic: %f' % result[0])
print('p-value: %f' % result[1])

ADF Statistic: -18.84912460303161
p-value: 0.0
KPSS Statistic: 0.020665
p-value: 0.100000


look-up table. The actual p-value is greater than the p-value returned.



In [20]:
"""auto-correlation"""

from statsmodels.tsa.stattools import acf, pacf

acf_50 = acf(y_stat, nlags=50)
pacf_50 = pacf(y_stat, nlags=50)

fig, faxes = plt.subplots(2, 1)
faxes[0].plot(acf_50)
faxes[1].plot(pacf_50)

[<matplotlib.lines.Line2D at 0x7f005c746f10>]

In [30]:
"""smoothening"""

from statsmodels.nonparametric.smoothers_lowess import lowess

fig, faxes = plt.subplots(3, 1)

y_smooth = y_stat.rolling(5, center=True, closed='both').mean()
faxes[0].plot(y_stat, label='stat')
faxes[0].plot(y_smooth, label='smooth')
faxes[0].legend()

y_smooth = pd.DataFrame(lowess(y_stat, np.arange(len(y_stat)), frac=0.05)[:, 1],
                        index=y_stat.index, columns=['Temp'])
faxes[1].plot(y_stat, label='stat')
faxes[1].plot(y_smooth, label='smooth')

y_smooth = pd.DataFrame(lowess(y_stat, np.arange(len(y_stat)), frac=0.15)[:, 1],
                        index=y_stat.index, columns=['Temp'])
faxes[2].plot(y_stat, label='stat')
faxes[2].plot(y_smooth, label='smooth')

[<matplotlib.lines.Line2D at 0x7f0057dd25e0>]

In [20]:
from statsmodels.tsa.api import SimpleExpSmoothing

y_to_train = y[:-300]

fit = SimpleExpSmoothing(y_to_train).fit()
fcast = fit.forecast(50)
fig, fax = plt.subplots()
fax.plot(y_to_train[-300:-250].to_numpy())
fax.plot(fcast.to_numpy())



[<matplotlib.lines.Line2D at 0x7f980464db50>]

In [21]:
from statsmodels.tsa.api import Holt

y_to_train = y[:-300]

fit = Holt(y_to_train).fit()
fcast = fit.forecast(50)
fig, fax = plt.subplots()
fax.plot(y_to_train[-300:-250].to_numpy())
fax.plot(fcast.to_numpy())



[<matplotlib.lines.Line2D at 0x7f980432fe80>]

In [28]:
from statsmodels.tsa.api import ExponentialSmoothing

y_to_train = y[:-300]

fit = ExponentialSmoothing(y_to_train, seasonal_periods=12, trend='add', seasonal='add').fit()
fcast = fit.forecast(50)
fig, fax = plt.subplots()
fax.plot(y.to_numpy())
fax.plot(fcast)



[<matplotlib.lines.Line2D at 0x7f9803b505b0>]