In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)


# 1. Load Data

In [2]:
# First look at the CSV exactly as stored (no date parsing requested here).
df = pd.read_csv("/kaggle/input/sunspots/Sunspots.csv")
display(df.head())
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()

In [3]:
from dateutil.parser import parse
# Reload only the two columns the analysis uses and let pandas convert 'Date'
# to datetime itself. The former `date_parser=` callback (a lambda that merely
# forwarded to dateutil's `parse`) is dropped: that argument is deprecated in
# pandas 2.x and the wrapper added nothing.
# NOTE(review): assumes the Date strings are in a format pandas parses natively
# (e.g. ISO YYYY-MM-DD) — confirm that df.info() below reports datetime64.
df = pd.read_csv('/kaggle/input/sunspots/Sunspots.csv',
                 usecols=['Date', 'Monthly Mean Total Sunspot Number'],
                 parse_dates=['Date'])
display(df.head())
# info() prints on its own and returns None, so it is not wrapped in print().
df.info()

# 2. Explore Dataset

In [5]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Work on a copy so the helper columns below never leak into `df`.
df_non_index = df.copy()
df_non_index['Month'] = df_non_index.Date.dt.month
# Position of each year inside its decade: the last digit of the year, with
# years ending in 0 counted as the 10th year. Computed arithmetically and
# assigned in one step, instead of slicing the year's string form and then
# calling replace(..., inplace=True) on a column selection (a chained in-place
# mutation that is not guaranteed to write back to the frame).
df_non_index['nth_year'] = (df_non_index.Date.dt.year % 10).replace(0, 10)
df_non_index.head(10)

In [6]:
# Four stacked panels: distribution per calendar year, per month, and per
# position of the year inside its decade (box plot and violin plot).
# The first panel needs the calendar year as a grouping key: grouping by the
# raw 'Date' column draws one box per distinct date value rather than one per
# year, which contradicts the panel title.
boxplot_data = df_non_index.assign(Year=df_non_index.Date.dt.year)

fig, axes = plt.subplots(4, 1, figsize=(20, 20), dpi=80)
sns.boxplot(x='Year', y='Monthly Mean Total Sunspot Number', data=boxplot_data, ax=axes[0])
sns.boxplot(x='Month', y='Monthly Mean Total Sunspot Number', data=df_non_index, ax=axes[1])
sns.boxplot(x='nth_year', y='Monthly Mean Total Sunspot Number', data=df_non_index, ax=axes[2])
sns.violinplot(x="nth_year", y="Monthly Mean Total Sunspot Number", data=df_non_index, ax=axes[3])

# Many years share one axis, so rotate and shrink the tick labels to keep them legible.
axes[0].tick_params(axis='x', labelrotation=90, labelsize=6)

# Set Title
axes[0].set_title('Year-wise Box Plot\n(The Trend)', fontsize=14)
axes[1].set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=14)
axes[2].set_title('nth_year_each_decade\n(The Seasonality)', fontsize=14)
axes[3].set_title('nth_year_each_decade ViolinPlot\n(The Seasonality)', fontsize=14)
fig.tight_layout()
plt.show()

In [7]:
import plotly.express as px

# Interactive line chart of the complete monthly series.
fig = px.line(
    data_frame=df,
    x="Date",
    y="Monthly Mean Total Sunspot Number",
    title='Monthly Mean Total Sunspot Number',
)
fig.show()

In [8]:
# Same series with range-selector buttons for the last 10/20/30/40/50 years,
# plus a button that restores the full range. The range slider stays hidden.
lookback_buttons = [
    dict(count=years, label=f"{years}y", step="year", stepmode="backward")
    for years in (10, 20, 30, 40, 50)
]
lookback_buttons.append(dict(step="all"))

fig = px.line(df_non_index, x='Date', y='Monthly Mean Total Sunspot Number', title='Mean_Sunspot_Slider')
fig.update_xaxes(
    rangeslider_visible=False,
    rangeselector=dict(buttons=lookback_buttons),
)
fig.show()

**In the graph above the pattern repeats approximately every 11 years; choose time windows so that two repetitions of the pattern line up.**

In [9]:
# Two consecutive 11-year windows: 1985-1995 and 1996-2006 (upper bounds exclusive).
obs_year = df_non_index.Date.dt.year
df_11_1985 = df_non_index[(obs_year >= 1985) & (obs_year < 1996)]
df_11_1996 = df_non_index[(obs_year >= 1996) & (obs_year < 2007)]

# Shared x axis: month counter 1..N within a window, so both windows overlay.
x = np.arange(1, len(df_11_1996['Date']) + 1)

# Legend labels name the years actually plotted (the previous labels,
# 'df_60_1998' / 'df_60_1958', did not correspond to either window).
plt.plot(x, df_11_1985['Monthly Mean Total Sunspot Number'], label='1985-1995')
plt.plot(x, df_11_1996['Monthly Mean Total Sunspot Number'], label='1996-2006')
plt.legend()
plt.xlabel('Month')
plt.ylabel('Monthly Mean Total Sunspot Number')
plt.title('Comparison of Two consecutive 11 year')
plt.show()

## Lag Plot
* As we increase the lag, the correlation decreases.
* The series is correlated with its recent lags, up to about 4–5 lags.

In [10]:
# 2x2 grid of lag plots (value at t against value at t+lag) for four lags.
# One loop replaces four copy-pasted stanzas, and each plot is handed its axes
# explicitly instead of relying on whichever axes happens to be current (the
# previous version also bound the name `ax3` to two different subplots).
fig = plt.figure(figsize=(18, 6))
fig.subplots_adjust(hspace=0.4, wspace=0.2)
for position, lag in enumerate((1, 3, 6, 24), start=1):
    lag_ax = fig.add_subplot(2, 2, position)
    pd.plotting.lag_plot(df['Monthly Mean Total Sunspot Number'], lag=lag, ax=lag_ax)
    lag_ax.set_title(f'Lag_{lag}')
plt.show()

## Distribution

In [11]:
# Side-by-side view of the value distribution: histogram (left), kernel
# density estimate (right).
sunspot_values = df['Monthly Mean Total Sunspot Number']

fig = plt.figure(figsize=(18, 6))
fig.subplots_adjust(hspace=0.4, wspace=0.2)

ax1 = fig.add_subplot(1, 2, 1)
sunspot_values.hist(ax=ax1)
ax1.set_title('Histogram')

ax2 = fig.add_subplot(1, 2, 2)
sunspot_values.plot(kind='density', ax=ax2)  # kernel density plot
ax2.set_title('KDE')

plt.show()

# 3. Checking Stationarity of Time Series Data
Stationarity is defined using very strict criterion. However, for practical purposes we can assume the series to be stationary if it has constant statistical properties over time, ie. the following:

1. constant mean (For different time slots)
2. constant variance (For different time slots)
3. (Rolling mean/variance should be checked and should be constant)
4. an autocovariance that does not depend on time
5. Two test for stationarity: ADF & KPSS test

### ADF test
* Null Hypothesis - Series is not stationary
* Alternate Hypothesis - Series is stationary

In [12]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test; the lag length is chosen by AIC.
data_series = df['Monthly Mean Total Sunspot Number']
print('Results of Dickey-Fuller Test:')
dftest = adfuller(data_series, autolag='AIC')
# The first four entries of the result are labelled here; entry 4 is the
# mapping of critical values, appended one row per significance level.
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
for key, value in dftest[4].items():
    dfoutput['Critical Value (%s)' % key] = value
print(dfoutput)

# Decision 1: compare the statistic with the 5% critical value.
if dfoutput['Test Statistic'] < dfoutput['Critical Value (5%)']:
    print('Series is stationary')
else:
    print('Series is not Stationary')

# Decision 2 (equivalent): compare the p-value with 0.05.
# Looked up by label: `dfoutput[1]` relied on positional fallback for a
# string-labelled Series, which pandas deprecates and later turns into a KeyError.
if dfoutput['p-value'] > 0.05:
    print('Series is not Stationary')
else:
    print('Series is Stationary')

### KPSS test
* Null hypothesis - Series is stationary
* Alternate hypothesis - Series is not stationary

In [13]:
from statsmodels.tsa.stattools import kpss
# KPSS test with a constant-only regression ('c') and the legacy lag rule.
stats, p, lags, critical_values = kpss(
    df['Monthly Mean Total Sunspot Number'],
    regression='c',
    nlags='legacy',
)

print(f'Test Statistics: {stats}')
print(f'p-value: {p}')
print(f'Critial Values: {critical_values}')

# The KPSS null hypothesis is stationarity, so a small p-value rejects it.
print('Series is not Stationary' if p < 0.05 else 'Series is Stationary')

**Note: For Non-Stationary data: First make it stationary**

* Differencing, taking the log and then differencing, and decomposing into components and detrending are a few of the techniques used.
* Different Moving Averages
* Exponential Smoothing
* ARIMA
* SARIMA 

### Some Techniques to deal with non-stationary dataset
#### Moving Average

In [14]:
# Only the first 200 observations are plotted so the smoothing is visible.
first_200 = df['Monthly Mean Total Sunspot Number'][:200]
first_200.plot()
# Rolling average over a window of 3 time steps.
first_200.rolling(3).mean().plot(label='rolling mean')
# (A rolling standard deviation can be overlaid the same way with
#  .rolling(3).std(); a one-step-lagged mean with .rolling(12).mean().shift(1).)
plt.legend()
# Only the mean is drawn, so the title no longer advertises a standard deviation.
plt.title('Rolling Mean (window = 3)')
plt.show()

#### Weighted moving average
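**Weighted moving average:**

$$\mathrm{WMA}_t = \frac{w_1\,x_t + w_2\,x_{t-1} + \dots + w_n\,x_{t-n+1}}{n}$$

where $w_1, \dots, w_n$ are the weighting factors and $n$ is the window length (when the weights are chosen so that they sum to $n$, as they are below, this is the usual weight-normalised average).

This is similar to the rolling average, except that each value is multiplied by a weighting factor so that more weight is given to recent data.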


In [15]:
def wma(weights):
    """Build a weighted-average function for use with ``rolling(...).apply``.

    Parameters
    ----------
    weights : array-like of numbers
        One weight per element of the window, in window order (the last
        weight multiplies the last element of the window).

    Returns
    -------
    callable
        ``calc(x)`` returning ``sum(weights * x) / sum(weights)``.

    Raises
    ------
    ValueError
        If the weights sum to zero, which makes the average undefined.

    Notes
    -----
    The result is normalised by the sum of the weights. Dividing by the
    window length (a plain ``.mean()`` of the weighted values) is only
    correct when the weights happen to sum to the window size, as
    ``[0.5, 1, 1.5]`` does for a window of 3.
    """
    weights = np.asarray(weights, dtype=float)
    total_weight = weights.sum()
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    def calc(x):
        return (weights * x).sum() / total_weight
    return calc

# First 200 observations only, overlaid with a 3-step weighted moving average.
first_200 = df['Monthly Mean Total Sunspot Number'][:200]
# Three weights, one per step of the 3-step window.
window_weights = np.array([0.5, 1, 1.5])

first_200.plot()
weighted_avg = first_200.rolling(3).apply(wma(window_weights))
weighted_avg.plot(label='weighted mooving_averate')
plt.legend()
plt.show()

#### Exponential moving average / Exponential smoothing

In [16]:
# First 200 observations only, overlaid with an exponentially weighted mean.
first_200 = df['Monthly Mean Total Sunspot Number'][:200]
first_200.plot()

# span=3 is given, so pandas derives the smoothing factor as alpha = 2 / (span + 1).
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html
exp_weighted = first_200.ewm(span=3, adjust=False, min_periods=3).mean()
exp_weighted.plot(label='Exponential Weighted Average')

plt.title('Exponential Weighted M.A.')
plt.legend()
plt.show()

In [17]:
# Same idea with the smoothing factor given directly (alpha=0.7) instead of a span.
first_200 = df['Monthly Mean Total Sunspot Number'][:200]
first_200.plot()
first_200.ewm(alpha=0.7, adjust=False, min_periods=3).mean().plot(label='Exponential Smooting M A')
# The label above is only rendered once a legend is drawn; the cell previously
# had neither a legend nor a title, so the two lines could not be told apart.
plt.title('Exponential Smoothing (alpha = 0.7)')
plt.legend()
plt.show()

#### Comparing all smoothing methods

In [26]:
# Collect the four smoothers next to the raw values for the first 200 rows.
first_200 = df['Monthly Mean Total Sunspot Number'][:200]
window_weights = np.array([0.5, 1, 1.5])

# Column order matters downstream: the smoothers are appended after the
# original columns in exactly this order.
# E_W_A is computed with min_periods=0, so unlike the other three columns it
# already has values in the first two rows.
df_with_diff_avg = df[:200].copy().assign(**{
    'Rolling mean': first_200.rolling(3).mean(),
    'W_M_A': first_200.rolling(window=3).apply(wma(window_weights)),
    'E_W_A': first_200.ewm(span=3, adjust=False, min_periods=0).mean(),
    'E_S_M_A': first_200.ewm(alpha=0.7, adjust=False, min_periods=3).mean(),
})
print(df_with_diff_avg.head())
df_with_diff_avg.plot(x='Date')
plt.show()

#### Making a function to compare the RMSE of all the smoothing methods above

In [30]:
# Show the first three rows to check the column layout; RMSE_CAL addresses
# the columns of this frame by position.
df_with_diff_avg.head(3)

In [31]:
def RMSE_CAL(df):
    """Root-mean-square error of each smoother against the observed series.

    Columns are addressed by position: column 1 holds the observed values and
    columns 2-5 hold, in order, the rolling mean, the weighted moving average,
    the exponentially weighted average (span) and the exponential smoothing
    (alpha) series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least six columns laid out as described above.

    Returns
    -------
    tuple
        Flat tuple alternating label and value:
        ``("Rolling_Mean_RMSE", v1, "W_M_A_RMSE", v2, "E_W_A_RMSE", v3,
        "E_S_M_A_RMSE", v4)``.

    Notes
    -----
    The squared errors are averaged before taking the square root. Summing
    them instead yields the root of the *total* squared error, which grows
    with the number of rows and is not an RMSE.
    """
    observed = df.iloc[:, 1]
    labelled_columns = (
        ("Rolling_Mean_RMSE", 2),
        ("W_M_A_RMSE", 3),
        ("E_W_A_RMSE", 4),
        ("E_S_M_A_RMSE", 5),
    )
    report = []
    for label, position in labelled_columns:
        squared_error = (observed - df.iloc[:, position]) ** 2
        report.extend((label, np.sqrt(np.mean(squared_error))))
    return tuple(report)
# Discard every row in which any column is missing, then score the smoothers
# on the remaining rows.
df_with_diff_avg = df_with_diff_avg.dropna()
RMSE_CAL(df_with_diff_avg)

# 4. Decomposing a Time_Series Data

In [32]:
# Additive decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
# The pattern repeats roughly every 11 years and the data is monthly, so the
# seasonal period is 11 * 12 observations.
# `period=` is the current name of this argument; the older `freq=` spelling
# was removed from seasonal_decompose and now raises a TypeError.
result = seasonal_decompose(df['Monthly Mean Total Sunspot Number'], model="additive", period=11 * 12)
result.plot()
plt.show()

**Detrended Data**

In [33]:
# Observed series minus the trend component of the decomposition.
detrended = result.observed - result.trend
pd.DataFrame(detrended).plot()
plt.show()

# 5. Autocorrelation Plot

In [34]:
# Autocorrelation of the series at every lag (one lag = one monthly observation).
monthly_sunspots = df['Monthly Mean Total Sunspot Number']
pd.plotting.autocorrelation_plot(series=monthly_sunspots)
plt.show()

## ACF and PACF plots:

In [41]:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

# ACF and PACF for lags 1..39 (lag 0 is left out) with 95% confidence bands.
lag_window = range(1, 40)
monthly_sunspots = df['Monthly Mean Total Sunspot Number']
fig = plot_acf(monthly_sunspots, lags=lag_window, alpha=0.05, title='ACF')
fig = plot_pacf(monthly_sunspots, lags=lag_window, alpha=0.05, title='PACF')



With there being more than 20 lags with a positive correlation above 0.5 there’s a good chance this time series will need a significant level of differencing to achieve stationarity.

In [43]:
# Draw the ACF for the first 30 lags on a figure of its own.
# The earlier version passed `ax=axes[0]`, where `axes` was whatever an
# earlier plotting cell had left behind (the box-plot grid), so the ACF was
# drawn on top of an unrelated, already-rendered figure.
acf_fig, acf_ax = plt.subplots(figsize=(16, 3), dpi=100)
plot_acf(df['Monthly Mean Total Sunspot Number'].tolist(), lags=30, ax=acf_ax)
plt.show()


In [44]:
# Draw the PACF for the first 4 lags on a figure of its own, rather than on
# `axes[1]` left over from an earlier, unrelated plotting cell.
pacf_fig, pacf_ax = plt.subplots(figsize=(16, 3), dpi=100)
plot_pacf(df['Monthly Mean Total Sunspot Number'].tolist(), lags=4, ax=pacf_ax)
plt.show()

## Auto Arima

In [45]:
!pip install pmdarima

In [46]:
import pmdarima as pm
from pmdarima.model_selection import train_test_split

# Stepwise search for a seasonal ARIMA order on the full series.
# NOTE(review): `m` is the number of observations per seasonal cycle. The data
# is monthly and the decomposition cell uses an 11*12-observation period, so
# m=11 describes an 11-*month* season — confirm this is intended (a 132-step
# season may be impractically slow to search).
search_options = dict(
    m=11,
    seasonal=True,
    start_p=0,
    start_q=0,
    max_order=4,
    test='adf',
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,
    trace=True,
)
model = pm.auto_arima(df['Monthly Mean Total Sunspot Number'], **search_options)

In [47]:
# Display the summary of the model selected by the auto_arima search.
model.summary()

In [48]:
# Chronological split: everything before 1958 is training data, the rest is
# held out; `test1` is the first ten held-out years (1958-1967) used for the
# forecast horizon.
# `df` is not modified here. The earlier `df.reset_index(inplace=True)` only
# added a redundant 'index' column, shifted column positions for any re-run of
# earlier cells, and behaved differently each time this cell was re-executed.
obs_year = df.Date.dt.year
train = df[obs_year < 1958]
test = df[obs_year >= 1958]
test1 = df[(obs_year >= 1958) & (obs_year < 1968)]
n = len(test1)  # number of monthly steps to forecast

In [49]:
# Refit the selected model on the training years only, then forecast the
# n held-out months.
model.fit(train['Monthly Mean Total Sunspot Number'])
forecast = model.predict(n_periods=n, return_conf_int=True)
# With return_conf_int=True the result is a pair: point forecasts first,
# confidence intervals second (the intervals are not plotted here).
point_forecast, conf_int = forecast
# np.asarray strips any index the forecast may carry, so the values are placed
# on test1's index by position instead of being re-aligned by label (label
# alignment would silently yield NaN if the two indexes differed).
forecast_df = pd.DataFrame({'Prediction': np.asarray(point_forecast)}, index=test1.index)
pd.concat([df['Monthly Mean Total Sunspot Number'], forecast_df], axis=1).plot()
plt.title('Observed series and ARIMA forecast')
plt.show()