<a href="https://colab.research.google.com/github/karunamayi0604/time-series/blob/main/Arima_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
df=pd.read_csv("/content/drive/MyDrive/arima/Netflix-Subscriptions.csv")

In [17]:
df.head()

Unnamed: 0,Time Period,Subscribers
0,01/04/2013,34240000
1,01/07/2013,35640000
2,01/10/2013,38010000
3,01/01/2014,41430000
4,01/04/2014,46130000


### Correct the format


In [25]:
df['Time Period']=pd.to_datetime(df['Time Period'],format='%d/%m/%Y')
df.set_index('Time Period',inplace=True)
df=df.sort_index()

In [26]:
df.head()

Unnamed: 0_level_0,Subscribers
Time Period,Unnamed: 1_level_1
2013-04-01,34240000
2013-07-01,35640000
2013-10-01,38010000
2014-01-01,41430000
2014-04-01,46130000


In [27]:
import plotly.express as px
import plotly.graph_objects as go
fig=go.Figure()
fig.add_trace(go.Scatter(x=df.index,y=df['Subscribers']))
fig.update_layout(title="Netflix Subscribers over time",yaxis_title="Subscribers",xaxis_title="Year")
fig.show()

In [35]:
from statsmodels.tsa.stattools import adfuller
result = adfuller(df['Subscribers'].dropna())
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')


ADF Statistic: 0.20591078338329172
p-value: 0.972591170013349


In [28]:
from statsmodels.tsa.stattools import adfuller
def check_stationarity(timeseries):
  roll_mean=timeseries.rolling(window=12).mean()
  roll_std=timeseries.rolling(window=12).std()
  fig=go.Figure()
  fig.add_trace(go.Scatter(x=timeseries.index,y=timeseries,mode='lines',name='original'))
  fig.add_trace(go.Scatter(x=roll_mean.index,y=roll_mean,mode='lines',name='rolling mean',line=dict(color='red',dash='dash')))
  fig.add_trace(go.Scatter(x=roll_std.index,y=roll_std,mode='lines',name='Rolling std',line=dict(color='green',dash='dot')))

  fig.update_layout(title="Netflix Subscribers over time",yaxis_title="Subscribers",xaxis_title="Year")
  fig.show()
check_stationarity(df['Subscribers'])



### p value is more than 0.05 so we go for differencing

In [29]:
df['Subscribers_diff']=df['Subscribers'].diff().dropna()

In [36]:
from statsmodels.tsa.stattools import adfuller
result = adfuller(df['Subscribers_diff'].dropna())
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')

ADF Statistic: -4.523088428253382
p-value: 0.00017858926729786176


In [30]:
check_stationarity(df['Subscribers_diff'])

### here p value is less than 0.05 still lets try log transformation too

In [32]:
import numpy as np

In [33]:
df['Subscribers_log'] = np.log(df['Subscribers'])
check_stationarity(df['Subscribers_log'])

but here the std is too low so we cant go with this


In [34]:
df['Subscribers_diff1'] = df['Subscribers'].diff()
check_stationarity(df['Subscribers_diff1'])

In [47]:
from statsmodels.tsa.stattools import acf,pacf
def ap(data,lags=20):
  data=data.dropna()
  acf_values=acf(data,nlags=lags)
  pacf_values=pacf(data,nlags=lags)
  fig_acf=go.Figure()
  fig_acf.add_trace(go.Bar(x=list(range(1,lags+1)),y=acf_values, name='ACF'))
  fig_acf.update_layout(title="ACF",xaxis_title="lags",yaxis_title="acf")
  fig_acf.show()
  fig_pacf=go.Figure()
  fig_pacf.add_trace(go.Bar(x=list(range(1,lags+1)), y=pacf_values, name='PACF'))
  fig_pacf.update_layout(title="PACF",xaxis_title="lags",yaxis_title="pacf")
  fig_pacf.show()
ap(df['Subscribers_diff'],lags=20)

p and q can be 1 or 3

In [51]:
from statsmodels.tsa.arima.model import ARIMA

model=ARIMA(df['Subscribers'],order=(3,1,3))
model_fit=model.fit()
print(model_fit.summary())


No frequency information was provided, so inferred frequency QS-OCT will be used.



                               SARIMAX Results                                
Dep. Variable:            Subscribers   No. Observations:                   42
Model:                 ARIMA(3, 1, 3)   Log Likelihood                -669.387
Date:                Sun, 09 Feb 2025   AIC                           1352.774
Time:                        05:59:17   BIC                           1364.769
Sample:                    04-01-2013   HQIC                          1357.141
                         - 07-01-2023                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.9403      0.158      5.950      0.000       0.631       1.250
ar.L2         -0.9336      0.246     -3.797      0.000      -1.416      -0.452
ar.L3          0.9933      0.120      8.300      0.0

In [52]:
from statsmodels.tsa.arima.model import ARIMA

# Fit ARIMA model
model = ARIMA(df['Subscribers'], order=(1,1,1))
model_fit = model.fit()

# Print model summary
print(model_fit.summary())



No frequency information was provided, so inferred frequency QS-OCT will be used.



                               SARIMAX Results                                
Dep. Variable:            Subscribers   No. Observations:                   42
Model:                 ARIMA(1, 1, 1)   Log Likelihood                -672.993
Date:                Sun, 09 Feb 2025   AIC                           1351.986
Time:                        05:59:30   BIC                           1357.127
Sample:                    04-01-2013   HQIC                          1353.858
                         - 07-01-2023                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.9997      0.012     80.765      0.000       0.975       1.024
ma.L1         -0.9908      0.221     -4.476      0.000      -1.425      -0.557
sigma2      1.187e+13   1.57e-14   7.57e+26      0.0

In [58]:
def plot_forecast(df, forecast, steps=102, freq='Q'):
    future = pd.date_range(start=df.index[-1], periods=steps+1, freq=freq)[1:]

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=df.index, y=df['Subscribers'],
        mode='lines', name='Actual',
        line=dict(color='blue')
    ))

    fig.add_trace(go.Scatter(
        x=future, y=forecast,
        mode='lines', name='Forecast',
        line=dict(color='red', dash='dash')
    ))

    fig.update_layout(
        title="Netflix Subscribers Forecast",
        xaxis_title="Time",
        yaxis_title="Subscribers",
        template="plotly_white",
        width=900,
        height=500
    )

    fig.show()

# Forecasting
forecast = model_fit.forecast(steps=12)
plot_forecast(df, forecast, steps=12, freq='Q')


'Q' is deprecated and will be removed in a future version, please use 'QE' instead.

