In [1]:
import yfinance as yf
import plotly.graph_objects as go

In [2]:
# Download daily price history for a single ticker from Yahoo Finance.
ticker_symbol = 'AAPL'
start_date = '2021-01-01'
end_date = '2023-01-01'
# Pass auto_adjust explicitly: recent yfinance releases changed its default to
# True and warn when it is left implicit (presumably the warning captured in
# this cell's output -- confirm against the installed yfinance version).
data = yf.download(ticker_symbol, start=start_date, end=end_date, auto_adjust=True)

  data = yf.download(ticker_symbol, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed


In [3]:
# Preview the downloaded frame; its columns are a (Price, Ticker) MultiIndex.
data

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2021-01-04,125.974480,130.062977,123.394829,129.975370,143301900
2021-01-05,127.531952,128.242583,125.020444,125.468238,97664900
2021-01-06,123.239059,127.570927,123.024899,124.329329,155088000
2021-01-07,127.444374,128.135532,124.465612,124.952339,109578200
2021-01-08,128.544388,129.108992,126.772697,128.914290,105158200
...,...,...,...,...,...
2022-12-23,129.900269,130.451943,127.713261,128.974237,63814900
2022-12-27,128.097504,129.456999,126.806975,129.427446,69007800
2022-12-28,124.166794,129.082630,123.999322,127.742842,85438400
2022-12-29,127.683723,128.540789,125.831667,126.087797,75703700


In [4]:
# Plot the closing price over time.
# The downloaded frame has MultiIndex columns (Price, Ticker), so data['Close']
# is a one-column DataFrame rather than a Series. Squeeze it to 1-D so Plotly
# receives a flat sequence of y values (squeeze is a no-op on a Series).
close_prices = data['Close'].squeeze()

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=close_prices.index, y=close_prices, mode='lines', name='Close Price')
)
fig.update_layout(
    title=f'{ticker_symbol} Stock Prices from Yahoo Finance',
    xaxis_title='Date',
    yaxis_title='Price',
    template='simple_white',
    width=900,
    height=500,
)
fig.show()

# Stationary Time Series

A stationary time series is a time series whose statistical properties do not change over time.

A time series is stationary if its:

Mean is constant

Variance is constant

Autocovariance (relationship between points at different lags) is constant over time


A time series $X_t$ is strictly stationary if the joint probability distribution of

$$(X_{t_1}, X_{t_2}, \ldots, X_{t_k})$$

is the same as the distribution of

$$(X_{t_1+\tau}, X_{t_2+\tau}, \ldots, X_{t_k+\tau})$$

for any time shift $\tau$.
i.e.
if you slide the time window, the statistical behavior stays the same.

In [None]:
import plotly.express as px
import pandas as pd
import numpy as np 
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from scipy.stats import boxcox


### Time Series Components
Time series typically have: **Trend** (long-term direction), **Seasonality** (repeating patterns), and **Residuals** (random fluctuations).


### Autocorrelation Function (ACF)
ACF measures the correlation between a series and lagged copies of itself. For white noise, the ACF shows no significant correlation at any non-zero lag (the value at lag 0 is always 1).


In [None]:
# Plot the ACF of a white-noise series.
# The series is generated in this cell because `stationary_data` is only created
# in a later cell; referencing it here raises NameError on a fresh kernel.
# A local RandomState(42) yields the same draws as np.random.seed(42) followed by
# np.random.normal(...), without touching NumPy's global random state.
white_noise = np.random.RandomState(42).normal(loc=0, scale=1, size=200)

# Explicit figure/axes instead of the implicit pyplot "current axes".
fig_acf, ax_acf = plt.subplots(figsize=(10, 4))
plot_acf(white_noise, lags=40, ax=ax_acf, title='ACF of White Noise')
plt.show()


### Partial Autocorrelation Function (PACF)
PACF measures direct correlation between a series and its lagged versions, after removing intermediate effects. For white noise, PACF also shows no significant correlation.


In [None]:
# Plot the PACF of a white-noise series.
# The series is generated in this cell because `stationary_data` is only created
# in a later cell; referencing it here raises NameError on a fresh kernel.
# A local RandomState(42) yields the same draws as np.random.seed(42) followed by
# np.random.normal(...), without touching NumPy's global random state.
white_noise = np.random.RandomState(42).normal(loc=0, scale=1, size=200)

# Explicit figure/axes instead of the implicit pyplot "current axes".
fig_pacf, ax_pacf = plt.subplots(figsize=(10, 4))
plot_pacf(white_noise, lags=40, ax=ax_pacf, title='PACF of White Noise')
plt.show()


### Example 1: White Noise (Stationary Time Series)
White noise is a classic example of a stationary time series. It has a constant mean (usually zero), constant variance, and no autocorrelation (meaning past values don't predict future values).


In [6]:
# White noise: 200 independent draws from N(0, 1), seeded for reproducibility.
np.random.seed(42)
stationary_data = np.random.normal(loc=0, scale=1, size=200)

# Pair every draw with a calendar day so the series can be plotted against time.
stationary_dates = pd.date_range(start='2021-01-01', periods=len(stationary_data), freq='D')
stationary_df = pd.DataFrame({'Value': stationary_data, 'Time': stationary_dates})

fig_stationary = px.line(
    stationary_df,
    x='Time',
    y='Value',
    title='Stationary Time Series (White Noise)',
)
fig_stationary.show()

# Sample moments of the whole series (population variance, ddof=0).
print(f"Mean of stationary series: {stationary_data.mean():.2f}")
print(f"Variance of stationary series: {stationary_data.var():.2f}")


Mean of stationary series: -0.04
Variance of stationary series: 0.86


### Example 2: Random Walk with Drift (Non-Stationary Time Series)
A random walk with a drift is a common example of a non-stationary time series. It tends to wander without returning to a mean, and its variance increases over time. The 'drift' adds a constant upward or downward trend.


In [7]:
# Random walk with drift: the running sum of N(0.1, 1) steps.
np.random.seed(42)
drift_steps = np.random.normal(loc=0.1, scale=1, size=200)
non_stationary_data = np.cumsum(drift_steps)

non_stationary_dates = pd.date_range(start='2021-01-01', periods=len(non_stationary_data), freq='D')
non_stationary_df = pd.DataFrame({'Value': non_stationary_data, 'Time': non_stationary_dates})

fig_non_stationary = px.line(
    non_stationary_df,
    x='Time',
    y='Value',
    title='Non-Stationary Time Series (Random Walk with Drift)',
)
fig_non_stationary.show()

# Compare the moments of the two halves: for a stationary series they would match.
first_half = non_stationary_data[:100]
second_half = non_stationary_data[100:]
print(f"Mean of non-stationary series (first half): {first_half.mean():.2f}")
print(f"Mean of non-stationary series (second half): {second_half.mean():.2f}")
print(f"Variance of non-stationary series (first half): {first_half.var():.2f}")
print(f"Variance of non-stationary series (second half): {second_half.var():.2f}")


Mean of non-stationary series (first half): -1.36
Mean of non-stationary series (second half): 5.62
Variance of non-stationary series (first half): 8.34
Variance of non-stationary series (second half): 25.51


### Augmented Dickey-Fuller (ADF) Test
The Augmented Dickey-Fuller (ADF) test is a statistical test used to determine if a time series is stationary. The null hypothesis (H0) of the ADF test is that the time series is non-stationary (it has a unit root).

- **If the p-value is less than or equal to the significance level (e.g., 0.05)**, we reject the null hypothesis and conclude that the time series is stationary.
- **If the p-value is greater than the significance level**, we fail to reject the null hypothesis, meaning the time series is likely non-stationary.


### Seasonality in Time Series
Seasonality refers to predictable and recurring patterns or cycles in a time series that occur at regular intervals. These patterns are often related to calendar cycles (e.g., time of day, week, month, year). Key characteristics include:

*   **Fixed Frequency**: The length of the seasonal cycle is constant.
*   **Predictability**: The patterns repeat predictably over time.
*   **Impact on Mean/Variance**: Seasonality often causes the mean and/or variance of the series to change across different periods, leading to non-stationarity. For example, retail sales often spike during holidays and dip afterward.


In [None]:
# Simple seasonal example: five years of monthly data with a yearly cycle.
np.random.seed(42)
dates = pd.date_range(start='2020-01-01', periods=60, freq='MS')  # month-start frequency

# Drive the sine with the month counter so it repeats exactly every 12
# observations. np.linspace(0, 2*pi*5, 60) includes its endpoint, which fits
# 5 cycles into 59 steps and yields a period of 11.8 months instead of 12.
months_elapsed = np.arange(len(dates))
seasonal_values = (
    50
    + 15 * np.sin(2 * np.pi * months_elapsed / 12)
    + np.random.normal(0, 2, 60)
)
simple_seasonal_df = pd.DataFrame({'Value': seasonal_values, 'Date': dates})

fig_simple_seasonal = px.line(simple_seasonal_df, x='Date', y='Value', title='Simple Seasonal Time Series (Monthly)')
fig_simple_seasonal.show()

# ADF test on the seasonal series: H0 is "the series has a unit root".
result_simple_seasonal = adfuller(simple_seasonal_df['Value'])
print('\nADF Test for Simple Seasonal Series:')
print(f'ADF Statistic: {result_simple_seasonal[0]:.2f}')
print(f'P-value: {result_simple_seasonal[1]:.3f}')
if result_simple_seasonal[1] <= 0.05:
    print("Conclusion: The series is stationary (reject H0).")
else:
    print("Conclusion: The series is non-stationary (fail to reject H0).")


In [8]:
# ADF unit-root test on the white-noise series.
result_stationary = adfuller(stationary_data)
adf_stat_stationary, p_value_stationary = result_stationary[0], result_stationary[1]

print('ADF Test for Stationary Series:')
print(f'ADF Statistic: {adf_stat_stationary:.2f}')
print(f'P-value: {p_value_stationary:.3f}')
print('Critical Values:')
# Element 4 of the result maps each significance level to its critical value.
for level, threshold in result_stationary[4].items():
    print(f'   {level}: {threshold:.2f}')

# Reject H0 (unit root) when the p-value is at most 0.05.
print(
    "Conclusion: The time series is stationary (reject H0)."
    if p_value_stationary <= 0.05
    else "Conclusion: The time series is non-stationary (fail to reject H0)."
)


ADF Test for Stationary Series:
ADF Statistic: -14.74
P-value: 0.000
Critical Values:
   1%: -3.46
   5%: -2.88
   10%: -2.57
Conclusion: The time series is stationary (reject H0).


In [9]:
# ADF unit-root test on the random walk with drift.
result_non_stationary = adfuller(non_stationary_data)
adf_stat_non_stationary, p_value_non_stationary = result_non_stationary[0], result_non_stationary[1]

print('\nADF Test for Non-Stationary Series:')
print(f'ADF Statistic: {adf_stat_non_stationary:.2f}')
print(f'P-value: {p_value_non_stationary:.3f}')
print('Critical Values:')
# Element 4 of the result maps each significance level to its critical value.
for level, threshold in result_non_stationary[4].items():
    print(f'   {level}: {threshold:.2f}')

# Reject H0 (unit root) when the p-value is at most 0.05.
print(
    "Conclusion: The time series is stationary (reject H0)."
    if p_value_non_stationary <= 0.05
    else "Conclusion: The time series is non-stationary (fail to reject H0)."
)



ADF Test for Non-Stationary Series:
ADF Statistic: -0.56
P-value: 0.879
Critical Values:
   1%: -3.46
   5%: -2.88
   10%: -2.57
Conclusion: The time series is non-stationary (fail to reject H0).


### Example 3: Time Series with a Deterministic Trend (Non-Stationary)
A deterministic trend means that the series has a predictable, constant increase or decrease over time. Such a series is non-stationary because its mean is constantly changing.


In [10]:
# Deterministic linear trend (slope 0.5 per step) plus N(0, 5) noise.
np.random.seed(42)
time = np.arange(200)
trend_noise = np.random.normal(loc=0, scale=5, size=200)
trend_data = 0.5 * time + trend_noise

trend_dates = pd.date_range(start='2021-01-01', periods=len(trend_data), freq='D')
trend_df = pd.DataFrame({'Value': trend_data, 'Time': trend_dates})

fig_trend = px.line(
    trend_df,
    x='Time',
    y='Value',
    title='Time Series with Deterministic Trend',
)
fig_trend.show()

# ADF unit-root test on the trended series.
result_trend = adfuller(trend_data)
adf_stat_trend, p_value_trend = result_trend[0], result_trend[1]
print('\nADF Test for Trend Series:')
print(f'ADF Statistic: {adf_stat_trend:.2f}')
print(f'P-value: {p_value_trend:.3f}')
# Reject H0 (unit root) when the p-value is at most 0.05.
print(
    "Conclusion: The time series is stationary (reject H0)."
    if p_value_trend <= 0.05
    else "Conclusion: The time series is non-stationary (fail to reject H0)."
)



ADF Test for Trend Series:
ADF Statistic: -0.07
P-value: 0.952
Conclusion: The time series is non-stationary (fail to reject H0).


### Example 4: Time Series with Seasonality (Non-Stationary)
Seasonality refers to periodic fluctuations in a time series that repeat over a fixed interval. This also makes a series non-stationary as the statistical properties change with the season.


### Box-Cox Transformation
The Box-Cox transformation is a powerful statistical technique used to transform non-normally distributed data into a more Gaussian-like distribution. In time series, it's often applied to stabilize variance and make the series more amenable to models that assume constant variance. It works only for positive data.


In [None]:
# Box-Cox transform of the air passengers series.
# Load the CSV in this cell rather than relying on `data`: at this point in the
# notebook `data` still holds the yfinance price frame (the air passengers file
# is only read in a later cell), so data['#Passengers'] fails on a
# top-to-bottom run.
air_passengers = pd.read_csv('air_passengers.csv')

# Coerce the counts to numeric without mutating the loaded frame.
# Box-Cox is only defined for strictly positive input.
passenger_counts = pd.to_numeric(air_passengers['#Passengers'])

# With no lambda supplied, boxcox returns the transformed values together
# with the lambda it fitted.
transformed_data, lambda_val = boxcox(passenger_counts)

# Create a DataFrame for plotting the transformed data
boxcox_df = pd.DataFrame({'Value': transformed_data, 'Time': air_passengers['Month']})

fig_boxcox = px.line(boxcox_df, x='Time', y='Value', title=f'Air Passengers Data after Box-Cox Transformation (lambda={lambda_val:.2f})')
fig_boxcox.show()

# ADF test on the transformed data: H0 is "the series has a unit root".
result_boxcox = adfuller(transformed_data)
print('\nADF Test for Box-Cox Transformed Series:')
print(f'ADF Statistic: {result_boxcox[0]:.2f}')
print(f'P-value: {result_boxcox[1]:.3f}')
if result_boxcox[1] <= 0.05:
    print("Conclusion: The transformed series is stationary (reject H0).")
else:
    print("Conclusion: The transformed series is non-stationary (fail to reject H0).")


In [None]:
# Seasonal series: a sine wave of amplitude 10 plus N(0, 2) noise.
np.random.seed(42)
seasonal_phase = np.linspace(0, 3 * np.pi, 200)  # 1.5 full cycles across the 200 points
seasonal_noise = np.random.normal(loc=0, scale=2, size=200)
seasonal_data = 10 * np.sin(seasonal_phase) + seasonal_noise

seasonal_dates = pd.date_range(start='2021-01-01', periods=len(seasonal_data), freq='D')
seasonal_df = pd.DataFrame({'Value': seasonal_data, 'Time': seasonal_dates})

fig_seasonal = px.line(
    seasonal_df,
    x='Time',
    y='Value',
    title='Time Series with Seasonality',
)
fig_seasonal.show()

# ADF unit-root test on the seasonal series.
result_seasonal = adfuller(seasonal_data)
adf_stat_seasonal, p_value_seasonal = result_seasonal[0], result_seasonal[1]
print('\nADF Test for Seasonal Series:')
print(f'ADF Statistic: {adf_stat_seasonal:.2f}')
print(f'P-value: {p_value_seasonal:.3f}')
# Reject H0 (unit root) when the p-value is at most 0.05.
print(
    "Conclusion: The time series is stationary (reject H0)."
    if p_value_seasonal <= 0.05
    else "Conclusion: The time series is non-stationary (fail to reject H0)."
)


### Detrending and Deseasonalizing (Differencing to Achieve Stationarity)
Differencing is a common technique to make a non-stationary time series stationary. By taking the difference between consecutive observations (or observations from the previous season), we can remove trend and seasonality.

#### First-Order Differencing (to remove trend)


In [None]:
# First-order differencing of the trended series: each value minus its predecessor.
trend_steps = trend_df['Value'].diff()
detrended_data = trend_steps.dropna()  # the first difference is undefined (NaN)

# Drop the first timestamp as well so the dates line up with the differences.
detrended_df = pd.DataFrame({'Value': detrended_data, 'Time': trend_df['Time'].iloc[1:]})

fig_detrended = px.line(
    detrended_df,
    x='Time',
    y='Value',
    title='Detrended Series (First-Order Differencing)',
)
fig_detrended.show()

# ADF unit-root test on the differenced series.
result_detrended = adfuller(detrended_data)
adf_stat_detrended, p_value_detrended = result_detrended[0], result_detrended[1]
print('\nADF Test for Detrended Series:')
print(f'ADF Statistic: {adf_stat_detrended:.2f}')
print(f'P-value: {p_value_detrended:.3f}')
# Reject H0 (unit root) when the p-value is at most 0.05.
print(
    "Conclusion: The detrended series is stationary (reject H0)."
    if p_value_detrended <= 0.05
    else "Conclusion: The detrended series is non-stationary (fail to reject H0)."
)


#### Seasonal Differencing (to remove seasonality)


In [None]:
# Seasonal differencing: subtract the observation one full seasonal cycle earlier.
# The lag has to match the cycle length of the data. `seasonal_data` is a sine
# over np.linspace(0, 3*pi, N): 1.5 cycles across N - 1 steps, so one cycle is
# 2*(N - 1)/3 observations (about 133 for N = 200) -- not 7. A 7-step lag would
# leave most of the slow sine in the differenced series.
seasonal_period = round(2 * (len(seasonal_df) - 1) / 3)

deseasonalized_data = seasonal_df['Value'].diff(periods=seasonal_period).dropna()
# Drop the first `seasonal_period` timestamps so the dates line up with the differences.
deseasonalized_df = pd.DataFrame({
    'Value': deseasonalized_data,
    'Time': seasonal_df['Time'].iloc[seasonal_period:],
})

fig_deseasonalized = px.line(deseasonalized_df, x='Time', y='Value', title='Deseasonalized Series (Seasonal Differencing)')
fig_deseasonalized.show()

# ADF test on the deseasonalized series: H0 is "the series has a unit root".
result_deseasonalized = adfuller(deseasonalized_data)
print('\nADF Test for Deseasonalized Series:')
print(f'ADF Statistic: {result_deseasonalized[0]:.2f}')
print(f'P-value: {result_deseasonalized[1]:.3f}')
if result_deseasonalized[1] <= 0.05:
    print("Conclusion: The deseasonalized series is stationary (reject H0).")
else:
    print("Conclusion: The deseasonalized series is non-stationary (fail to reject H0).")


In [13]:
import plotly.express as px
import pandas as pd
import numpy as np 
from statsmodels.tsa.stattools import adfuller

In [14]:
# Load the air passengers dataset from a CSV in the working directory.
# NOTE: this rebinds `data`, which earlier in the notebook held the yfinance
# price frame.
data = pd.read_csv('air_passengers.csv')
data

Unnamed: 0,Month,#Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


In [15]:
def plotting(title, data, x, y, x_label, y_label):
    """Draw and show a Plotly Express line chart of one DataFrame column against another.

    Args:
        title: Chart title. It was previously accepted but never forwarded to
            Plotly, so the figure rendered without a title.
        data: DataFrame holding the columns to plot.
        x: Name of the column used for the x axis.
        y: Name of the column used for the y axis.
        x_label: Axis label displayed in place of the ``x`` column name.
        y_label: Axis label displayed in place of the ``y`` column name.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # Pass column names (not Series) so the `labels` mapping, which is keyed by
    # column name, applies directly to the axes.
    fig = px.line(data, x=x, y=y, title=title, labels={x: x_label, y: y_label})
    fig.show()

In [16]:
# Plot passenger counts against month as a line chart via the `plotting` helper
plotting(title='Airline Passengers', data=data, x='Month', y='#Passengers', x_label='Date', y_label='Passengers')