In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

import warnings
# Install a session-wide filter that hides every warning of category FutureWarning,
# so library deprecation notices do not appear in the cell outputs below.
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Load the raw price table. The file carries an .xls extension, but it is
# read here as delimited text through pandas' CSV parser.
data_path = "./Data/AAPL.xls"
df = pd.read_csv(data_path)

In [3]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100323,469033600
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.095089,175884800
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.08811,105728000
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.090291,86441600
4,1980-12-18,0.118862,0.11942,0.118862,0.118862,0.092908,73449600


In [4]:
# Peek at the last five rows to see where the data ends.
df.tail(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
10404,2022-03-18,160.509995,164.479996,159.759995,163.979996,163.979996,123351200
10405,2022-03-21,163.509995,166.350006,163.009995,165.380005,165.380005,95811400
10406,2022-03-22,165.509995,169.419998,164.910004,168.820007,168.820007,81532000
10407,2022-03-23,167.990005,172.639999,167.649994,170.210007,170.210007,98062700
10408,2022-03-24,171.059998,174.139999,170.210007,174.070007,174.070007,90018700


In [5]:
# EDA
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
# Summary statistics of the numeric columns.
print(df.describe())
# Count of missing values per column.
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10409 entries, 0 to 10408
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       10409 non-null  object 
 1   Open       10409 non-null  float64
 2   High       10409 non-null  float64
 3   Low        10409 non-null  float64
 4   Close      10409 non-null  float64
 5   Adj Close  10409 non-null  float64
 6   Volume     10409 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 569.4+ KB
None
               Open          High           Low         Close     Adj Close  \
count  10409.000000  10409.000000  10409.000000  10409.000000  10409.000000   
mean      13.959910     14.111936     13.809163     13.966757     13.350337   
std       30.169244     30.514878     29.835055     30.191696     29.911132   
min        0.049665      0.049665      0.049107      0.049107      0.038384   
25%        0.281964      0.287946      0.274554      0.281250      0

In [6]:
# Set the Date column as the index for time series operations.
# The column is loaded as plain strings (object dtype), so it is parsed to
# datetimes first: plots then get a real time axis instead of one category per
# row, and the frame carries a DatetimeIndex.
# The guard makes the cell safe to re-run once "Date" has already become the
# index (the original raised KeyError on a second execution).
if df.index.name != "Date":
    df = df.assign(Date=pd.to_datetime(df["Date"])).set_index("Date")

In [None]:
# Line chart of the closing price against the frame's index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['Close'])
ax.set_title('AAPL Close Price Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Close Price')
plt.show()

In [None]:
# 100-observation rolling mean of the closing price; the first 99 entries are
# NaN because the window is not yet full.
close_prices = df["Close"]
simple_ma = close_prices.rolling(window=100).mean()

# Draw the smoothed series first, then the raw prices, on one shared axes.
fig, ax = plt.subplots(figsize=(12, 6))
simple_ma.plot(ax=ax, label="Simple Moving Average")
close_prices.plot(ax=ax, label="Closing Price")
plt.xticks(rotation=0)
ax.set_title("Moving Average of Closing Price", size=12)
ax.legend()
plt.show()

In [None]:
# Multiplicative seasonal decomposition of the closing price with a cycle
# length of 252 observations (presumably one trading year — TODO confirm).
decompose_options = {"model": "multiplicative", "period": 252}
results = seasonal_decompose(df["Close"], **decompose_options)

# Render the component panels and resize the figure so they are readable.
fig = results.plot()
fig.set_size_inches(12, 6)
fig.tight_layout()
plt.show()

In [None]:
# Augmented Dickey-Fuller test on the raw closing prices.
# The first two entries of the returned tuple are the test statistic and p-value.
result = adfuller(df['Close'])
adf_stat, p_value = result[:2]
print(f'ADF Statistic: {adf_stat}')
print(f'p-value: {p_value}')

In [None]:
# The raw series is treated as non-stationary (p-value above 0.05), so a
# first-differenced copy is stored alongside it for ARIMA modelling.
# The first row of the new column is NaN because it has no predecessor.
close_first_difference = df['Close'].diff(periods=1)
df['Close_diff'] = close_first_difference

In [None]:
# Repeat the Augmented Dickey-Fuller test on the differenced series; the
# leading NaN produced by differencing is dropped before testing.
differenced_close = df['Close_diff'].dropna()
result = adfuller(differenced_close)
diff_adf_stat, diff_p_value = result[:2]
print(f'ADF Statistic: {diff_adf_stat}')
print(f'p-value: {diff_p_value}')

In [None]:
# Chronological partition by row position: the first 70% of rows train the
# model, the next 15% are for tuning (dev), and the remainder is the test set.
n_rows = len(df)
train_size = int(n_rows * 0.7)
dev_size = int(n_rows * 0.15)
test_size = n_rows - train_size - dev_size  # whatever is left after rounding down

# Row position where the dev slice ends and the test slice begins.
dev_end = train_size + dev_size
train, dev, test = (
    df.iloc[:train_size],
    df.iloc[train_size:dev_end],
    df.iloc[dev_end:],
)

print(f'Train: {len(train)}, Dev: {len(dev)}, Test: {len(test)}')

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# ARIMA(0, 1, 0): no AR or MA terms, one order of differencing, fitted on the
# training closing prices only.
arima_spec = ARIMA(train['Close'], order=(0, 1, 0))
arima_model = arima_spec.fit()
print(arima_model.summary())

# Forecast on dev set for tuning: one out-of-sample step per dev row.
dev_horizon = len(dev)
arima_forecast_dev = arima_model.forecast(steps=dev_horizon)