In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os
import statsmodels 
import seaborn as sns
import scipy.stats as ss
import pylab
import pymysql
import statsmodels.graphics.tsaplots as sgt
import statsmodels.tsa.stattools as sts 
import arch
import datetime as dt


from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.ensemble import IsolationForest
from statsmodels.tsa.arima.model import ARIMA
from scipy.stats.distributions import chi2
from tqdm import tqdm
from time import sleep
import warnings
# Ignore every warning whose category derives from Warning (i.e. all of them)
# for the remainder of the session, so library notices do not clutter output.
warnings.filterwarnings('ignore', category=Warning)
# Switch matplotlib to seaborn's default theme for all figures drawn below.
sns.set()
def read_csv(name: str):
    """Load a processed CSV and index it by timestamp on a 12-hour grid.

    Parameters:
        name (str): File name without extension. The file is read from
            ``../data/processed/{name}.csv`` and must contain a ``Date``
            column.

    Returns:
        DataFrame: The file contents indexed by ``Date`` and conformed to a
        12-hour frequency. Grid timestamps that are missing from the file
        appear as rows filled with NaN.
    """
    df = pd.read_csv(f'../data/processed/{name}.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')
    # asfreq returns a NEW frame; the result has to be kept, otherwise the
    # call is a no-op and the index is left without a frequency.
    # The lowercase hour alias is used because recent pandas releases
    # deprecate the uppercase 'H' spelling.
    return df.asfreq('12h')
# Load the processed dataset; data_raw is kept as the untouched original.
data_raw = read_csv('data_processed')
# Everything below works on (and, in outlier_detection, mutates) this copy.
df = data_raw.copy()
# Quick inspection: first rows, column dtypes / non-null counts, summary
# statistics and the number of missing values per column. These are bare
# expressions, so their results are only displayed in an interactive session.
df.head()
df.info()
df.describe()
df.isna().sum()
def plot(df, columns, cumsum = False):
    """Draw the given columns on one shared axis, save the figure and show it.

    Parameters:
        df (DataFrame): Source data; each column is plotted against the index.
        columns (list of str or str): Column name(s) to draw. A single name
            given as a plain string is treated as a one-element list.
        cumsum (bool): When true, the cumulative sum of each column is drawn
            instead of its raw values.

    The figure is written to ``../reports/figures/`` under a name built by
    joining the column names with underscores and appending ``_plot.png``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    fig, ax = plt.subplots(figsize=(22, 7))
    for column in columns:
        # Plain truthiness test rather than a comparison against True.
        series = df[column].cumsum() if cumsum else df[column]
        series.plot(ax=ax, label=column)
    plt.title(f"Plot of {', '.join(columns)}", fontsize = 20)
    plt.legend(fontsize = 15)
    file_name = "_".join(columns) + "_plot.png"
    plt.savefig(f'../reports/figures/{file_name}')
    plt.show()

# The four Norm_* columns together on one chart (presumably normalised
# prices; confirm against the preprocessing step).
plot(df, ['Norm_BTCUSDT', 'Norm_ADAUSDT', 'Norm_BNBUSDT', 'Norm_ETHUSDT'])
# Largest BNBUSDT values (nlargest returns the top five by default).
df.BNBUSDT.nlargest()
# Running sum of the Ret_* columns: all four assets, then BTCUSDT on its own.
plot(df, ['Ret_BTCUSDT', 'Ret_ADAUSDT', 'Ret_BNBUSDT', 'Ret_ETHUSDT'], cumsum=True)
plot(df, ['Ret_BTCUSDT'], cumsum=True)
def qqplot(df, column):
    """Draw a normal Q-Q plot of one column, save it and show it.

    Parameters:
        df (DataFrame): Source data.
        column (str): Name of the column whose values are compared against
            the quantiles of a normal distribution.

    The figure is written to ``../reports/figures/{column}_qqplot.png``.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    # probplot has no NaN handling: missing values would be counted as
    # observations and break the fitted line, so they are removed first.
    values = df[column].dropna()
    # Draw on the axes created above instead of relying on pyplot's
    # implicit "current axes".
    ss.probplot(values, plot=ax)
    ax.set_title(f"Q-Q Plot of {column}")
    ax.set_xlabel("Theoretical Quantiles")
    ax.set_ylabel("Ordered Values")

    # File name is derived from the column name.
    file_name = f"{column}_qqplot.png"

    fig.savefig(f'../reports/figures/{file_name}')
    plt.show()


# Q-Q plots for the BTCUSDT, Norm_BTCUSDT and Ret_BNBUSDT columns.
qqplot(df, 'BTCUSDT' )
qqplot(df, 'Norm_BTCUSDT')
qqplot(df, 'Ret_BNBUSDT')
# Augmented Dickey-Fuller test on each of the four price columns. These are
# bare expressions, so the result tuples are only displayed interactively.
sts.adfuller(df.BTCUSDT)

sts.adfuller(df.BNBUSDT)

sts.adfuller(df.ADAUSDT)

sts.adfuller(df.ETHUSDT)
def seasonal_decompose_plots(df, column):
    """Plot the additive seasonal decomposition of one column.

    Four stacked panels are drawn (observed, trend, seasonal, residual),
    the figure is saved to
    ``../reports/figures/{column}_decompose_additive.png`` and then shown.

    Parameters:
        df (DataFrame): Source data.
        column (str): Name of the column to decompose.
    """
    # The first observation is skipped before decomposing.
    decomposition = seasonal_decompose(df[column].iloc[1:], model='additive')

    # Panel label paired with the component drawn in that panel, top to bottom.
    panels = (
        ('Observed', decomposition.observed),
        ('Trend', decomposition.trend),
        ('Seasonal', decomposition.seasonal),
        ('Residual', decomposition.resid),
    )

    figure, axis_stack = plt.subplots(4, 1, figsize=(12, 8))
    for axis, (label, component) in zip(axis_stack, panels):
        axis.plot(component)
        axis.set_ylabel(label)

    plt.tight_layout()

    # Output path is derived from the column name.
    plt.savefig(f'../reports/figures/{column}_decompose_additive.png')
    plt.show()

# Decompose each of the four price columns in turn.
for price_column in ('BTCUSDT', 'BNBUSDT', 'ADAUSDT', 'ETHUSDT'):
    seasonal_decompose_plots(df, price_column)

def acf_plot(df, column, lags = 40):
    """Plot the autocorrelation function of one column, save it and show it.

    Parameters:
        df (DataFrame): Source data.
        column (str): Name of the column to analyse.
        lags (int): Number of lags to display; lag zero is omitted.

    The figure is written to ``../reports/figures/{column}_acf.png``.
    """
    figure, axis = plt.subplots(figsize=(10, 8))
    sgt.plot_acf(df[column], ax=axis, lags=lags, zero=False)
    plt.title(f"{column} ACF")

    # Output path is derived from the column name.
    target_path = f'../reports/figures/{column}_acf.png'
    plt.savefig(target_path)
    plt.show()

# ACF of the BNBUSDT and ETHUSDT price columns, 40 lags each.
for price_column in ('BNBUSDT', 'ETHUSDT'):
    acf_plot(df, price_column, lags=40)

def pacf_plot(df, column, lags = 40):
    """Plot the partial autocorrelation function of one column.

    Parameters:
        df (DataFrame): Source data.
        column (str): Name of the column to analyse.
        lags (int): Number of lags to display; lag zero is omitted.
            Defaults to 40, mirroring ``acf_plot``.

    The partial autocorrelations are estimated with the ``'ols'`` method.
    The figure is written to ``../reports/figures/{column}_pacf.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sgt.plot_pacf(df[column], lags=lags, zero=False, ax=ax, method='ols')
    plt.title(f"{column} PACF", size=14)

    # File name is derived from the column name.
    file_name = f"{column}_pacf.png"

    # file_name already carries the "_pacf.png" suffix; appending it a second
    # time produced files named "<column>_pacf.png_pacf.png".
    plt.savefig(f'../reports/figures/{file_name}')
    plt.show()

# PACF of the ADAUSDT, BNBUSDT and BTCUSDT price columns.
for price_column in ('ADAUSDT', 'BNBUSDT', 'BTCUSDT'):
    pacf_plot(df, price_column)
def plot_correlation_heatmap(df, columns):
    """Draw an annotated heatmap of the pairwise correlations of some columns.

    Parameters:
        df (DataFrame): Source data.
        columns (list of str): Names of the columns to correlate.

    The figure is written to ``../reports/figures/`` under a name built from
    ``correlation_heatmap_`` plus the column names joined by underscores.
    """
    # Pairwise correlation of the requested columns only.
    correlations = df[columns].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm')

    # Title and file name both list the columns that were compared.
    plt.title("Correlation Heatmap: " + ', '.join(columns))
    output_name = "correlation_heatmap_" + '_'.join(columns) + ".png"
    plt.savefig(f'../reports/figures/{output_name}')

    plt.show()

# One heatmap for the price columns, one for the Ret_* columns.
for column_group in (
    ['BTCUSDT', 'ADAUSDT', 'ETHUSDT', 'BNBUSDT'],
    ['Ret_BTCUSDT', 'Ret_ETHUSDT', 'Ret_BNBUSDT', 'Ret_ADAUSDT'],
):
    plot_correlation_heatmap(df, column_group)

def outlier_detection(df, target_column, contamination=0.1):
    """Flag outliers in one column with an Isolation Forest, in place.

    A new column ``outr_{target_column}`` holding the model's predictions is
    added to ``df``. Nothing is returned.

    Parameters:
        df (DataFrame): Data to analyse. It is modified in place: rows that
            contain any missing value are dropped and the flag column is added.
        target_column (str): Name of the single feature column given to the
            model.
        contamination (float): Passed through to ``IsolationForest``.
    """
    detector = IsolationForest(contamination=contamination, random_state=42)

    # Rows with a missing value in ANY column are removed from the caller's
    # frame itself, not from a copy.
    df.dropna(inplace=True)

    # The model is fitted on the target column alone, kept two-dimensional.
    features = df[[target_column]]
    flag_column = f'outr_{target_column}'
    df[flag_column] = detector.fit_predict(features)



# Flag outliers in the price columns.
for price_column in ('BTCUSDT', 'BNBUSDT', 'ETHUSDT', 'ADAUSDT'):
    outlier_detection(df, price_column)


# Same for the Ret_cum_* return columns.
for returns_column in (
    'Ret_cum_BTCUSDT',
    'Ret_cum_ADAUSDT',
    'Ret_cum_ETHUSDT',
    'Ret_cum_BNBUSDT',
):
    outlier_detection(df, returns_column)


def plot_outliers(df, target_column):
    """Plot a column as a line and overlay its flagged outliers in red.

    Expects ``df`` to contain a column ``outr_{target_column}`` in which
    outliers are marked with ``-1``.

    Parameters:
        df (DataFrame): Data holding both the series and its flag column.
        target_column (str): Name of the series to plot.

    The figure is written to
    ``../reports/figures/{target_column}_outliers.png``.
    """
    figure, axis = plt.subplots(figsize=(20, 6))

    # Full series as a green line.
    axis.plot(df.index, df[target_column], label=target_column, c='green')

    # Rows flagged with -1, drawn as red points on top of the line.
    is_outlier = df[f'outr_{target_column}'] == -1
    flagged = df[is_outlier]
    axis.scatter(flagged.index, flagged[target_column], color='red', label='Outliers')

    axis.set_xlabel('Index')
    axis.set_ylabel('Value')
    axis.set_title(f'Outlier Detection for {target_column}', fontsize=20)
    axis.legend()

    plt.savefig(f'../reports/figures/{target_column}_outliers.png')
    plt.show()



# For every asset, plot the outliers of the price column and then those of
# the matching Ret_cum_* column.
for symbol in ('BTCUSDT', 'ADAUSDT', 'BNBUSDT', 'ETHUSDT'):
    for target in (symbol, f'Ret_cum_{symbol}'):
        plot_outliers(df, target)
def garch_volatility(df, column):
    """
    Fit a GARCH model (p=1, q=1) to a return series, print the fit summary
    and plot the estimated conditional volatility.

    Parameters:
        df (DataFrame): DataFrame containing the return series.
        column (str): Asset name; the series is read from the
            ``Ret_{column}`` column of ``df``.

    The figure is written to ``../reports/figures/Ret_{column}_garch.png``.
    """
    returns = df[f'Ret_{column}']

    # Specify the model, estimate it and report the results.
    specification = arch.arch_model(returns, vol='Garch', p=1, q=1)
    fitted = specification.fit(update_freq=5)
    print(fitted.summary())

    # Conditional volatility of the fitted model over time.
    figure, axis = plt.subplots(figsize=(20, 6))
    axis.plot(
        fitted.conditional_volatility,
        color='blue',
        label=f'Estimated Volatility for {column}',
    )
    axis.set_xlabel('Time', fontsize=15)
    axis.set_ylabel('Volatility', fontsize=17)
    axis.set_title(f'GARCH Estimated Volatility {column}', fontsize=20)
    axis.legend()

    plt.savefig(f'../reports/figures/Ret_{column}_garch.png')
    plt.show()

# Fit and plot the GARCH volatility for each asset in turn.
for symbol in ('BTCUSDT', 'ADAUSDT', 'BNBUSDT', 'ETHUSDT'):
    garch_volatility(df, symbol)

## QQ Plot
##### The QQ Plot below indicates that values are not normally distributed

## Test of stationarity
##### ADF Test Results for BTCUSDT Closing Prices

The Augmented Dickey-Fuller (ADF) test was conducted on the data. The purpose of this test is to determine whether the series is stationary or non-stationary.


- The test statistic is insignificant at the 5% level.
- These results indicate that there is insufficient evidence to reject the null hypothesis of non-stationarity.

Therefore, based on the ADF test, the data are likely non-stationary, suggesting the presence of trends or other forms of non-random behavior.
This further guides our model choice, which according to the data below should be ARIMAX or SARIMAX.


## Test of Seasonality
- This shows that the data does not follow a seasonal pattern.
- The residuals reflect the crypto boom of 2021 and beyond.

### The seasonal decomposition plot shows no seasonality in the data, which means that BTCUSDT is not a seasonal series; the residuals also show some instability in the data from 2021 until early 2022.

## Plotting ACF

### The ACF shows significant coefficients, which indicates time dependence in the data; it also shows that prices from 30 days ago and beyond are not a good indicator of current prices.

## PACF

### The PACF shows that previous lags have little to no effect on the current price.

This heatmap shows that the cryptocurrencies in view are highly correlated with each other, which was not expected. Looking closely, ADAUSDT and BTCUSDT are the least correlated.

##### Analysing volatility