In [1]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session. This also hides
# pandas/NumPy deprecation notices, so narrow or remove the filter when debugging.
warnings.filterwarnings("ignore")
from datetime import datetime

In [2]:

# read the project dataset (a CSV hosted on GitHub) directly into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/kalakhushi18/Advance-Time-Series-Prediction-W24/main/dataset.csv'
)

In [3]:
# schema check of the freshly loaded frame: dtype and non-null count per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3656 entries, 0 to 3655
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   date                         3656 non-null   object 
 1   bitcoin_closing_prices       3655 non-null   float64
 2   sp500_closing_data           2517 non-null   float64
 3   inflation_rate               2610 non-null   object 
 4   daily_treasury_rates         2502 non-null   float64
 5   bitcoin_daily_google_trends  3656 non-null   int64  
 6   is_holiday                   3656 non-null   int64  
 7   twitter_sentiments_score     0 non-null      float64
dtypes: float64(4), int64(2), object(2)
memory usage: 228.6+ KB


In [4]:
# Parse the 'date' strings into datetime64 so rows can be ordered
# chronologically and date parts can be extracted from the column.
df['date'] = pd.to_datetime(df['date'])

# Order rows chronologically. ignore_index=True renumbers the index 0..n-1 so
# index labels stay aligned with row position even if the CSV is not already
# stored in date order.
df = df.sort_values(by='date', ignore_index=True)

# preview the first rows of the sorted frame
df.head()

Unnamed: 0,date,bitcoin_closing_prices,sp500_closing_data,inflation_rate,daily_treasury_rates,bitcoin_daily_google_trends,is_holiday,twitter_sentiments_score
0,2014-11-01,325.748993,,,,40,1,
1,2014-11-02,325.891998,,,,44,1,
2,2014-11-03,327.553986,2017.810059,1.93,0.03,46,0,
3,2014-11-04,330.492004,2012.099976,1.92,0.04,43,0,
4,2014-11-05,339.485992,2023.569946,1.93,0.04,45,0,


In [5]:
# a bare '.' in 'inflation_rate' marks a missing reading; turn it into NaN
df['inflation_rate'] = df['inflation_rate'].replace(to_replace='.', value=np.nan)

In [6]:
# with the '.' placeholders gone, the remaining values can be stored as floats
df['inflation_rate'] = df['inflation_rate'].astype(np.float64)

In [7]:
# re-check the schema after parsing 'date' and casting 'inflation_rate' to float
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3656 entries, 0 to 3655
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         3656 non-null   datetime64[ns]
 1   bitcoin_closing_prices       3655 non-null   float64       
 2   sp500_closing_data           2517 non-null   float64       
 3   inflation_rate               2502 non-null   float64       
 4   daily_treasury_rates         2502 non-null   float64       
 5   bitcoin_daily_google_trends  3656 non-null   int64         
 6   is_holiday                   3656 non-null   int64         
 7   twitter_sentiments_score     0 non-null      float64       
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 228.6 KB


In [8]:
# Derive calendar features from the 'date' column in a single step; the new
# columns are appended in the order listed below.
df = df.assign(
    day_of_week=df['date'].dt.dayofweek,   # Monday=0 ... Sunday=6
    day_of_month=df['date'].dt.day,        # 1..31
    day_of_year=df['date'].dt.dayofyear,   # 1..366
    month=df['date'].dt.month,             # 1..12
    year=df['date'].dt.year,
)


In [9]:
# confirm the derived calendar columns are present and check their dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3656 entries, 0 to 3655
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         3656 non-null   datetime64[ns]
 1   bitcoin_closing_prices       3655 non-null   float64       
 2   sp500_closing_data           2517 non-null   float64       
 3   inflation_rate               2502 non-null   float64       
 4   daily_treasury_rates         2502 non-null   float64       
 5   bitcoin_daily_google_trends  3656 non-null   int64         
 6   is_holiday                   3656 non-null   int64         
 7   twitter_sentiments_score     0 non-null      float64       
 8   day_of_week                  3656 non-null   int32         
 9   day_of_month                 3656 non-null   int32         
 10  day_of_year                  3656 non-null   int32         
 11  month                        3656 non-null 

In [10]:
# preview the first rows, now including the derived calendar columns
df.head()

Unnamed: 0,date,bitcoin_closing_prices,sp500_closing_data,inflation_rate,daily_treasury_rates,bitcoin_daily_google_trends,is_holiday,twitter_sentiments_score,day_of_week,day_of_month,day_of_year,month,year
0,2014-11-01,325.748993,,,,40,1,,5,1,305,11,2014
1,2014-11-02,325.891998,,,,44,1,,6,2,306,11,2014
2,2014-11-03,327.553986,2017.810059,1.93,0.03,46,0,,0,3,307,11,2014
3,2014-11-04,330.492004,2012.099976,1.92,0.04,43,0,,1,4,308,11,2014
4,2014-11-05,339.485992,2023.569946,1.93,0.04,45,0,,2,5,309,11,2014


| Variable                  | Suggested Transformation       | Reason                                                                                                       | Code for Transformation                  |
|---------------------------|--------------------------------|--------------------------------------------------------------------------------------------------------------|------------------------------------------|
| **Bitcoin Closing Price** | Box-Cox Transformation         | Stabilizes variance and makes data more normally distributed, useful for price prediction.                    | `data['Bitcoin_Close_Box'], lmbda = stats.boxcox(data['Bitcoin_Close'])` |
| **S&P 500 Closing Data**  | Log Transformation             | Reduces skewness and stabilizes variance, ideal for high variability in financial data.                      | `data['SP500_Log'] = np.log(data['SP500_Closing'] + 1)` |
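| **Inflation Rate**        | Box-Cox Transformation         | Stabilizes variance in data with potential outliers; Box-Cox is tunable with lambda. Box-Cox requires strictly positive input, hence the `+ 1` shift (sufficient only while all values exceed -1). | `data['Inflation_Box'], lmbda = stats.boxcox(data['Inflation'] + 1)` |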
| **Daily Treasury Rates**  | Square Root Transformation     | Reduces moderate skew; smooths variability without drastically changing small values.                        | `data['Treasury_Rates_Sqrt'] = np.sqrt(data['Treasury_Rates'])` |
| **Bitcoin Google Trends** | Log Transformation             | Smooths large spikes in highly variable trend data by reducing skewness.                                     | `data['Trends_Log'] = np.log(data['Google_Trends'] + 1)` |
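| **Twitter Sentiment Score** | Min-Max Scaling              | Rescales values to a fixed range (e.g., 0 to 1) without changing the shape of the distribution; scores are already range-bound. Note: `twitter_sentiments_score` currently has no non-null values, so this applies only once scores are populated. | `data['Sentiment_Scaled'] = (data['Sentiment'] - data['Sentiment'].min()) / (data['Sentiment'].max() - data['Sentiment'].min())` |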
| **Day Number (day_num)**  | Cyclic Encoding (Sine and Cosine) | Captures weekly cycle, where Sunday (7) and Monday (1) are adjacent; both sine and cosine are needed for full cyclic representation. | `data['day_num_sin'] = np.sin(2 * np.pi * data['day_num'] / 7)` <br> `data['day_num_cos'] = np.cos(2 * np.pi * data['day_num'] / 7)` |
| **Month**                 | One-Hot or Cyclic Encoding     | Cyclic encoding captures month-to-month continuity (e.g., December to January) and is helpful if seasonality is expected; one-hot encoding treats each month as an independent category with no ordering. | **One-Hot**: `pd.get_dummies(data['month'], prefix='month')` <br> **Cyclic**: `data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)` <br> `data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)` |
| **Week Number (week_num)**| No transformation or Cyclic Encoding | Represents the yearly cycle; useful for capturing weekly seasonality over a year if encoded cyclically.      | `data['week_num_sin'] = np.sin(2 * np.pi * data['week_num'] / 52)` <br> `data['week_num_cos'] = np.cos(2 * np.pi * data['week_num'] / 52)` |
| **Year**                  | No transformation              | Retain as-is to represent an overall trend over time; SARIMAX can capture trends directly.                   | No transformation needed                 |
| **is_holiday**            | No transformation              | Binary variable (1/0), already suitable for modeling as a dummy variable.                                    | No transformation needed                 |
