In [None]:
!pip install yfinance



In [None]:
import warnings
import pandas as pd
import numpy as np
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

import plotly.express as px

import yfinance as yf

In [None]:
start_date = "2017-01-01"
end_date = "2023-01-01"
symbol = "BTC-USD"
data = yf.download(symbol, start=start_date, end=end_date)
data = data[["Open", "High", "Low", "Close", "Volume"]]
df = data.copy()
df.head(3)

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01,963.65802,1003.080017,958.698975,998.325012,147775008
2017-01-02,998.617004,1031.390015,996.702026,1021.75,222184992
2017-01-03,1021.599976,1044.079956,1021.599976,1043.839966,185168000


In [None]:
df.isnull().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2191 entries, 2017-01-01 to 2022-12-31
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    2191 non-null   float64
 1   High    2191 non-null   float64
 2   Low     2191 non-null   float64
 3   Close   2191 non-null   float64
 4   Volume  2191 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 102.7 KB


In [None]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,2191.0,2191.0,2191.0,2191.0,2191.0
mean,17612.752266,18058.205747,17113.5971,17617.805157,22566310000.0
std,16747.70845,17176.684508,16246.748388,16739.789974,20281960000.0
min,775.177979,823.307007,755.755981,777.757019,60851700.0
25%,6418.099854,6536.165039,6318.800049,6417.987549,5303753000.0
50%,9664.904297,9834.716797,9460.571289,9665.533203,19745230000.0
75%,23692.820312,24162.116211,23078.830078,23696.078125,33176590000.0
max,67549.734375,68789.625,66382.0625,67566.828125,350967900000.0




#Prepare Data

In [None]:
def clean_data(df):
    return pd.DataFrame({'Date': df.index, 'Close': df['Close'], 'Volume': df['Volume']})

In [None]:
df = clean_data(df)
df.head()

Unnamed: 0_level_0,Date,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,2017-01-01,998.325012,147775008
2017-01-02,2017-01-02,1021.75,222184992
2017-01-03,2017-01-03,1043.839966,185168000
2017-01-04,2017-01-04,1154.72998,344945984
2017-01-05,2017-01-05,1013.380005,510199008


In [None]:
df['Target'] = df[['Close']].shift(-1)
df.head()

Unnamed: 0_level_0,Date,Close,Volume,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,2017-01-01,998.325012,147775008,1021.75
2017-01-02,2017-01-02,1021.75,222184992,1043.839966
2017-01-03,2017-01-03,1043.839966,185168000,1154.72998
2017-01-04,2017-01-04,1154.72998,344945984,1013.380005
2017-01-05,2017-01-05,1013.380005,510199008,902.200989


In [None]:
# Remove the last row as it will be a null value
df = df[:-1]
df.head()

Unnamed: 0_level_0,Date,Close,Volume,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,2017-01-01,998.325012,147775008,1021.75
2017-01-02,2017-01-02,1021.75,222184992,1043.839966
2017-01-03,2017-01-03,1043.839966,185168000,1154.72998
2017-01-04,2017-01-04,1154.72998,344945984,1013.380005
2017-01-05,2017-01-05,1013.380005,510199008,902.200989


#Test Train Split

In [None]:
df_tts = df.copy()
df_tts = df_tts.drop(columns = ['Date'])

In [None]:
X = df_tts.iloc[:, : -1]
y = df_tts.iloc[:, -1]
X.shape, y.shape

((2190, 2), (2190,))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)

Shape of X_train:  (1752, 2)
Shape of y_train:  (1752,)


#Ridge Regression

In [None]:
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)

In [None]:
y_pred = ridge_model.predict(X_test)
ridge_error = np.sqrt(mean_squared_error(y_test, y_pred))
print("mean_squared_error: ", ridge_error)

mean_squared_error:  904.5385939073617


In [None]:
# Make Prediction
predicted_prices = ridge_model.predict(X)
predicted_prices

array([ 1025.0041899 ,  1048.45746767,  1070.45373415, ...,
       16561.47979201, 16648.19223293, 16610.16640818])

In [None]:
# Append the predicted values into a list
Predicted = []
for i in predicted_prices:
  Predicted.append(i)
len(Predicted)

2190

In [None]:
# Append the close values to the list
close = []
for i in df["Close"]:
  close.append(i)
len(close)

2190

In [None]:
# Create a dataframe based on the dates
df_predicted = df[['Date']]
df_predicted.head()

Unnamed: 0_level_0,Date
Date,Unnamed: 1_level_1
2017-01-01,2017-01-01
2017-01-02,2017-01-02
2017-01-03,2017-01-03
2017-01-04,2017-01-04
2017-01-05,2017-01-05


In [None]:
# Add the close values to the dataframe
df_predicted['Close'] = close
df_predicted.head()

Unnamed: 0_level_0,Date,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,2017-01-01,998.325012
2017-01-02,2017-01-02,1021.75
2017-01-03,2017-01-03,1043.839966
2017-01-04,2017-01-04,1154.72998
2017-01-05,2017-01-05,1013.380005


In [None]:
# Add the predicted values to the dataframe
df_predicted['Prediction'] = Predicted
df_predicted.head()

Unnamed: 0_level_0,Date,Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,2017-01-01,998.325012,1025.00419
2017-01-02,2017-01-02,1021.75,1048.457468
2017-01-03,2017-01-03,1043.839966,1070.453734
2017-01-04,2017-01-04,1154.72998,1181.261517
2017-01-05,2017-01-05,1013.380005,1040.430682


In [None]:
# Function to plot interactive plots using Plotly Express
def interactive_plot(df, title):
  fig = px.line(title = title)
  for i in df.columns[1:]:
    fig.add_scatter(x = df['Date'], y = df[i], name = i)
  fig.show()

In [None]:
# Plot the results
interactive_plot(df_predicted, "Original vs Prediction")

#XGBoost

In [None]:
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_test)
xgb_error = np.sqrt(mean_squared_error(y_test, y_pred))
print("mean_squared_error: ", xgb_error)

mean_squared_error:  1221.1655406932778


In [None]:
# Make Prediction
predicted_prices = xgb_model.predict(X)
predicted_prices

array([ 1012.5506,  1040.4333,  1045.1437, ..., 16582.217 , 16911.982 ,
       16363.036 ], dtype=float32)

In [None]:
# Append the predicted values into a list
Predicted = []
for i in predicted_prices:
  Predicted.append(i)
len(Predicted)

2190

In [None]:
# Append the close values to the list
close = []
for i in df["Close"]:
  close.append(i)
len(close)

2190

In [None]:
# Create a dataframe
df_predicted = df[['Date']]
df_predicted['Close'] = close
df_predicted['Prediction'] = Predicted
df_predicted.head()

Unnamed: 0_level_0,Date,Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,2017-01-01,998.325012,1012.550598
2017-01-02,2017-01-02,1021.75,1040.43335
2017-01-03,2017-01-03,1043.839966,1045.143677
2017-01-04,2017-01-04,1154.72998,1115.032715
2017-01-05,2017-01-05,1013.380005,1016.309875


In [None]:
# Plot the results
interactive_plot(df_predicted, "Original vs Prediction")