In [1]:
!pip install yfinance



In [2]:
import warnings
import pandas as pd
import numpy as np
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

import plotly.express as px

import yfinance as yf

In [3]:
start_date = "2017-01-01"
end_date = "2024-01-01"
symbol = "BTC-USD"
data = yf.download(symbol, start=start_date, end=end_date)
data = data[["Close", "Volume"]]
df = data.copy()
df.head(3)

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,998.325012,147775008
2017-01-02,1021.75,222184992
2017-01-03,1043.839966,185168000


In [4]:
df.isnull().sum()

Close     0
Volume    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2556 entries, 2017-01-01 to 2023-12-31
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   2556 non-null   float64
 1   Volume  2556 non-null   int64  
dtypes: float64(1), int64(1)
memory usage: 59.9 KB




#Prepare Data

In [6]:
df['Date'] = df.index
df.head()

Unnamed: 0_level_0,Close,Volume,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,998.325012,147775008,2017-01-01
2017-01-02,1021.75,222184992,2017-01-02
2017-01-03,1043.839966,185168000,2017-01-03
2017-01-04,1154.72998,344945984,2017-01-04
2017-01-05,1013.380005,510199008,2017-01-05


In [7]:
df['Target'] = df[['Close']].shift(-1)
df.tail(3)

Unnamed: 0_level_0,Close,Volume,Date,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-12-29,42099.402344,26000021055,2023-12-29,42156.902344
2023-12-30,42156.902344,16013925945,2023-12-30,42265.1875
2023-12-31,42265.1875,16397498810,2023-12-31,


In [8]:
df = df[:-1]
df.tail(3)

Unnamed: 0_level_0,Close,Volume,Date,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-12-28,42627.855469,22992093014,2023-12-28,42099.402344
2023-12-29,42099.402344,26000021055,2023-12-29,42156.902344
2023-12-30,42156.902344,16013925945,2023-12-30,42265.1875


#Test Train Split

In [9]:
df_tts = df.copy()
df_tts = df_tts.drop(columns = ['Date'])

In [10]:
X = df_tts.iloc[:, : -1]
y = df_tts.iloc[:, -1]
X.shape, y.shape

((2555, 2), (2555,))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)

Shape of X_train:  (2044, 2)
Shape of y_train:  (2044,)


#Ridge Regression

In [12]:
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)

In [13]:
y_pred = ridge_model.predict(X_test)
ridge_error = np.sqrt(mean_squared_error(y_test, y_pred))
print("mean_squared_error: ", ridge_error)

mean_squared_error:  895.8825658198903


In [14]:
predicted_prices_ridge = ridge_model.predict(X)
predicted_prices_ridge

array([ 1037.05464749,  1060.4828645 ,  1082.47297676, ...,
       42567.30685664, 42043.27998811, 42091.02592481])

In [15]:
Predicted = []
for i in predicted_prices_ridge:
  Predicted.append(i)
len(Predicted)

2555

In [16]:
close = []
for i in df["Close"]:
  close.append(i)
len(close)

2555

In [17]:
df_predicted_ridge = df[['Date']]
df_predicted_ridge['Close'] = close
df_predicted_ridge['Prediction'] = Predicted
df_predicted_ridge.head()

Unnamed: 0_level_0,Date,Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,2017-01-01,998.325012,1037.054647
2017-01-02,2017-01-02,1021.75,1060.482865
2017-01-03,2017-01-03,1043.839966,1082.472977
2017-01-04,2017-01-04,1154.72998,1193.19351
2017-01-05,2017-01-05,1013.380005,1052.413733


In [18]:
def interactive_plot(df, title):
  fig = px.line(title = title)
  for i in df.columns[1:]:
    fig.add_scatter(x = df['Date'], y = df[i], name = i)
  fig.show()

In [19]:
interactive_plot(df_predicted_ridge, "Original vs Prediction")

#XGBoost

In [20]:
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

In [21]:
y_pred = xgb_model.predict(X_test)
xgb_error = np.sqrt(mean_squared_error(y_test, y_pred))
print("mean_squared_error: ", xgb_error)

mean_squared_error:  1062.0533159674267


In [22]:
predicted_prices_xgb = xgb_model.predict(X)
predicted_prices_xgb

array([ 1006.6311,  1036.7964,  1041.2974, ..., 42108.293 , 41487.152 ,
       42479.51  ], dtype=float32)

In [23]:
Predicted = []
for i in predicted_prices_xgb:
  Predicted.append(i)
len(Predicted)

2555

In [24]:
close = []
for i in df["Close"]:
  close.append(i)
len(close)

2555

In [25]:
df_predicted_xgb = df[['Date']]
df_predicted_xgb['Close'] = close
df_predicted_xgb['Prediction'] = Predicted
df_predicted_xgb.head()

Unnamed: 0_level_0,Date,Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,2017-01-01,998.325012,1006.631104
2017-01-02,2017-01-02,1021.75,1036.796387
2017-01-03,2017-01-03,1043.839966,1041.297363
2017-01-04,2017-01-04,1154.72998,1146.739746
2017-01-05,2017-01-05,1013.380005,1012.94928


In [26]:
interactive_plot(df_predicted_xgb, "Original vs Prediction")