In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import pandas as pd

Disini kami menggunakan dataset harga bitcoin dalam bentuk USD ($) dari tanggal 1 maret sampai 30 april dengan variabel independent nya adalah

1. Date
2. High (harga tertinggi pada hari itu)
3. Low (harga terendah pada hari itu)
4. Volume (total perdagangan pada hari itu)
5. Weekend (hari libur biasa / nasional)
6. Good News (berita baik mengenai harga bitcoin)
7. Bad News (berita buruk mengenai harga bitcoin)
8. Price (harga dari bitcoin)

dan terdapat 2 variabel catatan saja, yaitu

1. Link Good News
2. Link Bad News

In [2]:
dataset = pd.read_csv("dataset project.csv")

dataset

Unnamed: 0,Date,High,Low,Volume,Weekend,Good News,Link Good News,Bad News,Link Bad News,Price
0,3/1/2021 0:00,49784.01563,45115.09375,53891300112,0,0,,0,,49631.24219
1,3/2/2021 0:00,50127.51172,47228.84375,47530897720,0,0,,0,,48378.98828
2,3/3/2021 0:00,52535.13672,48274.32031,53220811975,0,0,,0,,50538.24219
3,3/4/2021 0:00,51735.08984,47656.92969,52343816680,0,0,,0,,48561.16797
4,3/5/2021 0:00,49396.42969,46542.51563,48625928883,0,0,,0,,48927.30469
...,...,...,...,...,...,...,...,...,...,...
56,4/26/2021 0:00,55416.96484,53319.18750,49448222757,0,0,,0,,55033.11719
57,4/27/2021 0:00,56227.20703,53887.91797,48000572955,0,0,,1,https://market.bisnis.com/read/20210427/94/138...,54824.70313
58,4/28/2021 0:00,55115.84375,52418.02734,46088929780,0,0,,0,,53555.10938
59,4/29/2021 0:00,57900.71875,53129.60156,52395931985,0,0,,1,https://badcryptopodcast.com/2021/04/29/bitcoi...,57750.17578


Berikut kami melakukan set index column dari DataFrame menjadi tanggal/date dari pergerakan bitcoin dan juga drop variabel catatan (link website untuk good and bad news)

In [3]:
dataset = dataset.set_index('Date')
dataset = dataset.drop("Link Good News", axis = 1)
dataset = dataset.drop("Link Bad News", axis = 1)

dataset

Unnamed: 0_level_0,High,Low,Volume,Weekend,Good News,Bad News,Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3/1/2021 0:00,49784.01563,45115.09375,53891300112,0,0,0,49631.24219
3/2/2021 0:00,50127.51172,47228.84375,47530897720,0,0,0,48378.98828
3/3/2021 0:00,52535.13672,48274.32031,53220811975,0,0,0,50538.24219
3/4/2021 0:00,51735.08984,47656.92969,52343816680,0,0,0,48561.16797
3/5/2021 0:00,49396.42969,46542.51563,48625928883,0,0,0,48927.30469
...,...,...,...,...,...,...,...
4/26/2021 0:00,55416.96484,53319.18750,49448222757,0,0,0,55033.11719
4/27/2021 0:00,56227.20703,53887.91797,48000572955,0,0,1,54824.70313
4/28/2021 0:00,55115.84375,52418.02734,46088929780,0,0,0,53555.10938
4/29/2021 0:00,57900.71875,53129.60156,52395931985,0,0,1,57750.17578


Memasukan nilai X (independent variable) dan nilai Y (dependent variable)

In [4]:
y = dataset["Price"]
x = dataset.drop("Price", axis = 1)

print(x)
print(y)

                       High          Low  ...  Good News  Bad News
Date                                      ...                     
3/1/2021 0:00   49784.01563  45115.09375  ...          0         0
3/2/2021 0:00   50127.51172  47228.84375  ...          0         0
3/3/2021 0:00   52535.13672  48274.32031  ...          0         0
3/4/2021 0:00   51735.08984  47656.92969  ...          0         0
3/5/2021 0:00   49396.42969  46542.51563  ...          0         0
...                     ...          ...  ...        ...       ...
4/26/2021 0:00  55416.96484  53319.18750  ...          0         0
4/27/2021 0:00  56227.20703  53887.91797  ...          0         1
4/28/2021 0:00  55115.84375  52418.02734  ...          0         0
4/29/2021 0:00  57900.71875  53129.60156  ...          0         1
4/30/2021 0:00  58448.33984  57052.27344  ...          0         0

[61 rows x 6 columns]
Date
3/1/2021 0:00     49631.24219
3/2/2021 0:00     48378.98828
3/3/2021 0:00     50538.24219
3/4/2021 0:

Lakuin scaling menggunakan MinMaxScaler, yaitu suatu cara untuk membuat numerical data pada dataset memiliki rentang nilai (scale) yang sama.

In [5]:
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

Split dataset training dan testing dengan training (80%) dan testing (20%)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

Disini lakukan training model menggunakan x_train dan y_train, lalu cek coefficient dan MSE, lalu cek score dengan menggunakan data testing, lalu berhasil mendapatkan score sebesar 96.08%

In [7]:
model = LinearRegression()
model = model.fit(x_train, y_train)
prediction = model.predict(x_test)
print(f"Coef: {model.coef_}")
print(f"MSE: {mean_squared_error(y_test, prediction)}")
print("==================================================================")
print(f"Score: {model.score(x_test, y_test)*100:.2f}%")

Coef: [10721.84836692  4920.9520934  -1642.23668641    68.1695653
   307.8458196    877.37290731]
MSE: 417468.9971194613
Score: 97.65%
