**Import Libraries**

In [1]:
import pandas as pd
from pandas import read_csv
import datetime as dt

import numpy as np
from numpy import arange

import scipy.stats as stats
from scipy.stats import randint as sp_randint

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC,SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

import joblib
import yfinance as yf

Load Data

In [2]:
# Read the BBCA price file, rename its lower-case columns to title case and
# parse the date column into datetime values.
column_titles = {'timestamp':'Date', 'open':'Open', 'low':'Low', 'high':'High', 'close':'Close', 'volume':'Volume'}
df = (
    pd.read_csv('./Dataset/BBCA.csv')
    .rename(columns=column_titles)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
df.head()

Unnamed: 0,Date,Open,Low,High,Close,Volume
0,2001-04-16,175,175,180,177,0
1,2001-04-17,175,175,180,177,0
2,2001-04-18,175,175,180,177,0
3,2001-04-19,175,175,180,177,0
4,2001-04-20,175,175,180,177,0


In [3]:
# Summary statistics (count, mean, quartiles, min/max, std) of the loaded frame.
df.describe()

Unnamed: 0,Date,Open,Low,High,Close,Volume
count,5670,5670.0,5670.0,5670.0,5670.0,5670.0
mean,2012-02-25 12:00:00,2418.109877,2393.385538,2442.639506,2418.603175,89190480.0
min,2001-04-16 00:00:00,175.0,175.0,177.0,177.0,0.0
25%,2006-09-20 06:00:00,460.0,455.0,467.0,460.5,27190000.0
50%,2012-02-25 12:00:00,1585.0,1570.0,1600.0,1585.0,61614500.0
75%,2017-08-01 18:00:00,3743.75,3715.0,3765.0,3740.0,104548100.0
max,2023-01-06 00:00:00,9050.0,8975.0,9400.0,9300.0,1949960000.0
std,,2379.071545,2357.118837,2400.150856,2378.79815,127492500.0


**Data Cleaning**

Remove rows that contain a zero value, then drop duplicate rows and rows with missing values

In [4]:
# Keep only rows in which no column equals zero (the first rows of the raw
# data carry Volume == 0), then drop exact duplicate rows and rows with NaN.
# drop_duplicates() and dropna() return new frames, so their results have to
# be assigned back; called on their own they leave `df` untouched.
df = df[(df != 0).all(axis=1)].drop_duplicates().dropna()
df

Unnamed: 0,Date,Open,Low,High,Close,Volume
821,2004-06-08,175,175,180,177,499150000
822,2004-06-09,177,175,182,180,294290000
823,2004-06-10,180,177,180,180,165590000
824,2004-06-11,177,177,180,180,135830000
825,2004-06-14,180,175,180,177,158540000
...,...,...,...,...,...,...
5665,2023-01-02,8575,8500,8600,8550,10653900
5666,2023-01-03,8550,8525,8600,8550,27399100
5667,2023-01-04,8525,8350,8575,8350,90918800
5668,2023-01-05,8350,8150,8375,8250,128838500


**Exploratory Data Analysis**

Moving average adalah rata-rata harga saham pada periode tertentu. Indikator ini sering digunakan untuk melihat arah tren harga suatu saham. Terdapat dua jenis moving average yang sering digunakan, yaitu simple moving average (SMA) dan exponential moving average (EMA). SMA didapat dari rerata harga pada periode tertentu dengan bobot yang sama untuk setiap hari, sedangkan EMA memberikan bobot yang lebih besar pada harga terbaru.

Pada kode berikut, kita membuat simple moving average dengan periode 5, 10, 20, 50, dan 100 hari.

In [7]:
# Simple moving averages of the closing price: one "MA<window>" column per
# window length, where the window is counted in rows of `df`.
ma_day = [5, 10, 20, 50, 100]
ma = pd.DataFrame(
    {f"MA{window}": df['Close'].rolling(window).mean() for window in ma_day}
)

Pada grafik berikut, kita tampilkan data historis saham, moving average, dan volume. Data historis saham direpresentasikan menggunakan grafik candlestick. Setiap lilin merepresentasikan harga harian: buka, tutup, tertinggi, dan terendah. Lilin berwarna hijau berarti harga tutup lebih tinggi daripada harga buka, sedangkan lilin berwarna merah berarti sebaliknya.

In [8]:
# Two stacked panels sharing the date axis: candlesticks plus moving averages
# on top, traded volume below.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
               vertical_spacing=0.12, subplot_titles=('BBCA', 'Volume'),
               row_width=[0.2, 0.6])

fig.add_trace(go.Candlestick(x=df.Date, open=df.Open, low=df.Low, high=df.High, close=df.Close, showlegend=False), row=1, col=1)
# One line per moving-average column. Driving the traces from the column
# names guarantees that each legend label matches the series it plots
# (previously the 'MA5' and 'MA100' traces both plotted ma.MA50).
for ma_column in ma.columns:
    fig.add_trace(go.Scatter(x=df.Date, y=ma[ma_column], name=ma_column), row=1, col=1)
fig.add_trace(go.Scatter(x=df.Date, y=df.Volume, name='Volume'), row=2, col=1)
# Hide the range slider that candlestick charts add to the x axis.
fig.update(layout_xaxis_rangeslider_visible=False)
fig.update_layout(height=800)
fig.show()

In [33]:
# Side-by-side box plots: closing price on the left, volume on the right.
fig = make_subplots(rows=1, cols=2, shared_xaxes=True)

box_series = [(df.Close, 'price'), (df.Volume, 'volume')]
for panel_col, (values, label) in enumerate(box_series, start=1):
    fig.add_trace(go.Box(y=values, name=label), row=1, col=panel_col)

fig.update_layout(title='Distribusi Harga dan Volume')
fig.show()

In [32]:
# Average every column per calendar month, then group those monthly means by
# month-of-year (1-12) to compare the distributions across months.
bbca_resample = df.resample('M', on='Date').mean()
month_of_year = bbca_resample.index.month

fig = make_subplots(rows=2, cols=1)
monthly_panels = [('Close', 'price'), ('Volume', 'volume')]
for panel_row, (column, label) in enumerate(monthly_panels, start=1):
    fig.add_trace(
        go.Box(x=month_of_year, y=bbca_resample[column], name=label),
        row=panel_row,
        col=1,
    )

fig.update_layout(height=800, title="Distribusi Harga dan Volume per Bulan")
fig.show()

**Find Feature Correlation**

In [34]:
# Pairwise correlation matrix of the frame's columns, drawn as an annotated
# heatmap with each cell labelled to two decimals.
df_corr = df.corr()

heatmap = go.Heatmap(
    x=df_corr.columns,
    y=df_corr.index,
    z=df_corr.to_numpy(),
    text=df_corr.values,
    texttemplate='%{text:.2f}',
)
fig = go.Figure(data=[heatmap])
fig.update_layout(title='Korelasi Fitur')
fig.show()

**Feature Selection**

Using Close as the target variable (this is a regression task, so the target is a continuous price rather than a class)

In [14]:
# Predictors are the open/low/high prices and the volume of each row;
# the value to predict is that row's closing price.
feature_columns = ['Open', 'Low', 'High', 'Volume']
target_column = 'Close'
x = df.loc[:, feature_columns]
y = df[target_column]

**Splitting dataset**

In [15]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# repeatable. train_test_split shuffles by default, so the test rows are
# scattered over the whole date range rather than forming one time window.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.25,
)

**Scaling**

Experimenting with feature scaling; this run uses min-max scaling

In [16]:
# Fit the scaler on the training split only, then apply that same scaling to
# both splits so the test rows do not influence the fitted min/max.
minmax = MinMaxScaler().fit(x_train)
x_train, x_test = minmax.transform(x_train), minmax.transform(x_test)

**Random Forest Regressor**

In [36]:
# Random forest with default hyper-parameters. The forest is built from
# random bootstrap samples, so a fixed seed (same value as the train/test
# split) is needed for the predictions and metrics to be reproducible.
rfmodel = RandomForestRegressor(random_state=0)
rfmodel.fit(x_train, y_train)
rfpredict = rfmodel.predict(x_test)

In [37]:
# y_test is a shuffled 25% sample of the rows, so it has to be plotted against
# the dates of its own rows (looked up through the index it kept from df),
# ordered chronologically - not against the full df.Date column, which is
# longer and in a different order.
test_dates = df.loc[y_test.index, 'Date'].sort_values()
rf_predicted = pd.Series(rfpredict, index=y_test.index)

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_dates, y=y_test.loc[test_dates.index], name='Actual'))
fig.add_trace(go.Scatter(x=test_dates, y=rf_predicted.loc[test_dates.index], name='Predicted'))
fig.update_layout(title="Prediksi Harga Saham Dataset Test")
fig.show()

**Support Vector Regression**

In [38]:
# Baseline support-vector regressor with default hyper-parameters.
svrmodel = SVR().fit(x_train, y_train)
svrpredict = svrmodel.predict(x_test)

In [39]:
# y_test is a shuffled 25% sample of the rows, so it has to be plotted against
# the dates of its own rows (looked up through the index it kept from df),
# ordered chronologically - not against the full df.Date column, which is
# longer and in a different order.
test_dates = df.loc[y_test.index, 'Date'].sort_values()
svr_predicted = pd.Series(svrpredict, index=y_test.index)

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_dates, y=y_test.loc[test_dates.index], name='Actual'))
fig.add_trace(go.Scatter(x=test_dates, y=svr_predicted.loc[test_dates.index], name='Predicted'))
fig.update_layout(title="Prediksi Harga Saham Dataset Test")
fig.show()

**Linear Regression**

In [40]:
# Ordinary least-squares baseline on the scaled features.
lrmodel = LinearRegression().fit(x_train, y_train)
lrpredict = lrmodel.predict(x_test)

In [41]:
# y_test is a shuffled 25% sample of the rows, so it has to be plotted against
# the dates of its own rows (looked up through the index it kept from df),
# ordered chronologically - not against the full df.Date column, which is
# longer and in a different order.
test_dates = df.loc[y_test.index, 'Date'].sort_values()
lr_predicted = pd.Series(lrpredict, index=y_test.index)

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_dates, y=y_test.loc[test_dates.index], name='Actual'))
fig.add_trace(go.Scatter(x=test_dates, y=lr_predicted.loc[test_dates.index], name='Predicted'))
fig.update_layout(title='Prediksi Harga Saham Dataset Test')
fig.show()

**Decision Tree**

In [42]:
# Decision tree with default hyper-parameters. scikit-learn permutes the
# features randomly at each split, so a fixed seed (same value as the
# train/test split) is needed to get the same tree on every run.
dtmodel = DecisionTreeRegressor(random_state=0)
dtmodel.fit(x_train, y_train)
dtpredict = dtmodel.predict(x_test)

In [43]:
# y_test is a shuffled 25% sample of the rows, so it has to be plotted against
# the dates of its own rows (looked up through the index it kept from df),
# ordered chronologically - not against the full df.Date column, which is
# longer and in a different order.
test_dates = df.loc[y_test.index, 'Date'].sort_values()
dt_predicted = pd.Series(dtpredict, index=y_test.index)

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_dates, y=y_test.loc[test_dates.index], name='Actual'))
fig.add_trace(go.Scatter(x=test_dates, y=dt_predicted.loc[test_dates.index], name='Predicted'))
fig.update_layout(title='Prediksi Harga Saham Dataset Test')
fig.show()

In [54]:
# Fetch the latest month of BBCA quotes from Yahoo Finance (`yf` is imported
# in the notebook's import cell) and prepare them exactly like the training
# data: same four feature columns, scaled with the scaler fitted on x_train.
bbca = yf.Ticker('bbca.jk')
dfbbca = bbca.history(period='1mo')
if dfbbca.empty:
    # Fail here with a clear message instead of letting the scaler raise
    # an obscure error on an empty frame further down.
    raise RuntimeError("yfinance returned no rows for 'bbca.jk' (period='1mo')")

x_oneMonth = dfbbca[['Open', 'Low', 'High', 'Volume']]
y_oneMonth = dfbbca['Close']
x_oneMonth = minmax.transform(x_oneMonth)

In [55]:
# Score the downloaded month with each of the four fitted models, in the
# order random forest, SVR, linear regression, decision tree.
rfonemonth, svronemonth, lronemonth, dtonemonth = (
    estimator.predict(x_oneMonth)
    for estimator in (rfmodel, svrmodel, lrmodel, dtmodel)
)

In [58]:
# Overlay the actual closing prices of the downloaded month with every
# model's prediction for the same dates.
fig = go.Figure()

one_month_traces = [
    (y_oneMonth, 'Actual'),
    (rfonemonth, 'RF Prediction'),
    (svronemonth, 'SVR Prediction'),
    (lronemonth, 'LR Prediction'),
    (dtonemonth, 'DT Prediction'),
]
for trace_values, trace_label in one_month_traces:
    fig.add_trace(go.Scatter(x=dfbbca.index, y=trace_values, name=trace_label))
fig.show()

**Parameter Tuning For SVR**

In [69]:
# Randomized hyper-parameter search for SVR (`stats` is scipy.stats, imported
# in the notebook's import cell). Despite its name, `rf_params` holds the SVR
# search space: C and epsilon are drawn uniformly from [0, 50] and [0, 1],
# the kernel is picked from a fixed list.
rf_params = {
    'C': stats.uniform(0,50),
    "kernel":['poly','rbf','sigmoid'],
    "epsilon":stats.uniform(0,1)
}
n_iter_search=20
clf = SVR(gamma='scale')
# random_state seeds the parameter sampling, so the same 20 candidates - and
# therefore the same best_params_ - come out on every run.
rscv = RandomizedSearchCV(
    clf,
    param_distributions=rf_params,
    n_iter=n_iter_search,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=0,
)
rscv.fit(x_train, y_train)
print(rscv.best_params_)

{'C': 47.878637725622134, 'epsilon': 0.24681243881075166, 'kernel': 'rbf'}


In [70]:
# Refit SVR with the hyper-parameters printed by the randomized search; this
# replaces the baseline svrmodel / svrpredict.
tuned_svr_params = dict(C=47.878637725622134, epsilon=0.24681243881075166, kernel='rbf')
svrmodel = SVR(**tuned_svr_params).fit(x_train, y_train)
svrpredict = svrmodel.predict(x_test)

In [71]:
# y_test is a shuffled 25% sample of the rows, so it has to be plotted against
# the dates of its own rows (looked up through the index it kept from df),
# ordered chronologically - not against the full df.Date column, which is
# longer and in a different order.
test_dates = df.loc[y_test.index, 'Date'].sort_values()
svr_predicted = pd.Series(svrpredict, index=y_test.index)

fig = go.Figure()
fig.add_trace(go.Scatter(x=test_dates, y=y_test.loc[test_dates.index], name='Actual'))
fig.add_trace(go.Scatter(x=test_dates, y=svr_predicted.loc[test_dates.index], name='Predicted'))
fig.update_layout(title="Prediksi Harga Saham Dataset Test")
fig.show()

In [72]:
# Recompute the one-month SVR predictions now that svrmodel has been refit
# with the tuned hyper-parameters; overwrites the baseline svronemonth.
svronemonth = svrmodel.predict(x_oneMonth)

In [73]:
# Same one-month comparison as before, redrawn with the current values of the
# prediction arrays (svronemonth now comes from the tuned SVR).
fig = go.Figure()

one_month_traces = [
    (y_oneMonth, 'Actual'),
    (rfonemonth, 'RF Prediction'),
    (svronemonth, 'SVR Prediction'),
    (lronemonth, 'LR Prediction'),
    (dtonemonth, 'DT Prediction'),
]
for trace_values, trace_label in one_month_traces:
    fig.add_trace(go.Scatter(x=dfbbca.index, y=trace_values, name=trace_label))
fig.show()

**Evaluation Metrics**

In [93]:
# Build one row of test-set metrics per model and assemble the table in a
# single DataFrame call. Growing the table with pd.concat onto an empty frame
# inside the loop re-copies it on every iteration and triggers pandas'
# empty-frame concat deprecation warning.
preds = [rfpredict, svrpredict, lrpredict, dtpredict]
models = [rfmodel, svrmodel, lrmodel, dtmodel]
nm = ['Random Forest', 'SVR', 'Linear Regression', 'Decision Tree']

metric_rows = []
for model_name, predicted, model in zip(nm, preds, models):
    # MSE is computed once and reused for RMSE.
    mse = metrics.mean_squared_error(y_test, predicted)
    # Per-row absolute percentage error; "accuracy" is 100 minus its mean.
    mape = 100 * (abs(predicted - y_test) / y_test)
    metric_rows.append({
        'name': model_name,
        # Column key kept as 'mea' (it holds the mean absolute error) so
        # existing references to the column keep working.
        'mea': round(metrics.mean_absolute_error(y_test, predicted), 4),
        'mse': round(mse, 4),
        'rmse': round(np.sqrt(mse), 4),
        'r2': round(metrics.r2_score(y_test, predicted), 4),
        # Regressor.score() returns R^2; reported here as a percentage.
        'train_score': model.score(x_train, y_train) * 100,
        'test_score': model.score(x_test, y_test) * 100,
        'accuracy': round(100 - np.mean(mape), 2),
    })

val = pd.DataFrame(metric_rows).set_index(['name'])
val

Unnamed: 0_level_0,mea,mse,rmse,r2,train_score,test_score,accuracy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Random Forest,16.2541,1022.9928,31.9843,0.9998,99.997453,99.981279,99.42
SVR,46.3173,18935.4921,137.6063,0.9965,99.585116,99.653481,95.7
Linear Regression,16.4978,708.6328,26.6202,0.9999,99.986152,99.987032,99.25
Decision Tree,20.3443,1598.581,39.9823,0.9997,100.0,99.970746,99.28


In [95]:
# Write each fitted model to its own joblib file in the current working
# directory; file names pair up with `models` by position.
nm = ['Random Forest.sav', 'SVR.sav', 'Linear Regression.sav', 'Decision Tree.sav']
for file_name, estimator in zip(nm, models):
    joblib.dump(value=estimator, filename=file_name)