In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, date
import seaborn as sns

# Load Data
* Handle missing values, duplicated values, and outliers

In [2]:
# Location of the saved price dataset.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
csv_path = "/Users/ibulmnie/Documents/GitHub/ML20222.PredictionBitcoin/data/saved_data.csv"

# Read the raw table and turn the 'Date' column into pandas datetimes.
total = pd.read_csv(csv_path)
total = total.assign(Date=pd.to_datetime(total['Date']))

# Date-indexed copy for the exploratory cells; `total` keeps 'Date' as a regular column.
df = total.set_index(keys='Date')
df.head(5)

In [3]:
# (rows, columns) of the date-indexed frame.
df.shape

In [4]:
# Per-column dtype and non-null count overview of the date-indexed frame.
df.info()

In [5]:
# Descriptive statistics of the date-indexed frame's columns.
df.describe()

In [6]:
# Pairwise correlation between the columns, drawn as a red/blue heatmap.
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, cmap="RdBu")

# Split Data (Testing, Training Data Sets)

In [7]:
from datetime import datetime

# Keep only the calendar date of every row. `.dt.date` converts the whole column
# in one step and yields plain `datetime.date` objects, which is what the
# comparisons against the `date` bounds below need. Writing `date` objects into
# a datetime64 column cell by cell relies on pandas implicitly changing the
# column dtype and is slow.
total["Date"] = pd.to_datetime(total["Date"]).dt.date

Start_day = date(2015, 12, 30)
Test_day = date(2018, 4, 1)
# Pick out roughly 20-25% of the period-1 data for testing.
End_day = date(2018, 9, 30)

# Restrict to the study period (both ends inclusive), then split chronologically:
# rows before Test_day are used for training, rows from Test_day on for testing.
in_period = (total["Date"] >= Start_day) & (total["Date"] <= End_day)
total = total[in_period].reset_index(drop=True)
train_dataset = total[total["Date"] < Test_day].reset_index(drop=True)
test_dataset = total[total["Date"] >= Test_day].reset_index(drop=True)


In [8]:
# Supervised pairs for next-row prediction: the features of row t are paired
# with the BTC close of row t+1, so the last feature row and the first target
# value have no partner and are dropped.
X_train = train_dataset.drop(columns=["Date"]).iloc[:-1]
y_train = train_dataset["BTC_close"].iloc[1:].reset_index(drop=True)

X_test = test_dataset.drop(columns=["Date"]).iloc[:-1]
y_test = test_dataset["BTC_close"].iloc[1:].reset_index(drop=True)

In [9]:
# Report the dimensions of each split as a quick sanity check
# ("Kích thước" is the size label printed in front of each name).
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"Kích thước {split_name}:", split_data.shape)

# Load Model

In [10]:
#import sklearn modules
import time
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

In [11]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Candidate hyper-parameter values for the randomized search.
# Tree counts: 100 evenly spaced values from 10 to 1000, truncated to integers.
n_estimators = np.linspace(start=10, stop=1000, num=100).astype(int).tolist()
# Maximum tree depths: 10 evenly spaced values from 2 to 20, truncated to integers.
max_depth = np.linspace(2, 20, num=10).astype(int).tolist()
# Options for the number of features considered at each split.
max_features = ['sqrt', 'log2', None]
# Method used to sample data points for each tree.
bootstrap = [True, False]

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    bootstrap=bootstrap,
)

print(random_grid)

# Training and Evaluating

In [12]:
from sklearn.ensemble import RandomForestRegressor
# Base estimator handed to the hyper-parameter search.
# A fixed seed makes the cross-validation scores repeatable between runs.
rf = RandomForestRegressor(random_state=42)

In [13]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized hyper-parameter search over `random_grid`:
# 500 sampled settings, cv=5, run on all available cores.
# NOTE(review): cv=5 uses the default splitter, which is not time-ordered —
# confirm that is acceptable for chronological price data.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=500,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination selected by the search.
print ('Best Parameters: ', rf_random.best_params_)

## Using the best parameters


In [None]:
# Time the fit of the final full-feature model.
# perf_counter is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments.
start_time = time.perf_counter()

# Rebuild the forest from the best settings found by the search; the fixed seed
# makes the fitted model, and every metric derived from it, repeatable.
randmf = RandomForestRegressor(**rf_random.best_params_, random_state=42)
randmf.fit(X_train, y_train)

end_time = time.perf_counter()
all_run_time = end_time - start_time

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Predict the test split with the full-feature model.
y_pred = randmf.predict(X_test)

# Compare the distributions of actual and predicted prices with density curves.
fig, ax = plt.subplots(figsize=(5, 7))
sns.kdeplot(y_test, color="r", label="Actual Value", ax=ax)
sns.kdeplot(y_pred, color="b", label="Fitted Values", ax=ax)

ax.set_title('Actual vs Fitted Values for Price')
ax.set_xlabel('Price')
ax.set_ylabel('Density')
ax.legend()
plt.show()
plt.close()


In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
import numpy as np
# In-sample predictions, used below to report the fit on the training split.
y_train_pred = randmf.predict(X_train)

# Plain NumPy array so the AUC helper can index it by position (0..n-1).
y_test = np.array(y_test)

def AUC(y_test, y_pred):
    """Directional accuracy of a forecast (not the ROC "area under the curve").

    Looks at every pair of consecutive observations and counts how often the
    predicted series moves in the same direction (up or down) as the actual
    series. Steps where either series is unchanged count as misses.

    Parameters
    ----------
    y_test : array-like of length n
        Actual values, in order.
    y_pred : array-like
        Predicted values aligned with ``y_test``; only the first ``n`` are used.

    Returns
    -------
    float
        Share of the ``n - 1`` steps whose direction was predicted correctly,
        or ``nan`` when ``y_test`` has fewer than two values (there is no step
        to evaluate).

    Raises
    ------
    IndexError
        If ``y_pred`` has fewer values than ``y_test``.
    """
    # Work on positional arrays: a pandas Series with a non-default index would
    # otherwise be indexed by label instead of by position.
    actual = np.asarray(y_test)
    predicted = np.asarray(y_pred)

    n_points = len(actual)
    if n_points < 2:
        return float("nan")
    if len(predicted) < n_points:
        raise IndexError("y_pred has fewer values than y_test")
    predicted = predicted[:n_points]

    # A step is a hit when both series change with the same sign, i.e. the
    # product of the two changes is strictly positive.
    same_direction = np.diff(actual, axis=0) * np.diff(predicted, axis=0) > 0
    return int(np.count_nonzero(same_direction)) / (n_points - 1)
# Training split: how closely the model reproduces the data it was fitted on.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)

# Test split: these three values are reused by the summary table.
rmse_all = np.sqrt(mean_squared_error(y_test, y_pred))
mape_all = mean_absolute_percentage_error(y_test, y_pred)
AUC_all = AUC(y_test, y_pred)

print("Test accuracy for train set")
print("Root Mean Square Error (RMSE):", train_rmse)
print("Mean Absolute Percentage Error (MAPE):", train_mape)
print()

print("Test accuracy for test set")
print("Root Mean Square Error (RMSE):", rmse_all)
print(" Mean Absolute Percentage Error (MAPE):", mape_all)
print()
print("AUC test:", AUC_all)

## Feature Reduction

In [None]:
features = X_train.columns
# Importance of each feature as reported by the fitted forest, in the same
# order as `features`.
importances = list(randmf.feature_importances_)
# (feature, importance) pairs at full precision. Rounding here would distort
# both the ranking and the cumulative sum that the feature-selection cells
# compute from these pairs (many small importances round down to 0.0).
feature_importances = list(zip(features, importances))
# Sort the feature importances by most important first.
feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)
# Print one line per feature; values are rounded for display only.
for feature_name, feature_score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature_name, round(feature_score, 2)))

In [None]:
# One bar per feature, in the original column order.
x_values = list(range(len(importances)))
plt.bar(
    x_values,
    importances,
    orientation='vertical',
    color='r',
    edgecolor='k',
    linewidth=1.2,
)
# Label every bar with its feature name, written vertically.
plt.xticks(x_values, features, rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances');

In [None]:
# Split the ranked (feature, importance) pairs into two parallel lists.
sorted_features = [name for name, _ in feature_importances]
sorted_importances = [score for _, score in feature_importances]
# Running total of importance, adding features from most to least important.
cumulative_importances = np.cumsum(sorted_importances)

plt.plot(x_values, cumulative_importances, 'g-')
# Dashed reference line at 95% of importance retained.
plt.hlines(y=0.95, xmin=0, xmax=len(sorted_importances), color='r', linestyles='dashed')
plt.xticks(x_values, sorted_features, rotation='vertical')
plt.xlabel('Variable')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances');

In [None]:
# Smallest number of top-ranked features whose cumulative importance exceeds 95%.
crossed = np.flatnonzero(np.asarray(cumulative_importances) > 0.95)
if crossed.size:
    # Positions are zero-based, so add 1 to turn the first crossing into a count.
    num = int(crossed[0]) + 1
else:
    # The threshold is never crossed (e.g. rounded importances that sum to less
    # than 0.95): keep every feature instead of failing with an IndexError.
    num = len(cumulative_importances)
print('Number of features for 95% importance:', num)

In [None]:
# Names of the `num` highest-ranked features.
important_feature_names = [name for name, _ in feature_importances[:num]]
print(important_feature_names)

In [None]:
# Restrict both splits to the selected feature columns.
train_data = X_train.loc[:, important_feature_names]
test_data = X_test.loc[:, important_feature_names]
# Sanity check on operations
for split_label, split_frame in (("train", train_data), ("test", test_data)):
    print(f'Important {split_label} features shape:', split_frame.shape)


#### Training and Evaluating on Important Features


In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

# Fit a fresh forest with the same hyper-parameters on the important features
# only. Refitting `randmf` itself would silently turn the full-feature model
# into the reduced one, so re-running an earlier cell (test predictions,
# feature importances) would then fail or describe the wrong model.
randmf_reduce = RandomForestRegressor(**randmf.get_params())

# Time the fit only, so the figure is comparable with `all_run_time`.
start_time = time.perf_counter()
randmf_reduce.fit(train_data, y_train)
end_time = time.perf_counter()
reduce_run_time = end_time - start_time

# Make predictions on test data
predictions = randmf_reduce.predict(test_data)

#RMSE
rmse_reduce = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rmse_reduce)
print()

#MAPE (same library metric as used for the full-feature model)
mape_reduce = mean_absolute_percentage_error(y_test, predictions)
print("MAPE:", mape_reduce)
print()

# Share of steps whose direction (up/down) was predicted correctly.
AUC_reduce = AUC(np.array(y_test), predictions)
print("AUC test:", AUC_reduce )

#### Normalization with original data

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Learn the min-max ranges from the training features only, then apply the
# same fitted scaler to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_normalized_all = scaler.transform(X_train)
X_test_normalized_all = scaler.transform(X_test)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

# Best hyper-parameters from the search; the fixed seed makes the run repeatable.
rf_regressor = RandomForestRegressor(**rf_random.best_params_, random_state=42)

# Time the fit only, so the figure is comparable with `all_run_time`.
start_time = time.perf_counter()
rf_regressor.fit(X_train_normalized_all, y_train)
end_time = time.perf_counter()
normal_all_run_time = end_time - start_time

y_pred_norma_all = rf_regressor.predict(X_test_normalized_all)

#RMSE
rmse_normal_all = np.sqrt(mean_squared_error(y_test, y_pred_norma_all))
print("RMSE:", rmse_normal_all)
print()

#MAPE (same library metric as used for the un-normalized full-feature model)
mape_normal_all = mean_absolute_percentage_error(y_test, y_pred_norma_all)
print("MAPE:", mape_normal_all)
print()

# Share of steps whose direction (up/down) was predicted correctly.
AUC_normal_all = AUC(np.array(y_test), y_pred_norma_all)
print("AUC test:", AUC_normal_all )

#### Normalization with Important Features

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Learn the min-max ranges from the reduced training features only, then apply
# the same fitted scaler to both reduced splits.
scaler = MinMaxScaler()
scaler.fit(train_data)
X_train_normalized_reduce = scaler.transform(train_data)
X_test_normalized_reduce = scaler.transform(test_data)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

# Best hyper-parameters from the search; the fixed seed makes the run repeatable.
rf_regressor = RandomForestRegressor(**rf_random.best_params_, random_state=42)

# Time the fit only, so the figure is comparable with `all_run_time`.
start_time = time.perf_counter()
rf_regressor.fit(X_train_normalized_reduce, y_train)
end_time = time.perf_counter()
normal_reduce_run_time = end_time - start_time

y_pred_norma_reduce = rf_regressor.predict(X_test_normalized_reduce)

#RMSE
rmse_normal_reduce = np.sqrt(mean_squared_error(y_test, y_pred_norma_reduce))
print("RMSE:", rmse_normal_reduce)
print()

#MAPE (same library metric as used for the un-normalized full-feature model)
mape_normal_reduce = mean_absolute_percentage_error(y_test, y_pred_norma_reduce)
print("MAPE:", mape_normal_reduce)
print()

# Share of steps whose direction (up/down) was predicted correctly.
AUC_normal_reduce = AUC(np.array(y_test), y_pred_norma_reduce)
print("AUC test:", AUC_normal_reduce)

## Summary statistical table

In [None]:
def print_results(AUC_all, rmse_all, mape_all, all_run_time, AUC_reduce, rmse_reduce, mape_reduce, reduce_run_time,
                  AUC_normal_all, rmse_normal_all, mape_normal_all, normal_all_run_time,
                  AUC_normal_reduce, rmse_normal_reduce, mape_normal_reduce, normal_reduce_run_time,
                  n_all_features=None, n_reduced_features=None):
    """Print a plain-text table comparing the four model variants.

    One row is printed per variant ('All', 'Reduce', 'Normalization - All',
    'Normalization - Reduce') with its feature count, accuracy, RMSE, MAPE and
    run time. Columns are left-aligned and sized to their widest entry.

    Parameters
    ----------
    AUC_*, rmse_*, mape_*, *_run_time
        Accuracy, RMSE, MAPE and run time in seconds of each variant, in the
        order all / reduce / normal_all / normal_reduce.
    n_all_features : int, optional
        Feature count of the "All" rows. Defaults to ``len(importances)``
        taken from the notebook namespace.
    n_reduced_features : int, optional
        Feature count of the "Reduce" rows. Defaults to
        ``len(important_feature_names)`` taken from the notebook namespace.
    """
    # Fall back to the notebook-level variables when the counts are not given.
    if n_all_features is None:
        n_all_features = len(importances)
    if n_reduced_features is None:
        n_reduced_features = len(important_feature_names)

    headers = ['Type', 'Number of Features', 'Accuracy', 'RMSE', 'MAPE', 'Run Time (s)']
    all_results = [
        ['All', n_all_features, AUC_all, rmse_all, mape_all, all_run_time],
        ['Reduce', n_reduced_features, AUC_reduce, rmse_reduce, mape_reduce, reduce_run_time],
        ['Normalization - All', n_all_features, AUC_normal_all, rmse_normal_all, mape_normal_all,
         normal_all_run_time],
        ['Normalization - Reduce', n_reduced_features, AUC_normal_reduce, rmse_normal_reduce,
         mape_normal_reduce, normal_reduce_run_time],
    ]

    # Each column is as wide as its longest entry (header included).
    col_widths = [max(len(str(row[i])) for row in all_results + [headers]) for i in range(len(headers))]

    # Header and data rows share one left-aligned layout.
    column_gap = '  '
    row_format = column_gap.join(f"{{:<{width}}}" for width in col_widths)

    print(row_format.format(*headers))

    # The separator spans exactly the table width: all columns plus the gaps
    # between them.
    print('-' * (sum(col_widths) + len(column_gap) * (len(col_widths) - 1)))

    for result in all_results:
        print(row_format.format(*result))
        
# Print the comparison table for the four model variants evaluated above.
print_results(AUC_all, rmse_all, mape_all, all_run_time, AUC_reduce, rmse_reduce, mape_reduce, reduce_run_time,
             AUC_normal_all, rmse_normal_all, mape_normal_all, normal_all_run_time,
             AUC_normal_reduce, rmse_normal_reduce, mape_normal_reduce, normal_reduce_run_time)


# Graph Predicted Values with Test Set

In [None]:

# Display the predictions against the actual test-split prices.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 5))
ax.plot(y_test, color='red', label="Bitcoin Price")
ax.plot(y_pred, color='green', linestyle="dashed", label="Predicted Bitcoin Price")
# Show y-axis ticks as whole numbers with thousands separators.
thousands_formatter = plt.FuncFormatter(lambda value, pos: "{:,}".format(int(value)))
ax.yaxis.set_major_formatter(thousands_formatter)
ax.set_title("Random Forest Regression for Period 1")
ax.legend()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(25, 5))

# Full price history of the study period.
ax.plot(total['Date'], total['BTC_close'], color='red', label="Bitcoin Price")

# Each target is the close of the row after its feature row, so the targets of
# a split belong to that split's own dates from the second row onwards.
# Training dates come first and test dates last; slicing `total['Date']` from
# the wrong end would draw both series over the wrong period.
train_dates = train_dataset['Date'].iloc[1:]
test_dates = test_dataset['Date'].iloc[1:]

ax.plot(train_dates, y_train, color='blue', label="Training Data")
ax.plot(test_dates, y_test, color='orange', label="Test Data")
ax.plot(test_dates, y_pred, color='green', label="Predicted Bitcoin Price", linestyle="dashed")
# Format the y-axis ticks as whole numbers with thousands separators.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))
plt.legend()
plt.show()
