In [205]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, date
import seaborn as sns

# Load Data
* Handle missing values, duplicated values, and outliers

In [206]:
# Source CSV published in the project repository.
DATA_URL = "https://raw.githubusercontent.com/lavibula/ML20222.PredictionBitcoin/main/data/saved_data.csv"

# `total` keeps "Date" as an ordinary column (parsed to timestamps);
# `df` is the same data indexed by date, used for the exploratory cells.
total = pd.read_csv(DATA_URL)
total["Date"] = pd.to_datetime(total["Date"])
df = total.set_index("Date")

df.head()

In [207]:
# (rows, columns) of the date-indexed frame.
df.shape

In [208]:
# Per-column dtype and non-null count summary of the date-indexed frame.
df.info()

In [209]:
# Descriptive statistics for the columns of the date-indexed frame.
df.describe()

In [210]:
# Heatmap of the pairwise column correlations, drawn with the "RdBu" colormap.
sns.heatmap(df.corr(), cmap="RdBu")

# Split Data (Testing, Training Data Sets)

In [211]:
from datetime import datetime

# Reduce the "Date" column to plain `datetime.date` objects in one vectorised
# pass so it compares directly with the `date` bounds below. The previous
# row-by-row `.loc` assignment wrote `date` objects into a datetime64 column
# one cell at a time, which is slow and leaves the resulting column dtype up
# to how the installed pandas handles the mixed assignment.
# Safe to re-run: pd.to_datetime also accepts a column that already holds dates.
total["Date"] = pd.to_datetime(total["Date"]).dt.date

Start_day = date(2018, 10, 1)
Test_day = date(2022, 8, 1)
# Test_day is chosen so that roughly 20-25% of the period-1 data is held out
End_day = date(2023, 4, 16)

# Keep only [Start_day, End_day], then split chronologically:
# train = rows before Test_day, test = rows from Test_day onwards.
total = total[(total["Date"] >= Start_day) & (total["Date"] <= End_day)].reset_index(drop=True)
train_dataset = total[total["Date"] < Test_day].reset_index(drop=True)
test_dataset = total[total["Date"] >= Test_day].reset_index(drop=True)


In [212]:
# Next-day target: the features of row t are paired with BTC_close of row t+1.
# Hence the last feature row (no following target) and the first target value
# (no preceding features) are dropped in each split.
X_train = train_dataset.drop(columns=["Date"]).iloc[:-1]
y_train = train_dataset["BTC_close"].iloc[1:].reset_index(drop=True)

X_test = test_dataset.drop(columns=["Date"]).iloc[:-1]
y_test = test_dataset["BTC_close"].iloc[1:].reset_index(drop=True)

In [213]:
# Share of the filtered rows that ended up in the test split.
n_test_rows, n_total_rows = len(test_dataset), len(total)
test_ratio = n_test_rows / n_total_rows

print("Tỉ lệ test_data/total:", test_ratio)

In [214]:
# Report the shape of each feature/target split.
for caption, part in (
    ("Kích thước X_train:", X_train),
    ("Kích thước y_train:", y_train),
    ("Kích thước X_test:", X_test),
    ("Kích thước y_test:", y_test),
):
    print(caption, part.shape)

# Load Model

In [215]:
#import sklearn modules
import time
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

In [216]:
from sklearn.ensemble import RandomForestRegressor
# Baseline forest with library-default hyper-parameters. The seed is pinned
# (same value the randomized search uses) so that re-running the notebook
# reproduces the same baseline metrics and the same search candidates' fits.
rf = RandomForestRegressor(random_state=42)

## Default

In [217]:
# Train the baseline forest and predict the held-out period in one go
# (fit() returns the fitted estimator itself).
y_pred = rf.fit(X_train, y_train).predict(X_test)

In [218]:
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
import numpy as np
# Work with the test targets as a plain NumPy array from here on, so the
# positional indexing used by the metrics below is unambiguous.
y_test = np.array(y_test)

# In-sample predictions of the baseline forest, used for the train-set metrics.
y_train_pred = rf.predict(X_train)

def AUC(y_test, y_pred):
    """Return the directional accuracy of a forecast.

    Despite its name this is not an area-under-curve metric: it is the
    fraction of consecutive steps at which the predicted series moves in the
    same direction (strictly up or strictly down) as the actual series.
    A step where either series is flat counts as a miss.

    Args:
        y_test: 1-D sequence of actual values (list, NumPy array or pandas
            Series; a Series is read positionally, its index is ignored).
        y_pred: 1-D sequence of predicted values, same length as ``y_test``.

    Returns:
        float: matching steps divided by ``len(y_test) - 1``.

    Raises:
        ValueError: if the inputs differ in length or hold fewer than two
            points (no step exists to compare).
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred must have the same length, got "
            f"{actual.size} and {predicted.size}"
        )
    if actual.size < 2:
        raise ValueError("at least two points are needed to compare directions")

    # A positive product means both series moved the same way on that step.
    same_direction = np.diff(actual) * np.diff(predicted) > 0
    return int(np.count_nonzero(same_direction)) / (actual.size - 1)
print("Test accuracy for train set")
#RMSE
print("Root Mean Square Error (RMSE):", np.sqrt(mean_squared_error(y_train, y_train_pred)))

#MAPE
print("Mean Absolute Percentage Error (MAPE):", mean_absolute_percentage_error(y_train,y_train_pred))
print()

print("Test accuracy for test set")
#RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Square Error (RMSE):", rmse)

#MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)
print(" Mean Absolute Percentage Error (MAPE):", mape)
print()

# Directional accuracy of the baseline model on the test set.
# It is stored under its own name: assigning the score to `AUC` would rebind
# the AUC() function to a float, so any later AUC(...) call would fail unless
# the function were defined again first.
AUC_default = AUC(y_test, y_pred)
print("AUC test:", AUC_default)

## Tuning 

In [219]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Candidate values sampled by the randomized search.
n_estimators = list(range(100, 1001, 100))   # 100, 200, ..., 1000 trees
max_depth = list(range(2, 11, 2))            # 2, 4, 6, 8, 10
max_features = [None]
bootstrap = [False]  # method used to sample data points

# NOTE(review): with bootstrap=False and max_features=None each tree is grown
# on the full training set with every feature considered at each split, so the
# trees are presumably near-identical and n_estimators may have little effect
# on the fit -- confirm this is intended.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    bootstrap=bootstrap,
)

print(param_grid)

### Randomized Search

In [220]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# MSE wrapped as a scorer; greater_is_better=False flips its sign so the
# search, which maximises the score, ends up minimising the error.
scoring = make_scorer(mean_squared_error, greater_is_better=False)

# Randomized search over `param_grid` (number of sampled candidates is left at
# the library default). No early stopping is configured here.
# NOTE(review): cv=200 is an integer fold count on time-ordered rows; a
# forward-chaining splitter such as TimeSeriesSplit may be more appropriate,
# and 200 folds per candidate is expensive -- confirm the choice.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    scoring=scoring,
    cv=200,
    refit=True,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

In [221]:
# Run the randomized search on the training split (this is the slow cell).
rf_random.fit(X_train, y_train)

# Hyper-parameter combination with the best cross-validated score.
print ('Best Parameters: ', rf_random.best_params_)


## Using the best parameters

### Randomized Search

In [None]:
start_time = time.time()

# Rebuild the forest with the hyper-parameters chosen by the search and train
# it on all features. The seed is pinned (same value as the search's
# random_state) so repeated runs of the notebook give the same model; the
# search space contains no `random_state` key, so the keyword cannot clash.
randmf = RandomForestRegressor(**rf_random.best_params_, random_state=42)
randmf.fit(X_train, y_train)

end_time = time.time()
# Wall-clock seconds spent constructing and fitting the all-features model.
all_rand_run_time = end_time - start_time

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Test-set predictions of the tuned all-features model.
y_pred_rand = randmf.predict(X_test)

# Overlay the estimated densities of actual and predicted prices.
fig, ax = plt.subplots(figsize=(5, 7))
sns.kdeplot(y_test, color="r", label="Actual Value", ax=ax)
sns.kdeplot(y_pred_rand, color="b", label="Fitted Values", ax=ax)

ax.set_title('Actual vs Fitted Values for Price')
ax.set_xlabel('Price')
ax.set_ylabel('Density')
ax.legend()

plt.show()
plt.close()


In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
import numpy as np
# Keep the test targets as a plain NumPy array for the positional metrics.
y_test = np.array(y_test)

# In-sample predictions of the tuned model, used for the train-set metrics.
y_train_rand_pred = randmf.predict(X_train)

def AUC(y_test, y_pred):
    """Return the directional accuracy of a forecast.

    Despite its name this is not an area-under-curve metric: it is the
    fraction of consecutive steps at which the predicted series moves in the
    same direction (strictly up or strictly down) as the actual series.
    A step where either series is flat counts as a miss.

    Args:
        y_test: 1-D sequence of actual values (list, NumPy array or pandas
            Series; a Series is read positionally, its index is ignored).
        y_pred: 1-D sequence of predicted values, same length as ``y_test``.

    Returns:
        float: matching steps divided by ``len(y_test) - 1``.

    Raises:
        ValueError: if the inputs differ in length or hold fewer than two
            points (no step exists to compare).
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred must have the same length, got "
            f"{actual.size} and {predicted.size}"
        )
    if actual.size < 2:
        raise ValueError("at least two points are needed to compare directions")

    # A positive product means both series moved the same way on that step.
    same_direction = np.diff(actual) * np.diff(predicted) > 0
    return int(np.count_nonzero(same_direction)) / (actual.size - 1)
# Train-set errors of the tuned all-features model.
train_rmse_rand = np.sqrt(mean_squared_error(y_train, y_train_rand_pred))
train_mape_rand = mean_absolute_percentage_error(y_train, y_train_rand_pred)

# Test-set errors and directional accuracy; these three feed the summary table.
rmse_all_rand = np.sqrt(mean_squared_error(y_test, y_pred_rand))
mape_all_rand = mean_absolute_percentage_error(y_test, y_pred_rand)
AUC_all_rand = AUC(y_test, y_pred_rand)

print("Test accuracy for train set")
print("Root Mean Square Error (RMSE):", train_rmse_rand)
print("Mean Absolute Percentage Error (MAPE):", train_mape_rand)
print()

print("Test accuracy for test set")
print("Root Mean Square Error (RMSE):", rmse_all_rand)
print(" Mean Absolute Percentage Error (MAPE):", mape_all_rand)
print()
print("AUC test:", AUC_all_rand)



## Feature Reduction

### Randomized Search

In [None]:
# Feature names and their importances in the tuned forest (column order).
features_rand = X_train.columns
importances_rand = list(randmf.feature_importances_)

# (name, importance rounded to 4 decimals) pairs, most important first.
feature_importances_rand = sorted(
    ((name, round(score, 4)) for name, score in zip(features_rand, importances_rand)),
    key=lambda item: item[1],
    reverse=True,
)

for name, score in feature_importances_rand:
    print('Variable: {:20} Importance: {}'.format(name, score))

In [None]:
# One bar per feature, in the original column order.
x_values = [*range(len(importances_rand))]
plt.bar(
    x_values,
    importances_rand,
    orientation='vertical',
    color='r',
    edgecolor='k',
    linewidth=1.2,
)
# Label each bar with its feature name.
plt.xticks(x_values, features_rand, rotation='vertical')
plt.xlabel('Variable')
plt.ylabel('Importance')
plt.title('Variable Importances');

In [None]:
# Split the sorted (name, importance) pairs into parallel lists.
sorted_features_rand = [name for name, _ in feature_importances_rand]
sorted_importances_rand = [score for _, score in feature_importances_rand]

# Running total of importance, most important feature first.
cumulative_importances_rand = np.cumsum(sorted_importances_rand)

plt.plot(x_values, cumulative_importances_rand, 'g-')
# Dashed reference line at 90% of total importance.
plt.hlines(
    y=0.9,
    xmin=0,
    xmax=len(sorted_importances_rand),
    color='r',
    linestyles='dashed',
)
plt.xticks(x_values, sorted_features_rand, rotation='vertical')
plt.xlabel('Variable')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances');

In [None]:
# Smallest number of top-ranked features whose cumulative importance exceeds
# 0.9: position of the first crossing, plus one to turn it into a count.
first_crossing = np.flatnonzero(cumulative_importances_rand > 0.9)[0]
num_rand = first_crossing + 1
print('Number of features for 90% importance:', num_rand)

In [None]:
# Names of the `num_rand` top-ranked features.
important_feature_names_rand = [name for name, _ in feature_importances_rand[:num_rand]]
print(important_feature_names_rand)

In [None]:
# Restrict both splits to the selected columns (same column order in each).
test_data_rand = X_test[important_feature_names_rand]
train_data_rand = X_train[important_feature_names_rand]

# Sanity check: column counts should equal the number of selected features.
print('Important train features shape:', train_data_rand.shape)
print('Important test features shape:', test_data_rand.shape)


#### Training and Evaluating on Important Features


### Randomized Search

In [None]:
start_time = time.time()

# Train a fresh forest on the important features only. A new estimator is
# used instead of re-fitting `randmf` in place, so the all-features model is
# left intact and the earlier cells that call randmf.predict(X_test) or read
# randmf.feature_importances_ can still be re-run after this one.
# The seed is pinned so the result is reproducible; the search space contains
# no `random_state` key, so the keyword cannot clash.
randmf_reduce = RandomForestRegressor(**rf_random.best_params_, random_state=42)
randmf_reduce.fit(train_data_rand, y_train)

# Make predictions on test data
predictions_rand = randmf_reduce.predict(test_data_rand)

end_time = time.time()
# Wall-clock seconds spent fitting and predicting with the reduced model.
reduce_rand_run_time = end_time - start_time

#RMSE
from sklearn.metrics import mean_squared_error
rmse_reduce_rand = np.sqrt(mean_squared_error(y_test, predictions_rand))
print("RMSE:", rmse_reduce_rand)
print()

#MAPE (mean of |error| / |actual|, as a fraction rather than a percentage)
mape_reduce_rand = np.average(np.abs((y_test - predictions_rand) / y_test))
print("MAPE:", mape_reduce_rand)
print()

# Directional accuracy on the test set
AUC_reduce_rand = AUC(np.array(y_test), predictions_rand)
print("AUC test:", AUC_reduce_rand )

#### Normalization with original data

### Randomized Search

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Learn the per-feature min/max from the training split only, then apply that
# same scaling to both splits so no test-set statistics leak into the scaler.
scaler_all_rand = MinMaxScaler().fit(X_train)
X_train_normalized_all_rand = scaler_all_rand.transform(X_train)
X_test_normalized_all_rand = scaler_all_rand.transform(X_test)

In [None]:
# Tuned forest trained on the min-max-scaled full feature set. The seed is
# pinned so that any difference from the unscaled run reflects the scaling
# rather than random-state noise; the search space contains no `random_state`
# key, so the keyword cannot clash.
rf_regressor_all_rand = RandomForestRegressor(**rf_random.best_params_, random_state=42)

start_time = time.time()

rf_regressor_all_rand.fit(X_train_normalized_all_rand, y_train)

y_pred_norma_all_rand = rf_regressor_all_rand.predict(X_test_normalized_all_rand)

end_time = time.time()
# Wall-clock seconds spent fitting and predicting (construction excluded).
normal_all_rand_run_time = end_time - start_time

#RMSE
from sklearn.metrics import mean_squared_error
rmse_normal_all_rand = np.sqrt(mean_squared_error(y_test, y_pred_norma_all_rand))
print("RMSE:", rmse_normal_all_rand)
print()

#MAPE (mean of |error| / |actual|, as a fraction rather than a percentage)
mape_normal_all_rand = np.average(np.abs((y_test - y_pred_norma_all_rand) / y_test))
print("MAPE:", mape_normal_all_rand)
print()

# Directional accuracy on the test set
AUC_normal_all_rand = AUC(np.array(y_test), y_pred_norma_all_rand)
print("AUC test:", AUC_normal_all_rand )

#### Normalization with Important Features

### Randomized Search

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Learn the per-feature min/max from the reduced training split only, then
# apply that same scaling to the reduced train and test splits.
scaler_reduce_rand = MinMaxScaler().fit(train_data_rand)
X_train_normalized_reduce_rand = scaler_reduce_rand.transform(train_data_rand)
X_test_normalized_reduce_rand = scaler_reduce_rand.transform(test_data_rand)

In [None]:
# Tuned forest trained on the min-max-scaled important features. The seed is
# pinned so that any difference from the unscaled run reflects the scaling
# rather than random-state noise; the search space contains no `random_state`
# key, so the keyword cannot clash.
rf_regressor_reduce_rand = RandomForestRegressor(**rf_random.best_params_, random_state=42)

start_time = time.time()
rf_regressor_reduce_rand.fit(X_train_normalized_reduce_rand, y_train)

y_pred_norma_reduce_rand = rf_regressor_reduce_rand.predict(X_test_normalized_reduce_rand)

end_time = time.time()
# Wall-clock seconds spent fitting and predicting (construction excluded).
normal_reduce_rand_run_time = end_time - start_time

#RMSE
from sklearn.metrics import mean_squared_error
rmse_normal_reduce_rand = np.sqrt(mean_squared_error(y_test, y_pred_norma_reduce_rand))
print("RMSE:", rmse_normal_reduce_rand)
print()

#MAPE (mean of |error| / |actual|, as a fraction rather than a percentage)
mape_normal_reduce_rand = np.average(np.abs((y_test - y_pred_norma_reduce_rand) / y_test))
print("MAPE:", mape_normal_reduce_rand)
print()

# Directional accuracy on the test set
AUC_normal_reduce_rand = AUC(np.array(y_test), y_pred_norma_reduce_rand)
print("AUC test:", AUC_normal_reduce_rand)

## Summary statistical table

### Randomized Search

In [None]:
def print_results_rand(AUC_all_rand, rmse_all_rand, mape_all_rand, all_rand_run_time,
                       AUC_reduce_rand, rmse_reduce_rand, mape_reduce_rand, reduce_rand_run_time,
                       AUC_normal_all_rand, rmse_normal_all_rand, mape_normal_all_rand, normal_all_rand_run_time,
                       AUC_normal_reduce_rand, rmse_normal_reduce_rand, mape_normal_reduce_rand, normal_reduce_rand_run_time,
                       n_all_features=None, n_reduced_features=None):
    """Print a left-aligned comparison table of the four tuned-model variants.

    Each variant (all features, reduced features, and both again on scaled
    inputs) contributes one row with its directional accuracy, RMSE, MAPE and
    run time. The values are printed as given, without rounding.

    Args:
        AUC_* / rmse_* / mape_* / *_run_time: the four metrics of each
            variant, in the order all, reduce, normal_all, normal_reduce.
        n_all_features: feature count shown for the "All" rows. Defaults to
            ``len(importances_rand)`` read from the notebook namespace.
        n_reduced_features: feature count shown for the "Reduce" rows.
            Defaults to ``len(important_feature_names_rand)`` read from the
            notebook namespace.
    """
    # Fall back to the notebook-level variables only when no count is passed,
    # so the function can also be used (and tested) on its own.
    if n_all_features is None:
        n_all_features = len(importances_rand)
    if n_reduced_features is None:
        n_reduced_features = len(important_feature_names_rand)

    headers = ['Type', 'Number of Features','Accuracy', 'RMSE', 'MAPE', 'Run Time (s)']
    all_results = [['All', n_all_features, AUC_all_rand, rmse_all_rand, mape_all_rand, all_rand_run_time],
                   ['Reduce', n_reduced_features, AUC_reduce_rand, rmse_reduce_rand, mape_reduce_rand, reduce_rand_run_time],
                   ['Normalization - All', n_all_features, AUC_normal_all_rand, rmse_normal_all_rand, mape_normal_all_rand, normal_all_rand_run_time],
                   ['Normalization - Reduce', n_reduced_features, AUC_normal_reduce_rand, rmse_normal_reduce_rand, mape_normal_reduce_rand, normal_reduce_rand_run_time]]

    # Each column is as wide as its longest cell, header included.
    col_widths = [max(len(str(row[i])) for row in all_results + [headers]) for i in range(len(headers))]

    # One format string serves the header and every data row.
    column_gap = '  '
    row_format = column_gap.join(f"{{:<{width}}}" for width in col_widths)

    print(row_format.format(*headers))

    # The separator spans exactly the table width: the column widths plus the
    # gaps actually used between columns (it previously assumed 3-char gaps).
    print('-' * (sum(col_widths) + len(column_gap) * (len(col_widths) - 1)))

    for result in all_results:
        print(row_format.format(*result))
        
# Summary table for the four variants of the randomized-search model.
print_results_rand(
    AUC_all_rand=AUC_all_rand,
    rmse_all_rand=rmse_all_rand,
    mape_all_rand=mape_all_rand,
    all_rand_run_time=all_rand_run_time,
    AUC_reduce_rand=AUC_reduce_rand,
    rmse_reduce_rand=rmse_reduce_rand,
    mape_reduce_rand=mape_reduce_rand,
    reduce_rand_run_time=reduce_rand_run_time,
    AUC_normal_all_rand=AUC_normal_all_rand,
    rmse_normal_all_rand=rmse_normal_all_rand,
    mape_normal_all_rand=mape_normal_all_rand,
    normal_all_rand_run_time=normal_all_rand_run_time,
    AUC_normal_reduce_rand=AUC_normal_reduce_rand,
    rmse_normal_reduce_rand=rmse_normal_reduce_rand,
    mape_normal_reduce_rand=mape_normal_reduce_rand,
    normal_reduce_rand_run_time=normal_reduce_rand_run_time,
)


# Graph Predicted Values with Test Set

### Randomized Search

In [None]:

# Show the prediction results: actual vs predicted price over the test period.
fig, ax = plt.subplots(1, 1, figsize=(14, 5))
ax.plot(y_test, color='red', label="Bitcoin Price")
ax.plot(y_pred_rand, color='green', linestyle="dashed", label="Predicted Bitcoin Price")

# Show y ticks as whole numbers with thousands separators.
thousands_formatter = plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x)))
ax.yaxis.set_major_formatter(thousands_formatter)

ax.set_title("Random Forest Regression for Period 2")
ax.legend()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(25, 5))

# y_train / y_test hold next-day closes, i.e. BTC_close of rows 1..end of each
# split, so they must be drawn against the dates of those same rows.
# (Previously the training targets were plotted against the last dates of
# `total` and the test targets / predictions against its first dates, and the
# x-range for the tuned predictions was sized from the unrelated `y_pred`.)
train_dates = train_dataset['Date'].iloc[1:]
test_dates = test_dataset['Date'].iloc[1:]

ax.plot(total['Date'], total['BTC_close'], color='red', label="Bitcoin Price")
ax.plot(train_dates, y_train, color='blue', label="Training Data")
ax.plot(test_dates, y_test, color='orange', label="Test Data")
ax.plot(test_dates, y_pred_rand, color='green', label="Predicted Bitcoin Price", linestyle="dashed")
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x)))) # Format the y-axis tick labels
plt.legend()
plt.show()


## Comparing randomized search and grid search for hyperparameter estimation

In [None]:
import numpy as np

from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier


def report(results, n_top=10):
    """Print the top-ranked candidates of a hyper-parameter search.

    For every rank from 1 to ``n_top`` each candidate holding that rank is
    printed with its mean and standard deviation of the validation score and
    its parameter setting, followed by an empty line.

    Args:
        results: mapping with the keys "rank_test_score", "mean_test_score",
            "std_test_score" and "params" (e.g. a search's ``cv_results_``).
        n_top: highest rank to print.
    """
    ranks = results["rank_test_score"]
    for rank in range(1, n_top + 1):
        # Several candidates can share one rank; print all of them.
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results["mean_test_score"][idx]
            std_score = results["std_test_score"][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results["params"][idx]))
            print("")

#### Randomized Search

In [None]:
# Number of sampled parameter settings, then the best-ranked ones.
sampled_settings = rf_random.cv_results_["params"]
print(len(sampled_settings))
report(rf_random.cv_results_)