Name: Hung Lam , Phung Ly   
* Email: noticslam@pm.me


In [40]:
import pandas as pd
import talib


In [41]:
# Load data
data = pd.read_csv('BTC-USD.csv')

# Add RSI(14) feature
rsi_14 = talib.RSI(data['Close'], timeperiod=14)
data['RSI_14'] = rsi_14

# Add WT(10) and WT(11) features
wt_10 = talib.WILLR(data['High'], data['Low'], data['Close'], timeperiod=10)
data['WT_10'] = wt_10

wt_11 = talib.WILLR(data['High'], data['Low'], data['Close'], timeperiod=11)
data['WT_11'] = wt_11

# Add CCI(20) and CCI(1) features
cci_20 = talib.CCI(data['High'], data['Low'], data['Close'], timeperiod=20)
data['CCI_20'] = cci_20

cci_2 = talib.CCI(data['High'], data['Low'], data['Close'], timeperiod=2)
data['CCI_2'] = cci_2

# Add ADX(20) and ADX(2) features
adx_20 = talib.ADX(data['High'], data['Low'], data['Close'], timeperiod=20)
data['ADX_20'] = adx_20

adx_2 = talib.ADX(data['High'], data['Low'], data['Close'], timeperiod=2)
data['ADX_2'] = adx_2

# Add RSI(9) and RSI(1) features
rsi_9 = talib.RSI(data['Close'], timeperiod=9)
data['RSI_9'] = rsi_9

rsi_2 = talib.RSI(data['Close'], timeperiod=2)
data['RSI_2'] = rsi_2

In [42]:
# Calculate differences
close_open_diff = data['Close'] - data['Open']
high_low_diff = data['High'] - data['Low']

# Add Candlestick Shape column
data.loc[(close_open_diff > 0) & (high_low_diff > 0), 'Candlestick Shape'] = 'Bullish'
data.loc[(close_open_diff < 0) & (high_low_diff > 0), 'Candlestick Shape'] = 'Bearish'
data.loc[high_low_diff == 0, 'Candlestick Shape'] = 'Neutral'

# Calculate candlestick patterns
doji = talib.CDLDOJI(data['Open'], data['High'], data['Low'], data['Close'])
engulfing = talib.CDLENGULFING(data['Open'], data['High'], data['Low'], data['Close'])
hammer = talib.CDLHAMMER(data['Open'], data['High'], data['Low'], data['Close'])
hanging_man = talib.CDLHANGINGMAN(data['Open'], data['High'], data['Low'], data['Close'])

# Add Candlestick Pattern column
data.loc[doji > 0, 'Candlestick Pattern'] = 'Doji'
data.loc[engulfing > 0, 'Candlestick Pattern'] = 'Engulfing'
data.loc[hammer > 0, 'Candlestick Pattern'] = 'Hammer'
data.loc[hanging_man > 0, 'Candlestick Pattern'] = 'Hanging Man'
data.loc[(doji == 0) & (engulfing == 0) & (hammer == 0) & (hanging_man == 0), 'Candlestick Pattern'] = 'None'

# Refine labels using Candlestick Shape and Candlestick Pattern
data.loc[(data['Candlestick Shape'] == 'Bullish') & (data['Candlestick Pattern'] == 'Doji'), 'Direction'] = 0
data.loc[(data['Candlestick Shape'] == 'Bearish') & (data['Candlestick Pattern'] == 'Doji'), 'Direction'] = 0
data.loc[(data['Candlestick Shape'] == 'Bullish') & (data['Candlestick Pattern'] == 'Engulfing'), 'Direction'] = 1
data.loc[(data['Candlestick Shape'] == 'Bearish') & (data['Candlestick Pattern'] == 'Engulfing'), 'Direction'] = -1
data.loc[(data['Candlestick Shape'] == 'Bullish') & (data['Candlestick Pattern'] == 'Hammer'), 'Direction'] = 1
data.loc[(data['Candlestick Shape'] == 'Bearish') & (data['Candlestick Pattern'] == 'Hanging Man'), 'Direction'] = 1

# Add Price Change column
data['Price Change'] = data['Close'].diff()

# Identify primary trend
data['200 MA'] = talib.SMA(data['Close'], timeperiod=200)
data.loc[data['Close'] > data['200 MA'], 'Primary Trend'] = 'Bullish'
data.loc[data['Close'] < data['200 MA'], 'Primary Trend'] = 'Bearish'
# Confirm/reject trend with momentum indicators
data['RSI'] = talib.RSI(data['Close'], timeperiod=14)
data.loc[(data['Primary Trend'] == 'Bullish') & (data['RSI'] > 50), 'Direction'] = 1
data.loc[(data['Primary Trend'] == 'Bearish') & (data['RSI'] < 50), 'Direction'] = -1
# Confirm/reject trend with other indicators

data.loc[(data['Direction'] == 'up') & (doji > 0), 'Direction'] = 0
# Remove rows with NaN values
data = data.dropna()
# Move "Direction" column to the end
direction_col = data.pop('Direction')
data.insert(len(data.columns), 'Direction', direction_col)
data = data.reset_index(drop=True)

# Add Trend Change column
data['Target'] = 0

for i in range(len(data) - 3):
    if data.loc[i, 'Direction'] == -1 and data.loc[i+1:i+3, 'Direction'].sum() == -3:
        data.loc[i+3, 'Target'] = -1  # Change to downtrend
    elif data.loc[i, 'Direction'] == 1 and data.loc[i+1:i+3, 'Direction'].sum() == 3:
        data.loc[i+3, 'Target'] = 1  # Change to uptrend




# Save data
data.to_csv('btc_with_features.csv', index=False)

print(data.head(10))

         Date        Open        High         Low       Close   Adj Close  \
0  2015-04-04  254.291000  255.257996  251.100006  253.697006  253.697006   
1  2015-04-06  260.721008  261.798004  254.574997  255.492004  255.492004   
2  2015-04-07  255.274002  255.804993  252.205002  253.179993  253.179993   
3  2015-04-08  253.063995  253.847000  244.214996  245.022003  245.022003   
4  2015-04-09  244.751007  246.117996  239.399994  243.675995  243.675995   
5  2015-04-10  243.694000  243.694000  232.770996  236.072006  236.072006   
6  2015-04-11  236.016006  239.537003  234.175003  236.552002  236.552002   
7  2015-04-12  236.535004  237.727997  233.494995  236.153000  236.153000   
8  2015-04-13  235.949997  236.934998  221.996002  224.587006  224.587006   
9  2015-04-14  224.759003  224.975998  216.322998  219.158997  219.158997   

       Volume     RSI_14      WT_10      WT_11  ...      RSI_9      RSI_2  \
0  12493500.0  47.508300 -17.696193 -15.342915  ...  48.624067  72.670393  

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

Imagine we have a dataset that contains information about the price of a cryptocurrency, in this case, Bitcoin. We want to predict whether the price will go up, down, or stay the same. To do that, we need to analyze different features in the dataset.

First, we load the dataset and select the features that we think are important for prediction, such as the closing price, candlestick shape, candlestick pattern, 200-day moving average, primary trend, RSI (Relative Strength Index), and direction.

Since some of these features are categorical (e.g., candlestick shape and pattern), we encode them into numerical values so that our model can understand them.

Next, we scale the numerical features (closing price, 200-day moving average, and RSI) using a process called standardization. This ensures that all features are on a similar scale, making it easier for the model to learn.

We then split the data into training and testing sets. The training set is used to train our model, while the testing set is used to evaluate its performance.

Now comes the exciting part. We create a Gradient Boosting model, which is a machine learning algorithm known for its ability to make accurate predictions. We want to find the best configuration for this model, so we use a technique called grid search to explore different combinations of hyperparameters (settings) and select the ones that give us the best results.

Once we have the best model, we make predictions on the testing set and even predict the target for tomorrow based on the features of the last day in the dataset.

We evaluate the performance of our model by comparing the predicted values with the true target values in the testing set. This gives us metrics like precision, recall, and F1-score, which tell us how well our model is performing.

Finally, we save our best model and export the testing set with predictions and true target values to a CSV file for further analysis if needed.

In summary, this script shows how we preprocess and analyze data using machine learning techniques to predict the future price movement of a cryptocurrency.






In [45]:
# Load data
data = pd.read_csv('btc_with_features.csv')

# Define features and target
X = data[['Close', 'Candlestick Shape', 'Candlestick Pattern', '200 MA', 'Primary Trend', 'RSI','Direction']]
y = data['Target']

# Encode categorical features
X = pd.get_dummies(X, columns=['Candlestick Shape', 'Candlestick Pattern', 'Primary Trend'])

# Scale numerical features
scaler = StandardScaler()
X[['Close', '200 MA', 'RSI']] = scaler.fit_transform(X[['Close', '200 MA', 'RSI']])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)

# Define hyperparameter search space
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(gb_model, param_grid, scoring='accuracy', cv=25, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Display the best hyperparameters
print('Best hyperparameters:', grid_search.best_params_)

# Use the best model for predictions
best_model = grid_search.best_estimator_

# Make predictions on testing set
y_pred = best_model.predict(X_test)

# Get the features for the last day in your dataset
last_day_features = X.iloc[-1].values.reshape(1, -1)

# Predict the target for tomorrow
predicted_target = best_model.predict(last_day_features)[0]

# Print the predicted target
print('The predicted target for tomorrow is:', predicted_target)

# Add predictions and true target values to the testing set
X_test['Prediction'] = y_pred
X_test['True_Target'] = y_test

# Export testing set with predictions and true target values to a CSV file
X_test.to_csv('gb_predictions_with_true_targets_tuned.csv', index=False)


# Print classification report
print(classification_report(y_test, y_pred))

import joblib

# Save the best model
joblib.dump(best_model, 'best_gb_model.pkl')

Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
The predicted target for tomorrow is: 0
              precision    recall  f1-score   support

          -1       0.85      0.96      0.90       129
           0       0.77      0.51      0.62        72
           1       0.95      0.97      0.96       234

    accuracy                           0.89       435
   macro avg       0.86      0.82      0.83       435
weighted avg       0.89      0.89      0.89       435





['best_gb_model.pkl']