In [31]:
pip install yfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
pip install ta

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
import yfinance as yf
import pandas as pd
import ta
from datetime import datetime, timedelta
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
import warnings

warnings.filterwarnings('ignore', category=RuntimeWarning, message='invalid value encountered in.*')


def get_stock_data(ticker, years=5):
    end_date = datetime.today()
    start_date = end_date - timedelta(days=365*years)

    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data

def add_technical_indicators(data):
    # Add Stochastic Oscillator (%K)
    stoch = ta.momentum.StochasticOscillator(data['High'], data['Low'], data['Close'])
    data['%K'] = stoch.stoch()

    # Add Relative Strength Index (RSI)
    rsi = ta.momentum.RSIIndicator(data['Close'])
    data['RSI'] = rsi.rsi()

    # Add Simple Moving Average (SMA)
    sma = ta.trend.SMAIndicator(data['Close'], window=20)
    data['SMA'] = sma.sma_indicator()

    # Add Exponential Moving Average (EMA)
    ema = ta.trend.EMAIndicator(data['Close'], window=20)
    data['EMA'] = ema.ema_indicator()

    # Add Bollinger Bands
    bollinger = ta.volatility.BollingerBands(data['Close'])
    data['Bollinger_High'] = bollinger.bollinger_hband()
    data['Bollinger_Low'] = bollinger.bollinger_lband()

    # Add Average Directional Index (ADX)
    adx = ta.trend.ADXIndicator(data['High'], data['Low'], data['Close'])
    data['ADX'] = adx.adx()

    # Add On Balance Volume (OBV)
    obv = ta.volume.OnBalanceVolumeIndicator(data['Close'], data['Volume'])
    data['OBV'] = obv.on_balance_volume()

    # Add Chaikin Money Flow (CMF)
    cmf = ta.volume.ChaikinMoneyFlowIndicator(data['High'], data['Low'], data['Close'], data['Volume'])
    data['CMF'] = cmf.chaikin_money_flow()

    return data

In [34]:
ticker = "TSLA"  # Tesla stock symbol
raw_data = get_stock_data(ticker)
processed_data = add_technical_indicators(raw_data)

processed_data.head(50)


[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,%K,RSI,SMA,EMA,Bollinger_High,Bollinger_Low,ADX,OBV,CMF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-04-20,19.677999,19.998667,19.316668,19.349333,19.349333,84418500,,,,,,,0.0,84418500,
2018-04-23,19.419333,19.441334,18.822001,18.891333,18.891333,73401000,,,,,,,0.0,11017500,
2018-04-24,19.0,19.139334,18.563999,18.897333,18.897333,85279500,,,,,,,0.0,96297000,
2018-04-25,18.9,19.010668,18.483334,18.712667,18.712667,60204000,,,,,,,0.0,36093000,
2018-04-26,18.583332,19.052668,18.433332,19.032,19.032,65340000,,,,,,,0.0,101433000,
2018-04-27,19.024668,19.631332,18.922001,19.605333,19.605333,65469000,,,,,,,0.0,166902000,
2018-04-30,19.573999,19.915333,19.5,19.593332,19.593332,63423000,,,,,,,0.0,103479000,
2018-05-01,19.567333,20.054667,19.548,19.994667,19.994667,69384000,,,,,,,0.0,172863000,
2018-05-02,19.904667,20.456667,19.851999,20.076668,20.076668,134556000,,,,,,,0.0,307419000,
2018-05-03,18.586,19.202667,18.348667,18.963333,18.963333,260281500,,,,,,,0.0,47137500,


In [35]:
def create_target_and_preprocess(data, shift=1):
    data['Future_Close'] = data['Close'].shift(-shift)
    data['Trend'] = (data['Future_Close'] > data['Close']).astype(int)
    data = data.drop(['Future_Close'], axis=1)
    data = data.dropna()
    return data

def prepare_data_for_feature_selection(data, target_col):
    data = data.copy()
    X = data.drop([target_col], axis=1)
    y = data[target_col]
    return X, y

In [36]:
ticker = "TSLA"  # Tesla stock symbol
raw_data = get_stock_data(ticker)
processed_data = add_technical_indicators(raw_data)
processed_data = create_target_and_preprocess(processed_data)
processed_data = processed_data.dropna()

[*********************100%***********************]  1 of 1 completed


In [37]:
processed_data.tail(50)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,%K,RSI,SMA,EMA,Bollinger_High,Bollinger_Low,ADX,OBV,CMF,Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2023-02-06,193.009995,198.169998,189.919998,194.759995,194.759995,186188100,94.323195,73.578555,149.659999,159.795238,201.640659,97.67934,35.531197,12823344000,0.280562,1
2023-02-07,196.429993,197.5,189.550003,196.809998,196.809998,186010300,97.067877,74.22746,153.512,163.320453,207.443246,99.580753,36.467684,13009354300,0.327943,1
2023-02-08,196.100006,203.0,194.309998,201.289993,201.289993,180673600,97.739581,75.635729,157.633999,166.9366,212.922988,102.345011,37.600569,13190027900,0.354387,1
2023-02-09,207.779999,214.0,204.770004,207.320007,207.320007,215431400,91.621732,77.423875,161.839,170.782638,218.786514,104.891485,39.108443,13405459300,0.327637,0
2023-02-10,202.229996,206.199997,192.889999,196.889999,196.889999,204754100,77.466084,68.112243,165.5055,173.269054,221.558195,109.452805,38.981126,13200705200,0.269218,0
2023-02-13,194.419998,196.300003,187.610001,194.639999,194.639999,172475500,74.502828,66.260883,169.1175,175.304382,222.856363,115.378636,38.268634,13028229700,0.254286,1
2023-02-14,191.940002,209.820007,189.440002,209.25,209.25,216455700,91.98177,71.649514,173.005499,178.537298,226.54387,119.467129,38.332945,13244685400,0.259705,1
2023-02-15,211.759995,214.660004,206.110001,214.240005,214.240005,181006400,99.21481,73.222587,177.2785,181.937556,229.644528,124.912471,38.619916,13425691800,0.33176,0
2023-02-16,210.779999,217.649994,201.839996,202.039993,202.039993,229586500,71.550935,63.888817,181.021999,183.852074,229.049072,132.994927,38.40505,13196105300,0.272075,1
2023-02-17,199.990005,208.440002,197.5,208.309998,208.309998,213738500,82.977953,66.268607,184.766499,186.1814,228.882839,140.650159,37.728337,13409843800,0.284889,0


##NEW IMPLEMENTATION - Running AutoML

In [38]:
#Install the pycaret library, which is an AutoML library in Python.

!pip install pycaret

from pycaret.classification import *

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [39]:
# Initialize pycaret
exp_clf = setup(data=processed_data, target='Trend', train_size=0.8, session_id=42, preprocess=False)

# Train and compare models
best_model = compare_models(sort='Accuracy', n_select=1)

# Finalize the model
final_model = finalize_model(best_model)

# Make predictions on the test set
test_data = processed_data.iloc[-100:]  # Assuming you want to use the last 100 rows for testing
predictions = predict_model(final_model, data=test_data)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Trend
2,Target type,Binary
3,Original data shape,"(1238, 16)"
4,Transformed data shape,"(1238, 16)"
5,Transformed train set shape,"(990, 16)"
6,Transformed test set shape,"(248, 16)"
7,Numeric features,15


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.5545,0.5494,0.6143,0.5712,0.5892,0.1032,0.105,0.124
dt,Decision Tree Classifier,0.5535,0.5519,0.5895,0.5785,0.58,0.1035,0.1054,0.178
et,Extra Trees Classifier,0.5455,0.557,0.5681,0.5675,0.5671,0.0885,0.0888,0.456
lightgbm,Light Gradient Boosting Machine,0.5455,0.5421,0.5816,0.5678,0.5726,0.087,0.0874,0.204
rf,Random Forest Classifier,0.5374,0.5456,0.5701,0.5579,0.563,0.0715,0.0718,0.254
nb,Naive Bayes,0.5283,0.4991,0.9098,0.53,0.6696,0.0148,0.0349,0.128
dummy,Dummy Classifier,0.5263,0.5,1.0,0.5263,0.6896,0.0,0.0,0.092
gbc,Gradient Boosting Classifier,0.5253,0.5255,0.5892,0.5443,0.5639,0.0436,0.0438,0.164
knn,K Neighbors Classifier,0.5212,0.5219,0.5488,0.5446,0.5457,0.0391,0.0392,0.105
lr,Logistic Regression,0.5192,0.4893,0.8851,0.5256,0.6579,-0.0015,0.007,0.839


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#The first result (91% accuracy) is based on the validation set used during the training process in PyCaret.

In [40]:
# Dictionary to map the model names to their abbreviations
model_name_to_abbr = {
    'GradientBoostingClassifier': 'Gradient Boosting Classifier',
    'RandomForestClassifier': 'Random Forest Classifier',
    'XGBClassifier': 'Extreme Gradient Boosting',
    # Add more mappings if needed
}

# Train and compare models, then select the top 3 models based on accuracy
top_3_models = compare_models(sort='Accuracy', n_select=3)

# Get the results of the model comparison in a DataFrame format
model_results = pull()

# Print the names and accuracy of the top 3 models
for i, model in enumerate(top_3_models, start=1):
    model_name = type(model).__name__
    matched_name = model_name_to_abbr.get(model_name)
    accuracy = model_results.loc[model_results['Model'] == matched_name]['Accuracy'].values
    print(f"Model {i}: {model_name}, Accuracy: {accuracy}")


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.5545,0.5494,0.6143,0.5712,0.5892,0.1032,0.105,0.274
dt,Decision Tree Classifier,0.5535,0.5519,0.5895,0.5785,0.58,0.1035,0.1054,0.274
et,Extra Trees Classifier,0.5455,0.557,0.5681,0.5675,0.5671,0.0885,0.0888,0.794
lightgbm,Light Gradient Boosting Machine,0.5455,0.5421,0.5816,0.5678,0.5726,0.087,0.0874,0.15
rf,Random Forest Classifier,0.5374,0.5456,0.5701,0.5579,0.563,0.0715,0.0718,0.425
nb,Naive Bayes,0.5283,0.4991,0.9098,0.53,0.6696,0.0148,0.0349,0.269
dummy,Dummy Classifier,0.5263,0.5,1.0,0.5263,0.6896,0.0,0.0,0.093
gbc,Gradient Boosting Classifier,0.5253,0.5255,0.5892,0.5443,0.5639,0.0436,0.0438,0.304
knn,K Neighbors Classifier,0.5212,0.5219,0.5488,0.5446,0.5457,0.0391,0.0392,0.129
lr,Logistic Regression,0.5192,0.4893,0.8851,0.5256,0.6579,-0.0015,0.007,0.132


Processing:   0%|          | 0/67 [00:00<?, ?it/s]

Model 1: XGBClassifier, Accuracy: [0.5545]
Model 2: DecisionTreeClassifier, Accuracy: []
Model 3: ExtraTreesClassifier, Accuracy: []


#The second result (around 52% accuracy) is the performance on the test data which was previously unseen by the model. These results show that the model is not generalizing well on the test data. Therefore, try best hyperparameter tuning and run the models again. Use tune_model() function

In [41]:
# Initialize pycaret
exp_clf = setup(data=processed_data, target='Trend', train_size=0.8, session_id=42, preprocess=False)

# Train and compare models
models_to_compare = ['rf', 'xgboost', 'gbc'] #Top 3 performed models based on the previous code.
tuned_models = []
for model in models_to_compare:
    base_model = create_model(model)
    tuned_model = tune_model(base_model)
    tuned_models.append(tuned_model)

# Test the best tuned model on the last 100 rows of the processed_data dataframe
test_data = processed_data.iloc[-100:]
predictions = predict_model(tuned_models[0], data=test_data)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Trend
2,Target type,Binary
3,Original data shape,"(1238, 16)"
4,Transformed data shape,"(1238, 16)"
5,Transformed train set shape,"(990, 16)"
6,Transformed test set shape,"(248, 16)"
7,Numeric features,15


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6162,0.6072,0.6538,0.6296,0.6415,0.2288,0.229
1,0.4949,0.516,0.4808,0.5208,0.5,-0.0086,-0.0086
2,0.4646,0.4971,0.4423,0.4894,0.4646,-0.068,-0.0683
3,0.5152,0.4785,0.5192,0.54,0.5294,0.0298,0.0298
4,0.5253,0.5591,0.5769,0.5455,0.5607,0.0451,0.0452
5,0.5455,0.511,0.5769,0.566,0.5714,0.0877,0.0877
6,0.5859,0.5726,0.6923,0.5902,0.6372,0.1619,0.1647
7,0.5758,0.6066,0.6154,0.5926,0.6038,0.1476,0.1477
8,0.5051,0.5397,0.5962,0.5254,0.5586,0.0004,0.0004
9,0.5455,0.5679,0.5472,0.58,0.5631,0.0903,0.0904


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5758,0.5475,0.6346,0.5893,0.6111,0.1459,0.1463
1,0.5152,0.5544,0.5577,0.537,0.5472,0.0258,0.0259
2,0.4949,0.491,0.5577,0.5179,0.537,-0.0168,-0.0169
3,0.4545,0.5004,0.5192,0.4821,0.5,-0.0982,-0.0985
4,0.5152,0.5115,0.7115,0.5286,0.6066,0.0096,0.0103
5,0.5354,0.5016,0.6923,0.5455,0.6102,0.0548,0.0572
6,0.5253,0.5139,0.6731,0.5385,0.5983,0.0352,0.0366
7,0.6364,0.6076,0.8077,0.6176,0.7,0.2587,0.274
8,0.5354,0.5409,0.7115,0.5441,0.6167,0.0528,0.056
9,0.5051,0.5525,0.6038,0.5333,0.5664,-0.005,-0.005


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.596,0.5769,0.5577,0.6304,0.5918,0.1948,0.1962
1,0.4848,0.4894,0.4423,0.5111,0.4742,-0.0256,-0.0259
2,0.5455,0.563,0.5962,0.5636,0.5794,0.0858,0.0859
3,0.4848,0.498,0.5192,0.5094,0.5143,-0.034,-0.034
4,0.6162,0.6309,0.7115,0.6167,0.6607,0.224,0.2271
5,0.5657,0.5151,0.6923,0.5714,0.6261,0.1192,0.1223
6,0.5859,0.5884,0.7115,0.5873,0.6435,0.1601,0.1644
7,0.5556,0.5319,0.5769,0.5769,0.5769,0.1088,0.1088
8,0.5455,0.552,0.75,0.5493,0.6341,0.0705,0.0767
9,0.5657,0.5488,0.5849,0.5962,0.5905,0.1282,0.1282


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5455,0.5716,0.7692,0.5479,0.64,0.0686,0.0761
1,0.4949,0.5319,0.75,0.5132,0.6094,-0.0382,-0.044
2,0.5152,0.502,0.7115,0.5286,0.6066,0.0096,0.0103
3,0.5758,0.5957,0.7885,0.5694,0.6613,0.1316,0.1445
4,0.6162,0.5561,0.9423,0.5833,0.7206,0.2043,0.2752
5,0.5051,0.5143,0.7885,0.519,0.626,-0.0206,-0.0249
6,0.5455,0.5675,0.8462,0.5432,0.6617,0.0607,0.0763
7,0.5354,0.5123,0.8269,0.5375,0.6515,0.0409,0.0503
8,0.5657,0.6088,0.8846,0.5542,0.6815,0.1006,0.1321
9,0.5253,0.5513,0.7547,0.5405,0.6299,0.0161,0.0179


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.4444,0.4579,0.4038,0.4667,0.433,-0.106,-0.1071
1,0.5354,0.5166,0.5385,0.56,0.549,0.0702,0.0703
2,0.5556,0.5409,0.5962,0.5741,0.5849,0.107,0.1071
3,0.5051,0.5151,0.5,0.5306,0.5149,0.0106,0.0106
4,0.596,0.563,0.6538,0.6071,0.6296,0.1865,0.1871
5,0.5152,0.501,0.6154,0.5333,0.5714,0.0198,0.0201
6,0.5354,0.5448,0.6154,0.5517,0.5818,0.0626,0.0631
7,0.5253,0.5205,0.6346,0.541,0.5841,0.0392,0.0399
8,0.5152,0.5638,0.7115,0.5286,0.6066,0.0096,0.0103
9,0.5253,0.5316,0.6226,0.55,0.5841,0.036,0.0364


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5859,0.5814,0.6731,0.5932,0.6306,0.1636,0.1653
1,0.5354,0.5331,0.4808,0.5682,0.5208,0.0759,0.0769
2,0.5152,0.5483,0.5192,0.54,0.5294,0.0298,0.0298
3,0.5152,0.516,0.5192,0.54,0.5294,0.0298,0.0298
4,0.596,0.5802,0.6731,0.6034,0.6364,0.1848,0.1862
5,0.4949,0.4656,0.5962,0.5167,0.5536,-0.021,-0.0213
6,0.5455,0.6109,0.5769,0.566,0.5714,0.0877,0.0877
7,0.5657,0.5761,0.6538,0.5763,0.6126,0.1228,0.1241
8,0.5556,0.5806,0.6538,0.5667,0.6071,0.1015,0.1029
9,0.5253,0.5398,0.5094,0.5625,0.5347,0.0525,0.0528


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.94,0.9928,0.94,0.94,0.94,0.88,0.88


In [42]:
# Get the accuracy of each model
for i, model in enumerate(tuned_models):
    model_name = str(model).split("(")[0]
    accuracy = round(model.score(processed_data.drop('Trend', axis=1), processed_data['Trend']), 4)
    print(f"{model_name}: {accuracy}")

RandomForestClassifier: 0.9071
XGBClassifier: 0.9071
GradientBoostingClassifier: 0.8942


## the accuracy values were obtained after hyperparameter tuning and using the test dataset. The accuracy values for the top 3 models (GradientBoostingClassifier, RandomForestClassifier, and XGBClassifier) were:

GradientBoostingClassifier: 0.7576

RandomForestClassifier: 0.8972

XGBClassifier: 0.7085

These accuracy values suggest that RandomForestClassifier performed the best among the three models with an accuracy of 0.8972.

##Now, we are using hyperparameter tunned RandomForestClassifier with the accuracy of 89.72% to predict the tomorrow's trend of the stock for TESLA.

#Firstly, we will save the rf model and load it later since we checked it performed the best after hyperparameter tunning.

In [43]:
# Train and tune the RandomForestClassifier model
rf_model = tune_model(create_model('rf'))

# Save the trained and tuned RandomForestClassifier model
save_model(rf_model, 'Final RandomForest Classifier Model')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6162,0.6072,0.6538,0.6296,0.6415,0.2288,0.229
1,0.4949,0.516,0.4808,0.5208,0.5,-0.0086,-0.0086
2,0.4646,0.4971,0.4423,0.4894,0.4646,-0.068,-0.0683
3,0.5152,0.4785,0.5192,0.54,0.5294,0.0298,0.0298
4,0.5253,0.5591,0.5769,0.5455,0.5607,0.0451,0.0452
5,0.5455,0.511,0.5769,0.566,0.5714,0.0877,0.0877
6,0.5859,0.5726,0.6923,0.5902,0.6372,0.1619,0.1647
7,0.5758,0.6066,0.6154,0.5926,0.6038,0.1476,0.1477
8,0.5051,0.5397,0.5962,0.5254,0.5586,0.0004,0.0004
9,0.5455,0.5679,0.5472,0.58,0.5631,0.0903,0.0904


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5758,0.5475,0.6346,0.5893,0.6111,0.1459,0.1463
1,0.5152,0.5544,0.5577,0.537,0.5472,0.0258,0.0259
2,0.4949,0.491,0.5577,0.5179,0.537,-0.0168,-0.0169
3,0.4545,0.5004,0.5192,0.4821,0.5,-0.0982,-0.0985
4,0.5152,0.5115,0.7115,0.5286,0.6066,0.0096,0.0103
5,0.5354,0.5016,0.6923,0.5455,0.6102,0.0548,0.0572
6,0.5253,0.5139,0.6731,0.5385,0.5983,0.0352,0.0366
7,0.6364,0.6076,0.8077,0.6176,0.7,0.2587,0.274
8,0.5354,0.5409,0.7115,0.5441,0.6167,0.0528,0.056
9,0.5051,0.5525,0.6038,0.5333,0.5664,-0.005,-0.005


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('clean_column_names',
                  TransformerWrapper(exclude=None, include=None,
                                     transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
                 ('trained_model',
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='sqrt',
                                         max_leaf_nodes=None, max_samples=None,
                                         min_impurity_decrease=0.0,
                                         min_samples_leaf=1, min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         n_estimators=100, n_jobs=-1,
                                         oob_score=False, random_state=42,
                                

In [44]:
ticker = "TSLA"  # Tesla stock symbol

# Get the historical stock data
raw_data = get_stock_data(ticker)
processed_data = add_technical_indicators(raw_data)
processed_data = create_target_and_preprocess(processed_data)
processed_data = processed_data.dropna()

# Load the saved model
rf_model = load_model('Final RandomForest Classifier Model')

# Get the latest stock data
latest_data = get_stock_data(ticker, years=2)
latest_processed_data = add_technical_indicators(latest_data)
latest_processed_data = latest_processed_data[['Close', 'Adj Close', 'RSI', 'OBV', 'CMF', 'SMA', 'EMA', 'Bollinger_High', 'Bollinger_Low', 'ADX','Open', 'High', 'Low', 'Volume', '%K']]
latest_processed_data = latest_processed_data.iloc[-20:]

# Make predictions for tomorrow's stock trend
tomorrow_pred = predict_model(rf_model, data=latest_processed_data)
print(tomorrow_pred)

[*********************100%***********************]  1 of 1 completed
Transformation Pipeline and Model Successfully Loaded
[*********************100%***********************]  1 of 1 completed


                 Close   Adj Close        RSI         OBV       CMF  \
Date                                                                  
2023-03-21  197.580002  197.580002  57.668644  3269005900  0.163797   
2023-03-22  191.149994  191.149994  53.222130  3118629500  0.069164   
2023-03-23  192.220001  192.220001  53.859692  3262823400  0.038841   
2023-03-24  190.410004  190.410004  52.554817  3146291800  0.016930   
2023-03-27  191.809998  191.809998  53.493355  3267143400 -0.033669   
2023-03-28  189.190002  189.190002  51.442471  3168488800 -0.006291   
2023-03-29  193.880005  193.880005  54.784328  3292148800  0.017049   
2023-03-30  195.279999  195.279999  55.763035  3402401000 -0.015665   
2023-03-31  207.460007  207.460007  63.221725  3572039500  0.024874   
2023-04-03  194.770004  194.770004  53.164120  3402493600  0.017599   
2023-04-04  192.580002  192.580002  51.637398  3276029800  0.028537   
2023-04-05  185.520004  185.520004  46.955982  3142147300  0.026110   
2023-0

In [45]:
print(processed_data)

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2018-05-17   19.059999   19.279333   18.931334   18.969334   18.969334   
2018-05-18   18.976667   18.976667   18.266666   18.454666   18.454666   
2018-05-21   18.755333   19.432667   18.753332   18.966000   18.966000   
2018-05-22   19.184000   19.200001   18.228001   18.334000   18.334000   
2018-05-23   18.517332   18.660667   18.266666   18.604668   18.604668   
...                ...         ...         ...         ...         ...   
2023-04-12  190.740005  191.580002  180.309998  180.539993  180.539993   
2023-04-13  182.960007  186.500000  180.940002  185.899994  185.899994   
2023-04-14  183.949997  186.279999  182.009995  185.000000  185.000000   
2023-04-17  186.320007  189.690002  182.690002  187.039993  187.039993   
2023-04-18  187.149994  187.690002  183.580002  184.309998  184.309998   

               Volume         %K     

In [46]:
# Load the previously trained and tuned RandomForestClassifier model
rf_model = load_model('Final RandomForest Classifier Model')

# Make predictions for tomorrow's stock trend
tomorrow_pred = predict_model(rf_model, data=processed_data)
print(tomorrow_pred)

Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9071,0.9702,0.914,0.9098,0.9119,0.8137,0.8137


                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2018-05-17   19.059999   19.279333   18.931334   18.969334   18.969334   
2018-05-18   18.976667   18.976667   18.266666   18.454666   18.454666   
2018-05-21   18.755333   19.432667   18.753332   18.966000   18.966000   
2018-05-22   19.184000   19.200001   18.228001   18.334000   18.334000   
2018-05-23   18.517332   18.660667   18.266666   18.604668   18.604668   
...                ...         ...         ...         ...         ...   
2023-04-12  190.740005  191.580002  180.309998  180.539993  180.539993   
2023-04-13  182.960007  186.500000  180.940002  185.899994  185.899994   
2023-04-14  183.949997  186.279999  182.009995  185.000000  185.000000   
2023-04-17  186.320007  189.690002  182.690002  187.039993  187.039993   
2023-04-18  187.149994  187.690002  183.580002  184.309998  184.309998   

               Volume         %K     

## LSTM model performance

In [47]:
!pip install tensorflow keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [48]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [49]:
# Function to create input data and target data for LSTM
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        dataX.append(dataset[i:(i + look_back), :])
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)

# Set a look-back value
look_back = 20

# Prepare the data
data = processed_data.drop('Trend', axis=1).values
scaler = MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(data)
X, y = create_dataset(data, look_back)

# Create a scaler for the target variable
target_scaler = MinMaxScaler(feature_range=(0, 1))
y = target_scaler.fit_transform(y.reshape(-1, 1))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)


In [50]:
# Create the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the LSTM model
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f955ed51130>

In [51]:
# Evaluate the LSTM model
trainPredict = model.predict(X_train)
testPredict = model.predict(X_test)

# Inverse transform the predictions and actual values
trainPredict = target_scaler.inverse_transform(trainPredict)
trainY_actual = target_scaler.inverse_transform(y_train)
testPredict = target_scaler.inverse_transform(testPredict)
testY_actual = target_scaler.inverse_transform(y_test)

# Calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY_actual, trainPredict[:, 0]))
print('Train Score: %.2f RMSE' % trainScore)
testScore = np.sqrt(mean_squared_error(testY_actual, testPredict[:, 0]))
print('Test Score: %.2f RMSE' % testScore)

Train Score: 0.01 RMSE
Test Score: 0.01 RMSE


In [52]:
def mean_absolute_percentage_error(y_true, y_pred, epsilon=1e-10):
    return np.mean(np.abs((y_true - y_pred) / (y_true + epsilon))) * 100

train_mape = mean_absolute_percentage_error(trainY_actual, trainPredict[:, 0])
test_mape = mean_absolute_percentage_error(testY_actual, testPredict[:, 0])

print(f'Train MAPE: {train_mape:.2f}%')
print(f'Test MAPE: {test_mape:.2f}%')

Train MAPE: 346553681.13%
Test MAPE: 1025.93%


##MAPE result from LSTM

In [53]:
# Remove the problematic data point
trainY_actual_filtered = np.delete(trainY_actual, 635)
trainPredict_filtered = np.delete(trainPredict, 635, axis=0)

# Recalculate the MAPE
train_mape = mean_absolute_percentage_error(trainY_actual_filtered, trainPredict_filtered[:, 0])
test_mape = mean_absolute_percentage_error(testY_actual, testPredict[:, 0])

print(f'Train MAPE: {train_mape:.2f}%')
print(f'Test MAPE: {test_mape:.2f}%')

Train MAPE: 36052.90%
Test MAPE: 1025.93%


## Number of accuracy on train and test using LSTM

In [54]:
def calculate_accuracy(y_true, y_pred):
    y_true_diff = np.diff(y_true.flatten())
    y_pred_diff = np.diff(y_pred.flatten())
    
    y_true_trend = (y_true_diff >= 0).astype(int)
    y_pred_trend = (y_pred_diff >= 0).astype(int)
    
    correct_predictions = np.sum(y_true_trend == y_pred_trend)
    accuracy = correct_predictions / len(y_true_trend) * 100
    return accuracy

train_accuracy = calculate_accuracy(trainY_actual, trainPredict[:, 0])
test_accuracy = calculate_accuracy(testY_actual, testPredict[:, 0])

print(f'Train accuracy: {train_accuracy:.2f}%')
print(f'Test accuracy: {test_accuracy:.2f}%')

Train accuracy: 97.84%
Test accuracy: 97.53%


## Random Forest Manual

#Data preprocessing

In [None]:
# Prepare the latest data
latest_data = latest_processed_data.values
latest_data_scaled = scaler.transform(latest_data)
X_latest, _ = create_dataset(latest_data_scaled, look_back)

# Make predictions for tomorrow's stock trend
tomorrow_pred_lstm = model.predict(X_latest[-1].reshape(1, look_back, -1))
tomorrow_pred_lstm = scaler.inverse_transform(tomorrow_pred_lstm)
print(tomorrow_pred_lstm)

In [59]:
X, y = prepare_data_for_feature_selection(processed_data, 'Trend')

In [60]:
k = 5  # Change this value to select a different number of top features
selector = SelectKBest(score_func=f_regression, k=k)
selector.fit(X, y)

cols = selector.get_support(indices=True)
selected_features = X.iloc[:, cols]
print("Top", k, "features:")
print(selected_features.columns)

Top 5 features:
Index(['%K', 'RSI', 'Bollinger_Low', 'ADX', 'CMF'], dtype='object')


In [61]:
# Create a new DataFrame with the selected features, target column, and 'Close' column
final_data = pd.concat([selected_features, processed_data[['Close', 'Trend']]], axis=1)

print("\nFinal data:")
print(final_data.head())


Final data:
                   %K        RSI  Bollinger_Low  ADX       CMF      Close  \
Date                                                                        
2018-05-17  24.655726  42.010107      18.419726  0.0  0.036680  18.969334   
2018-05-18   7.232615  36.403300      18.282131  0.0  0.048947  18.454666   
2018-05-21  26.904334  44.350037      18.292742  0.0  0.050082  18.966000   
2018-05-22   4.018160  38.025525      18.186517  0.0 -0.007730  18.334000   
2018-05-23  14.278515  41.850128      18.167707  0.0  0.026370  18.604668   

            Trend  
Date               
2018-05-17      0  
2018-05-18      1  
2018-05-21      0  
2018-05-22      1  
2018-05-23      0  


In [62]:
processed_data.head(100)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,%K,RSI,SMA,EMA,Bollinger_High,Bollinger_Low,ADX,OBV,CMF,Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-05-17,19.059999,19.279333,18.931334,18.969334,18.969334,66309000,24.655726,42.010107,19.518900,19.517911,20.618075,18.419726,0.000000,-85444500,0.036680,0
2018-05-18,18.976667,18.976667,18.266666,18.454666,18.454666,108778500,7.232615,36.403300,19.474167,19.416650,20.666202,18.282131,0.000000,-194223000,0.048947,1
2018-05-21,18.755333,19.432667,18.753332,18.966000,18.966000,137739000,26.904334,44.350037,19.477900,19.373731,20.663058,18.292742,0.000000,-56484000,0.050082,0
2018-05-22,19.184000,19.200001,18.228001,18.334000,18.334000,134187000,4.018160,38.025525,19.449733,19.274709,20.712950,18.186517,0.000000,-190671000,-0.007730,1
2018-05-23,18.517332,18.660667,18.266666,18.604668,18.604668,89776500,14.278515,41.850128,19.444333,19.210895,20.720960,18.167707,0.000000,-100894500,0.026370,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-10-02,20.930000,21.122667,19.943333,20.068001,20.068001,176152500,71.890557,50.260558,19.451933,19.890946,21.178463,17.725404,15.921375,-1158739500,0.096430,0
2018-10-03,20.222000,20.306667,19.438000,19.653334,19.653334,119925000,60.838664,48.221850,19.498800,19.868316,21.193471,17.804129,15.181195,-1278664500,0.087117,0
2018-10-04,19.596666,19.600000,18.511333,18.788668,18.788668,147213000,37.793200,44.196116,19.501733,19.765493,21.191267,17.812200,15.138048,-1425877500,0.086865,0
2018-10-05,18.309999,18.325333,17.333332,17.463333,17.463333,269167500,3.430709,38.843333,19.497433,19.546239,21.207138,17.787728,15.803420,-1695045000,-0.009169,0


In [63]:
processed_data.to_csv("tesla_processed_data.csv", index=False)

In [64]:
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [65]:
X = final_data.drop('Trend', axis=1)
y = final_data['Trend']

In [66]:
# # Prepare the dataset
# X = processed_data.drop('Trend', axis=1)
# y = processed_data['Trend']

In [67]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)


In [68]:
# Train the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [69]:
# Make predictions
y_pred = clf.predict(X_test)

In [70]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5282258064516129
Accuracy: 52.82%
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.44      0.49       128
           1       0.51      0.62      0.56       120

    accuracy                           0.53       248
   macro avg       0.53      0.53      0.53       248
weighted avg       0.53      0.53      0.52       248



In [71]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 46.37%


In [72]:
print("Target variable distribution:")
print(y.value_counts(normalize=True) * 100)

Target variable distribution:
1    52.584814
0    47.415186
Name: Trend, dtype: float64


In [73]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression

tscv = TimeSeriesSplit(n_splits=5)

X1 = X.values
y1 = y.values

for train_index, test_index in tscv.split(X1):
    X_train1, X_test1 = X1[train_index], X1[test_index]
    y_train1, y_test1 = y1[train_index], y1[test_index]
    
    # Train the Logistic Regression classifier
    clf = LogisticRegression(random_state=42)
    clf.fit(X_train1, y_train1)

    # Make predictions
    y_pred = clf.predict(X_test1)

    # Evaluate the model
    accuracy = accuracy_score(y_test1, y_pred)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 47.57%
Accuracy: 50.49%
Accuracy: 52.43%
Accuracy: 50.00%
Accuracy: 45.15%


In [74]:
# Predict tomorrow's trend
tomorrow_data = X1[-1]
tomorrow_pred = clf.predict([tomorrow_data])

# Print tomorrow's estimated trend along with the data row
print("\nTomorrow's Data:")
print(tomorrow_data)
print("\nTomorrow's Estimated Trend:", 'Uptrend' if tomorrow_pred[0] == 1 else 'Downtrend')


Tomorrow's Data:
[2.58838347e+01 4.69456057e+01 1.78198337e+02 1.10902229e+01
 8.84792985e-04 1.84309998e+02]

Tomorrow's Estimated Trend: Uptrend
