In [None]:
pip install yfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install ta

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ta
  Downloading ta-0.10.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.10.2-py3-none-any.whl size=29103 sha256=164ddb8c954fa27dbc64285f22b84e44a3f862e2159dbedb7da59a6f45c7daae
  Stored in directory: /root/.cache/pip/wheels/6a/76/03/7f785aaa50b9c6ec7e3fd105a62c1b2c45a034512d51e024a9
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.10.2


In [None]:
import yfinance as yf
import pandas as pd
import ta
from datetime import datetime, timedelta
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
import warnings

warnings.filterwarnings('ignore', category=RuntimeWarning, message='invalid value encountered in.*')


def get_stock_data(ticker, years=6):
    end_date = datetime.today()
    start_date = end_date - timedelta(days=365*years)

    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data

def add_technical_indicators(data):
    # Add Stochastic Oscillator (%K)
    stoch = ta.momentum.StochasticOscillator(data['High'], data['Low'], data['Close'])
    data['%K'] = stoch.stoch()

    # Add Relative Strength Index (RSI)
    rsi = ta.momentum.RSIIndicator(data['Close'])
    data['RSI'] = rsi.rsi()

    # Add Simple Moving Average (SMA)
    sma = ta.trend.SMAIndicator(data['Close'], window=20)
    data['SMA'] = sma.sma_indicator()

    # Add Exponential Moving Average (EMA)
    ema = ta.trend.EMAIndicator(data['Close'], window=20)
    data['EMA'] = ema.ema_indicator()

    # Add Bollinger Bands
    bollinger = ta.volatility.BollingerBands(data['Close'])
    data['Bollinger_High'] = bollinger.bollinger_hband()
    data['Bollinger_Low'] = bollinger.bollinger_lband(

    # Add Average Directional Index (ADX)
    adx = ta.trend.ADXIndicator(data['High'], data['Low'], data['Close'])
    data['ADX'] = adx.adx()

    # Add On Balance Volume (OBV)
    obv = ta.volume.OnBalanceVolumeIndicator(data['Close'], data['Volume'])
    data['OBV'] = obv.on_balance_volume()

    # Add Chaikin Money Flow (CMF)
    cmf = ta.volume.ChaikinMoneyFlowIndicator(data['High'], data['Low'], data['Close'], data['Volume'])
    data['CMF'] = cmf.chaikin_money_flow()

    return data

In [None]:
ticker = "TSLA"  # Tesla stock symbol
raw_data = get_stock_data(ticker)
processed_data = add_technical_indicators(raw_data)

processed_data.head(50)


[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,%K,RSI,OBV,CMF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-04-11,20.892,20.898001,20.366667,20.580667,20.580667,85869000,,,85869000,
2017-04-12,20.422667,20.563334,19.754667,19.789333,19.789333,90760500,,,-4891500,
2017-04-13,19.780001,20.492666,19.686666,20.266666,20.266666,139269000,,,134377500,
2017-04-17,20.18,20.266666,19.912001,20.096001,20.096001,62080500,,,72297000,
2017-04-18,19.98,20.056,19.860001,20.016666,20.016666,45535500,,,26761500,
2017-04-19,20.164,20.441334,20.140667,20.368,20.368,58470000,,,85231500,
2017-04-20,20.434,20.610001,20.015333,20.167334,20.167334,92241000,,,-7009500,
2017-04-21,20.133333,20.426666,20.028,20.373333,20.373333,67647000,,,60637500,
2017-04-24,20.614668,20.703333,20.401333,20.535334,20.535334,76252500,,,136890000,
2017-04-25,20.533333,20.931999,20.390667,20.919333,20.919333,101065500,,,237955500,


In [None]:
def create_target_and_preprocess(data, shift=1):
    data['Future_Close'] = data['Close'].shift(-shift)
    data['Trend'] = (data['Future_Close'] > data['Close']).astype(int)
    data = data.drop(['Future_Close'], axis=1)
    data = data.dropna()
    return data

def prepare_data_for_feature_selection(data, target_col):
    data = data.copy()
    X = data.drop([target_col], axis=1)
    y = data[target_col]
    return X, y

In [None]:
ticker = "TSLA"  # Tesla stock symbol
raw_data = get_stock_data(ticker)
processed_data = add_technical_indicators(raw_data)
processed_data = create_target_and_preprocess(processed_data)
processed_data = processed_data.dropna()

[*********************100%***********************]  1 of 1 completed


In [None]:
processed_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,%K,RSI,SMA,EMA,Bollinger_High,Bollinger_Low,ADX,OBV,CMF,Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2017-05-09,20.625334,21.466,20.606667,21.417334,21.417334,145147500,82.655837,58.706452,20.548533,20.65566,21.510998,19.586068,0.0,78214500,0.038443,1
2017-05-10,21.437332,21.700001,21.208,21.681334,21.681334,86124000,93.387545,60.977059,20.603567,20.753343,21.685539,19.521594,0.0,164338500,0.088723,0
2017-05-11,21.559999,21.733334,21.306667,21.540001,21.540001,71307000,87.642315,59.103379,20.6911,20.828263,21.778668,19.603532,0.0,93031500,0.13694,1
2017-05-12,21.698668,21.799999,21.435333,21.653999,21.653999,61824000,92.276398,60.166585,20.760467,20.906905,21.906312,19.614621,0.0,154855500,0.115903,0
2017-05-15,21.225332,21.346666,20.835333,21.058666,21.058666,114330000,68.075857,52.491781,20.8086,20.921358,21.919085,19.698115,0.0,40525500,0.103686,1


##NEW IMPLEMENTATION - Running AutoML

In [None]:
#Install the pycaret library, which is an AutoML library in Python.

!pip install pycaret

from pycaret.classification import *

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.0-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 KB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting wurlitzer
  Downloading wurlitzer-3.0.3-py3-none-any.whl (7.3 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 KB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
Collecting sktime>=0.16.1
  Downloading sktime-0.17.0-py3-none-any.whl (16.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
Collecting pmdarima!=1.8.1,<3.0.0,>=1.8.0
  Downloading pmdarima-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.

In [None]:
# Initialize pycaret
exp_clf = setup(data=processed_data, target='Trend', train_size=0.8, session_id=42, preprocess=False)

# Train and compare models
best_model = compare_models(sort='Accuracy', n_select=1)

# Finalize the model
final_model = finalize_model(best_model)

# Make predictions on the test set
test_data = processed_data.iloc[-100:]  # Assuming you want to use the last 100 rows for testing
predictions = predict_model(final_model, data=test_data)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Trend
2,Target type,Binary
3,Original data shape,"(1489, 16)"
4,Transformed data shape,"(1489, 16)"
5,Transformed train set shape,"(1191, 16)"
6,Transformed test set shape,"(298, 16)"
7,Numeric features,15


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5256,0.5334,0.5952,0.5403,0.566,0.0454,0.0457,0.124
rf,Random Forest Classifier,0.5248,0.5218,0.5435,0.5419,0.5417,0.0481,0.0484,0.222
xgboost,Extreme Gradient Boosting,0.5239,0.5207,0.5323,0.542,0.5363,0.0472,0.0472,0.058
dummy,Dummy Classifier,0.5206,0.5,1.0,0.5206,0.6847,0.0,0.0,0.085
knn,K Neighbors Classifier,0.5197,0.5383,0.5403,0.5385,0.538,0.0377,0.0379,0.066
lightgbm,Light Gradient Boosting Machine,0.5173,0.5256,0.5387,0.5369,0.5367,0.0328,0.0328,0.105
lr,Logistic Regression,0.5172,0.4908,0.9823,0.5191,0.679,-0.0057,-0.0172,0.064
ridge,Ridge Classifier,0.5122,0.0,0.6113,0.5282,0.5656,0.0158,0.0156,0.071
nb,Naive Bayes,0.5113,0.4679,0.95,0.5163,0.6687,-0.0155,-0.0176,0.087
lda,Linear Discriminant Analysis,0.5113,0.5042,0.6097,0.5276,0.5646,0.0142,0.0139,0.054


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.91,0.9726,0.9796,0.8571,0.9143,0.8204,0.8286


#The first result (91% accuracy) is based on the validation set used during the training process in PyCaret.

In [None]:
# Dictionary to map the model names to their abbreviations
model_name_to_abbr = {
    'GradientBoostingClassifier': 'Gradient Boosting Classifier',
    'RandomForestClassifier': 'Random Forest Classifier',
    'XGBClassifier': 'Extreme Gradient Boosting',
    # Add more mappings if needed
}

# Train and compare models, then select the top 3 models based on accuracy
top_3_models = compare_models(sort='Accuracy', n_select=3)

# Get the results of the model comparison in a DataFrame format
model_results = pull()

# Print the names and accuracy of the top 3 models
for i, model in enumerate(top_3_models, start=1):
    model_name = type(model).__name__
    matched_name = model_name_to_abbr.get(model_name)
    accuracy = model_results.loc[model_results['Model'] == matched_name]['Accuracy'].values[0]
    print(f"Model {i}: {model_name}, Accuracy: {accuracy:.4f}")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5256,0.5334,0.5952,0.5403,0.566,0.0454,0.0457,0.232
rf,Random Forest Classifier,0.5248,0.5218,0.5435,0.5419,0.5417,0.0481,0.0484,0.202
xgboost,Extreme Gradient Boosting,0.5239,0.5207,0.5323,0.542,0.5363,0.0472,0.0472,0.06
dummy,Dummy Classifier,0.5206,0.5,1.0,0.5206,0.6847,0.0,0.0,0.049
knn,K Neighbors Classifier,0.5197,0.5383,0.5403,0.5385,0.538,0.0377,0.0379,0.109
lightgbm,Light Gradient Boosting Machine,0.5173,0.5256,0.5387,0.5369,0.5367,0.0328,0.0328,0.072
lr,Logistic Regression,0.5172,0.4908,0.9823,0.5191,0.679,-0.0057,-0.0172,0.1
ridge,Ridge Classifier,0.5122,0.0,0.6113,0.5282,0.5656,0.0158,0.0156,0.046
nb,Naive Bayes,0.5113,0.4679,0.95,0.5163,0.6687,-0.0155,-0.0176,0.069
lda,Linear Discriminant Analysis,0.5113,0.5042,0.6097,0.5276,0.5646,0.0142,0.0139,0.07


Processing:   0%|          | 0/67 [00:00<?, ?it/s]

Model 1: GradientBoostingClassifier, Accuracy: 0.5256
Model 2: RandomForestClassifier, Accuracy: 0.5248
Model 3: XGBClassifier, Accuracy: 0.5239


#The second result (around 52% accuracy) is the performance on the test data which was previously unseen by the model. These results show that the model is not generalizing well on the test data. Therefore, try best hyperparameter tuning and run the models again. Use tune_model() function

In [None]:
# Initialize pycaret
exp_clf = setup(data=processed_data, target='Trend', train_size=0.8, session_id=42, preprocess=False)

# Train and compare models
models_to_compare = ['gbc', 'rf', 'xgboost'] #Top 3 performed models based on the previous code.
tuned_models = []
for model in models_to_compare:
    base_model = create_model(model)
    tuned_model = tune_model(base_model)
    tuned_models.append(tuned_model)

# Test the best tuned model on the last 100 rows of the processed_data dataframe
test_data = processed_data.iloc[-100:]
predictions = predict_model(tuned_models[0], data=test_data)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Trend
2,Target type,Binary
3,Original data shape,"(1489, 16)"
4,Transformed data shape,"(1489, 16)"
5,Transformed train set shape,"(1191, 16)"
6,Transformed test set shape,"(298, 16)"
7,Numeric features,15


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5667,0.5862,0.629,0.5735,0.6,0.1295,0.1301
1,0.5462,0.5659,0.6774,0.5526,0.6087,0.0817,0.0842
2,0.4874,0.4665,0.5161,0.5079,0.512,-0.0278,-0.0278
3,0.5294,0.5207,0.5806,0.5455,0.5625,0.0545,0.0546
4,0.437,0.427,0.5,0.4627,0.4806,-0.132,-0.1325
5,0.5966,0.5869,0.6613,0.6029,0.6308,0.1884,0.1894
6,0.4454,0.4535,0.5,0.4697,0.4844,-0.1144,-0.1146
7,0.6134,0.635,0.6452,0.625,0.6349,0.2244,0.2246
8,0.479,0.4733,0.5968,0.5,0.5441,-0.0528,-0.0539
9,0.5546,0.6187,0.6452,0.5634,0.6015,0.102,0.1032


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6,0.5428,0.6774,0.6,0.6364,0.1955,0.1973
1,0.4622,0.4301,0.6129,0.4872,0.5429,-0.0899,-0.0934
2,0.4202,0.4278,0.4032,0.4386,0.4202,-0.1576,-0.1582
3,0.605,0.6098,0.5968,0.6271,0.6116,0.2104,0.2106
4,0.4538,0.4264,0.5,0.4769,0.4882,-0.0967,-0.0968
5,0.5294,0.5767,0.5484,0.5484,0.5484,0.0572,0.0572
6,0.4874,0.4394,0.5,0.5082,0.5041,-0.0263,-0.0263
7,0.5966,0.6135,0.6935,0.5972,0.6418,0.1861,0.1888
8,0.5462,0.5195,0.6129,0.5588,0.5846,0.087,0.0874
9,0.5378,0.5985,0.5645,0.5556,0.56,0.0733,0.0734


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.525,0.5546,0.5323,0.541,0.5366,0.0495,0.0495
1,0.5462,0.5252,0.6452,0.5556,0.597,0.0844,0.0856
2,0.5042,0.487,0.4839,0.5263,0.5042,0.0102,0.0102
3,0.5378,0.542,0.5806,0.5538,0.5669,0.072,0.0721
4,0.437,0.4041,0.4355,0.4576,0.4463,-0.1257,-0.1258
5,0.5294,0.5613,0.5161,0.5517,0.5333,0.0598,0.06
6,0.4622,0.4467,0.4355,0.4821,0.4576,-0.073,-0.0734
7,0.5966,0.5965,0.6774,0.6,0.6364,0.1873,0.189
8,0.5294,0.5265,0.5645,0.5469,0.5556,0.0558,0.0559
9,0.5798,0.5747,0.5645,0.6034,0.5833,0.1606,0.1609


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.525,0.483,0.5968,0.5362,0.5649,0.0452,0.0455
1,0.4958,0.5068,0.629,0.5132,0.5652,-0.0203,-0.0209
2,0.4874,0.4621,0.5161,0.5079,0.512,-0.0278,-0.0278
3,0.6218,0.6115,0.6774,0.6269,0.6512,0.2397,0.2405
4,0.4454,0.4454,0.5323,0.4714,0.5,-0.1175,-0.1186
5,0.5462,0.5959,0.5484,0.5667,0.5574,0.0921,0.0922
6,0.4874,0.461,0.5161,0.5079,0.512,-0.0278,-0.0278
7,0.5798,0.607,0.7097,0.5789,0.6377,0.1498,0.1542
8,0.5126,0.5006,0.6129,0.5278,0.5672,0.0165,0.0168
9,0.5294,0.5804,0.6774,0.5385,0.6,0.0464,0.0482


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.55,0.5514,0.5806,0.5625,0.5714,0.098,0.0981
1,0.479,0.4816,0.5161,0.5,0.5079,-0.0453,-0.0454
2,0.4706,0.4462,0.4677,0.4915,0.4793,-0.0584,-0.0585
3,0.5966,0.5617,0.6129,0.6129,0.6129,0.1919,0.1919
4,0.4958,0.4731,0.4516,0.5185,0.4828,-0.0045,-0.0045
5,0.563,0.5504,0.5968,0.5781,0.5873,0.1233,0.1233
6,0.4622,0.4516,0.4032,0.4808,0.4386,-0.07,-0.071
7,0.563,0.5707,0.629,0.5735,0.6,0.1208,0.1214
8,0.4706,0.4833,0.4839,0.4918,0.4878,-0.0599,-0.06
9,0.5882,0.6364,0.5806,0.6102,0.595,0.1768,0.177


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.55,0.5517,0.9677,0.5357,0.6897,0.0732,0.1426
1,0.4958,0.4675,0.8871,0.5093,0.6471,-0.0442,-0.0737
2,0.521,0.5082,0.9355,0.5225,0.6705,0.0059,0.0113
3,0.563,0.6089,0.9677,0.5455,0.6977,0.0937,0.1711
4,0.521,0.4677,0.9194,0.5229,0.6667,0.0073,0.0127
5,0.5714,0.5614,0.9194,0.5534,0.6909,0.1157,0.1645
6,0.5042,0.4533,0.871,0.5143,0.6467,-0.0245,-0.0369
7,0.521,0.5925,0.9032,0.5234,0.6627,0.0088,0.0141
8,0.5042,0.5153,0.9032,0.5138,0.655,-0.0275,-0.0479
9,0.5462,0.6469,0.9516,0.5364,0.686,0.0589,0.1075


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.89,0.9328,0.898,0.88,0.8889,0.78,0.7802


In [None]:
# Get the accuracy of each model
for i, model in enumerate(tuned_models):
    model_name = str(model).split("(")[0]
    accuracy = round(model.score(processed_data.drop('Trend', axis=1), processed_data['Trend']), 4)
    print(f"{model_name}: {accuracy}")

GradientBoostingClassifier: 0.7576
RandomForestClassifier: 0.8972
XGBClassifier: 0.7085


## the accuracy values were obtained after hyperparameter tuning and using the test dataset. The accuracy values for the top 3 models (GradientBoostingClassifier, RandomForestClassifier, and XGBClassifier) were:

GradientBoostingClassifier: 0.7576

RandomForestClassifier: 0.8972

XGBClassifier: 0.7085

These accuracy values suggest that RandomForestClassifier performed the best among the three models with an accuracy of 0.8972.

##Now, we are using hyperparameter tunned RandomForestClassifier with the accuracy of 89.72% to predict the tomorrow's trend of the stock for TESLA.

#Firstly, we will save the rf model and load it later since we checked it performed the best after hyperparameter tunning.

In [None]:
# Train and tune the RandomForestClassifier model
rf_model = tune_model(create_model('rf'))

# Save the trained and tuned RandomForestClassifier model
save_model(rf_model, 'Final RandomForest Classifier Model')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.525,0.5546,0.5323,0.541,0.5366,0.0495,0.0495
1,0.5462,0.5252,0.6452,0.5556,0.597,0.0844,0.0856
2,0.5042,0.487,0.4839,0.5263,0.5042,0.0102,0.0102
3,0.5378,0.542,0.5806,0.5538,0.5669,0.072,0.0721
4,0.437,0.4041,0.4355,0.4576,0.4463,-0.1257,-0.1258
5,0.5294,0.5613,0.5161,0.5517,0.5333,0.0598,0.06
6,0.4622,0.4467,0.4355,0.4821,0.4576,-0.073,-0.0734
7,0.5966,0.5965,0.6774,0.6,0.6364,0.1873,0.189
8,0.5294,0.5265,0.5645,0.5469,0.5556,0.0558,0.0559
9,0.5798,0.5747,0.5645,0.6034,0.5833,0.1606,0.1609


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.525,0.483,0.5968,0.5362,0.5649,0.0452,0.0455
1,0.4958,0.5068,0.629,0.5132,0.5652,-0.0203,-0.0209
2,0.4874,0.4621,0.5161,0.5079,0.512,-0.0278,-0.0278
3,0.6218,0.6115,0.6774,0.6269,0.6512,0.2397,0.2405
4,0.4454,0.4454,0.5323,0.4714,0.5,-0.1175,-0.1186
5,0.5462,0.5959,0.5484,0.5667,0.5574,0.0921,0.0922
6,0.4874,0.461,0.5161,0.5079,0.512,-0.0278,-0.0278
7,0.5798,0.607,0.7097,0.5789,0.6377,0.1498,0.1542
8,0.5126,0.5006,0.6129,0.5278,0.5672,0.0165,0.0168
9,0.5294,0.5804,0.6774,0.5385,0.6,0.0464,0.0482


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('clean_column_names',
                  TransformerWrapper(exclude=None, include=None,
                                     transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
                 ('trained_model',
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='sqrt',
                                         max_leaf_nodes=None, max_samples=None,
                                         min_impurity_decrease=0.0,
                                         min_samples_leaf=1, min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         n_estimators=100, n_jobs=-1,
                                         oob_score=False, random_state=42,
                                

In [None]:
ticker = "TSLA"  # Tesla stock symbol

# Get the historical stock data
raw_data = get_stock_data(ticker)
processed_data = add_technical_indicators(raw_data)
processed_data = create_target_and_preprocess(processed_data)
processed_data = processed_data.dropna()

# Load the saved model
rf_model = load_model('Final RandomForest Classifier Model')

# Get the latest stock data
latest_data = get_stock_data(ticker, years=0)
latest_processed_data = add_technical_indicators(latest_data)
latest_processed_data = latest_processed_data[['Close', 'Adj Close', 'RSI', 'OBV', 'CMF', 'Fast_Stochastic_Oscillator', 'SMA', 'EMA', 'Bollinger_High', 'Bollinger_Low', 'ADX']]
latest_processed_data = latest_processed_data.iloc[-20:]

# Make predictions for tomorrow's stock trend
tomorrow_pred = predict_model(rf_model, data=latest_processed_data)
print(f"Prediction for tomorrow's stock trend: {tomorrow_pred['Label'].values[0]}")

[*********************100%***********************]  1 of 1 completed
Transformation Pipeline and Model Successfully Loaded
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- TSLA: Data doesn't exist for startDate = 1681098744, endDate = 1681098744


KeyError: ignored

In [None]:
print(processed_data)

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2017-05-09   20.625334   21.466000   20.606667   21.417334   21.417334   
2017-05-10   21.437332   21.700001   21.208000   21.681334   21.681334   
2017-05-11   21.559999   21.733334   21.306667   21.540001   21.540001   
2017-05-12   21.698668   21.799999   21.435333   21.653999   21.653999   
2017-05-15   21.225332   21.346666   20.835333   21.058666   21.058666   
...                ...         ...         ...         ...         ...   
2023-03-31  197.529999  207.789993  197.199997  207.460007  207.460007   
2023-04-03  199.910004  202.690002  192.199997  194.770004  194.770004   
2023-04-04  197.320007  198.740005  190.320007  192.580002  192.580002   
2023-04-05  190.520004  190.679993  183.759995  185.520004  185.520004   
2023-04-06  183.080002  186.389999  179.740005  185.059998  185.059998   

               Volume         %K     

In [None]:
# Load the previously trained and tuned RandomForestClassifier model
rf_model = load_model('Final RandomForest Classifier Model')

# Make predictions for tomorrow's stock trend
tomorrow_pred = predict_model(rf_model, data=processed_data)
print(f"Prediction for tomorrow's stock trend: {tomorrow_pred['Label'].values[0]}")

Transformation Pipeline and Model Successfully Loaded


KeyError: ignored

In [None]:
X, y = prepare_data_for_feature_selection(processed_data, 'Trend')

In [None]:
k = 5  # Change this value to select a different number of top features
selector = SelectKBest(score_func=f_regression, k=k)
selector.fit(X, y)

cols = selector.get_support(indices=True)
selected_features = X.iloc[:, cols]
print("Top", k, "features:")
print(selected_features.columns)

Top 5 features:
Index(['Volume', '%K', 'RSI', 'ADX', 'CMF'], dtype='object')


In [None]:
# Create a new DataFrame with the selected features, target column, and 'Close' column
final_data = pd.concat([selected_features, processed_data[['Close', 'Trend']]], axis=1)

print("\nFinal data:")
print(final_data.head())


Final data:
               Volume         %K        RSI  ADX       CMF      Close  Trend
Date                                                                        
2017-05-09  145147500  82.655837  58.706452  0.0  0.038443  21.417334      1
2017-05-10   86124000  93.387545  60.977059  0.0  0.088723  21.681334      0
2017-05-11   71307000  87.642315  59.103379  0.0  0.136940  21.540001      1
2017-05-12   61824000  92.276398  60.166585  0.0  0.115903  21.653999      0
2017-05-15  114330000  68.075857  52.491781  0.0  0.103686  21.058666      1


In [None]:
processed_data.head(100)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,%K,RSI,SMA,EMA,Bollinger_High,Bollinger_Low,ADX,OBV,CMF,Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-04-24,47.387333,48.715332,46.545334,48.343334,48.343334,198180000,82.021013,67.403192,41.086001,43.062072,55.379258,26.792743,0.000000,2373435000,0.016330,1
2020-04-27,49.174000,53.299332,49.000000,53.250000,53.250000,310221000,99.723011,73.938099,42.033967,44.032351,56.902051,27.165883,0.000000,2683656000,0.062100,0
2020-04-28,53.042667,53.666668,50.445999,51.274666,51.274666,228330000,86.792790,68.025545,42.923934,44.722095,57.767108,28.080760,0.000000,2455326000,0.045515,1
2020-04-29,52.678001,53.546665,52.210667,53.367332,53.367332,243240000,98.188700,70.698777,43.845634,45.545451,58.877755,28.813512,0.000000,2698566000,0.070513,0
2020-04-30,57.012669,57.987999,50.900002,52.125332,52.125332,427078500,69.601435,67.112503,44.846700,46.172106,59.272151,30.421250,0.000000,2271487500,0.041225,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-09,118.866669,123.000000,113.836670,122.093330,122.093330,238397400,21.087995,48.200243,132.783834,130.157518,166.046172,99.521496,39.086220,5078734800,0.176428,1
2020-09-10,128.736664,132.996674,120.186668,123.779999,123.779999,254791800,24.019462,49.037247,133.790300,129.550135,164.592845,102.987756,36.495480,5333526600,0.123258,1
2020-09-11,127.313332,127.500000,120.166664,124.239998,124.239998,182152500,24.818951,49.277964,134.598967,129.044408,163.444791,105.753143,34.087967,5515679100,0.113930,1
2020-09-14,126.983330,140.000000,124.433334,139.873337,139.873337,249061800,51.990036,56.754137,136.090267,130.075734,162.702161,109.478373,32.847327,5764740900,0.152542,1


In [None]:
processed_data.to_csv("tesla_processed_data.csv", index=False)

In [None]:
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [None]:
X = final_data.drop('Trend', axis=1)
y = final_data['Trend']

In [None]:
# # Prepare the dataset
# X = processed_data.drop('Trend', axis=1)
# y = processed_data['Trend']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)


In [None]:
# Train the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = clf.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5102040816326531
Accuracy: 51.02%
Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.52      0.47        61
           1       0.60      0.50      0.54        86

    accuracy                           0.51       147
   macro avg       0.51      0.51      0.51       147
weighted avg       0.53      0.51      0.51       147



In [None]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression classifier
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 57.82%


In [None]:
print("Target variable distribution:")
print(y.value_counts(normalize=True) * 100)

Target variable distribution:
1    53.605442
0    46.394558
Name: Trend, dtype: float64


In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression

tscv = TimeSeriesSplit(n_splits=5)

X1 = X.values
y1 = y.values

for train_index, test_index in tscv.split(X1):
    X_train1, X_test1 = X1[train_index], X1[test_index]
    y_train1, y_test1 = y1[train_index], y1[test_index]
    
    # Train the Logistic Regression classifier
    clf = LogisticRegression(random_state=42)
    clf.fit(X_train1, y_train1)

    # Make predictions
    y_pred = clf.predict(X_test1)

    # Evaluate the model
    accuracy = accuracy_score(y_test1, y_pred)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 46.72%
Accuracy: 56.56%
Accuracy: 53.28%
Accuracy: 59.84%
Accuracy: 46.72%


In [None]:
# Predict tomorrow's trend
tomorrow_data = X1[-1]
tomorrow_pred = clf.predict([tomorrow_data])

# Print tomorrow's estimated trend along with the data row
print("\nTomorrow's Data:")
print(tomorrow_data)
print("\nTomorrow's Estimated Trend:", 'Uptrend' if tomorrow_pred[0] == 1 else 'Downtrend')


Tomorrow's Data:
[1.88587000e+02 1.87857383e+02 2.08640875e+02 1.68533125e+02
 1.69105047e-02 1.90410004e+02]

Tomorrow's Estimated Trend: Uptrend
