In [3]:
# Import the required libraries and dependencies
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#READ IN DATA
df= pd.read_csv("Resources/dataset.csv", index_col="2021 - 2023 AAPL Daily Data (Index)", infer_datetime_format=True, parse_dates=True)



Index(['EBIDTA', 'EV/EBITDA', 'EPS', 'P/E', '1 YR', '5 YR', '30 YR', '50 MA',
       '200 MA', 'News', 'News Score', 'Percent Change',
       'Target - B/H/S (based on close - daily % change)'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0_level_0,EBIDTA,EV/EBITDA,EPS,P/E,1 YR,5 YR,30 YR,50 MA,200 MA,News,News Score,Percent Change,Target - B/H/S (based on close - daily % change)
2021 - 2023 AAPL Daily Data (Index),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-06-30,31260000000,100.250802,1.53,126.777779,0.08,0.85,2.04,176.6054,154.7733,POSITIVE,0.748121,0.023103,Buy
2023-06-29,31260000000,98.047712,1.53,123.91503,0.08,0.8,1.98,176.0786,154.57575,POSITIVE,0.748121,0.001797,Buy
2023-06-28,31260000000,97.876698,1.53,123.69281,0.07,0.78,1.92,175.6162,154.4076,POSITIVE,0.999122,0.006328,Buy
2023-06-27,31260000000,97.278141,1.53,122.915031,0.08,0.79,1.93,175.1358,154.234,POSITIVE,0.977651,0.015059,Buy
2023-06-26,31260000000,95.874808,1.53,121.091506,0.07,0.7,1.81,174.6788,154.07275,POSITIVE,0.691947,-0.007553,Sell


In [11]:
#Change news sentiment to numeric values
df['News'] = df['News'].replace({'POSITIVE': 1, 'NEGATIVE': -1, 'NEUTRAL': 0})


In [12]:
#Change 'Target - B/H/S (based on close - daily % change)' to numeric values
df['Target - B/H/S (based on close - daily % change)'] = df['Target - B/H/S (based on close - daily % change)'].replace({'Buy': 1, 'Hold': 0, 'Sell': -1})

In [13]:
df.head()

Unnamed: 0_level_0,EBIDTA,EV/EBITDA,EPS,P/E,1 YR,5 YR,30 YR,50 MA,200 MA,News,News Score,Percent Change,Target - B/H/S (based on close - daily % change)
2021 - 2023 AAPL Daily Data (Index),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-06-30,31260000000,100.250802,1.53,126.777779,0.08,0.85,2.04,176.6054,154.7733,1,0.748121,0.023103,1
2023-06-29,31260000000,98.047712,1.53,123.91503,0.08,0.8,1.98,176.0786,154.57575,1,0.748121,0.001797,1
2023-06-28,31260000000,97.876698,1.53,123.69281,0.07,0.78,1.92,175.6162,154.4076,1,0.999122,0.006328,1
2023-06-27,31260000000,97.278141,1.53,122.915031,0.08,0.79,1.93,175.1358,154.234,1,0.977651,0.015059,1
2023-06-26,31260000000,95.874808,1.53,121.091506,0.07,0.7,1.81,174.6788,154.07275,1,0.691947,-0.007553,-1


In [14]:
# Select features and target variable
X = df.drop(columns=["Target - B/H/S (based on close - daily % change)"])
y = df["Target - B/H/S (based on close - daily % change)"]

# Split the data into training and testing sets
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Create a pipeline for preprocessing and modeling
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression())
])

# Define the hyperparameters grid for grid search
param_grid = {
    "model__C": [0.1, 1.0, 10.0],
    "model__penalty": ["l1", "l2"],
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the model on training data
y_train_pred = best_model.predict(X_train)
train_report = classification_report(y_train, y_train_pred)
print("Training Report:")
print(train_report)

# Evaluate the model on testing data
y_test_pred = best_model.predict(X_test)
test_report = classification_report(y_test, y_test_pred)
print("Testing Report:")
print(test_report)

Training Report:
              precision    recall  f1-score   support

          -1       0.99      1.00      1.00       196
           1       1.00      0.99      1.00       198

    accuracy                           1.00       394
   macro avg       1.00      1.00      1.00       394
weighted avg       1.00      1.00      1.00       394

Testing Report:
              precision    recall  f1-score   support

          -1       0.93      1.00      0.97        43
           1       1.00      0.95      0.97        56

    accuracy                           0.97        99
   macro avg       0.97      0.97      0.97        99
weighted avg       0.97      0.97      0.97        99



9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Users\Roberts\anaconda3\envs\dev\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Users\Roberts\anaconda3\envs\dev\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "d:\Users\Roberts\anaconda3\envs\dev\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Users\Roberts\anaconda3\envs\dev\lib\site-packages\sk