## Import required libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

## Load the data

In [2]:
# Feature matrices are kept as DataFrames.
X_train_scaled = pd.read_csv('../data/processed/X_train_scaled.csv')
X_test_scaled = pd.read_csv('../data/processed/X_test_scaled.csv')

# Targets are squeezed along the column axis so a one-column CSV becomes a 1-D
# Series. Passing the (n, 1) DataFrame straight to `fit` is what triggers
# scikit-learn's column-vector warning on the estimators below.
# `squeeze('columns')` is a no-op if a file ever holds more than one column.
y_train = pd.read_csv('../data/processed/y_train.csv').squeeze('columns')
y_test = pd.read_csv('../data/processed/y_test.csv').squeeze('columns')

## Logistic Regression

In [3]:
# Logistic regression with library-default hyperparameters, fitted on the training data
model = LogisticRegression().fit(X_train_scaled, y_train)

# Predicted labels for the test features
y_pred = model.predict(X_test_scaled)

  y = column_or_1d(y, warn=True)


In [4]:
#with open('../data/models/version8_logisticreg(0.61).pkl', 'wb') as file:
#    pickle.dump(model, file)

In [5]:
# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.60


In [6]:
# Per-class precision / recall / F1 / support for the current predictions
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification Report:', class_report, sep='\n')

Classification Report:
              precision    recall  f1-score   support

       False       0.78      0.43      0.56       471
        True       0.51      0.83      0.63       335

    accuracy                           0.60       806
   macro avg       0.64      0.63      0.59       806
weighted avg       0.67      0.60      0.59       806



## Random Forest Classifier

In [7]:
# Build a 100-tree random forest with a fixed seed and fit it on the training data
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predicted labels for the test features
y_pred = model.predict(X_test_scaled)

  return fit_method(estimator, *args, **kwargs)


In [8]:
#with open('../data/models/version4_randomforest(0.68).pkl', 'wb') as file:
#    pickle.dump(model, file)

In [9]:
# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.70


In [10]:
# Per-class precision / recall / F1 / support for the current predictions
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification Report:', class_report, sep='\n')

Classification Report:
              precision    recall  f1-score   support

       False       0.71      0.84      0.77       471
        True       0.70      0.51      0.59       335

    accuracy                           0.70       806
   macro avg       0.70      0.68      0.68       806
weighted avg       0.70      0.70      0.69       806



## Support Vector Machines

In [11]:
# Support vector classifier with library-default hyperparameters
model = SVC().fit(X_train_scaled, y_train)

# Predicted labels for the test features
y_pred = model.predict(X_test_scaled)

  y = column_or_1d(y, warn=True)


In [12]:
# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.59


In [13]:
# Per-class precision / recall / F1 / support for the current predictions
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification Report:', class_report, sep='\n')

Classification Report:
              precision    recall  f1-score   support

       False       0.59      0.99      0.74       471
        True       0.73      0.02      0.05       335

    accuracy                           0.59       806
   macro avg       0.66      0.51      0.39       806
weighted avg       0.65      0.59      0.45       806



## k-Nearest Neighbors (k-NN)

In [14]:
# k-NN classifier that votes over the 5 nearest training points
model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predicted labels for the test features
y_pred = model.predict(X_test_scaled)

  return self._fit(X, y)


In [15]:
# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.66


In [16]:
# Per-class precision / recall / F1 / support for the current predictions
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification Report:', class_report, sep='\n')

Classification Report:
              precision    recall  f1-score   support

       False       0.69      0.75      0.72       471
        True       0.60      0.54      0.57       335

    accuracy                           0.66       806
   macro avg       0.65      0.64      0.64       806
weighted avg       0.65      0.66      0.66       806



## XGBoost

In [24]:
# Gradient-boosted trees: 100 boosting rounds, fixed seed
model = xgb.XGBClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

# Predicted labels for the test features
y_pred = model.predict(X_test_scaled)

In [25]:
#with open('../data/models/version13_xgboost(0.74).pkl', 'wb') as file:
#    pickle.dump(model, file)

In [19]:
# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.74


In [20]:
# Per-class precision / recall / F1 / support for the current predictions
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification Report:', class_report, sep='\n')

Classification Report:
              precision    recall  f1-score   support

       False       0.78      0.76      0.77       471
        True       0.68      0.70      0.69       335

    accuracy                           0.74       806
   macro avg       0.73      0.73      0.73       806
weighted avg       0.74      0.74      0.74       806



## Neural Networks (MLP)

In [21]:
# Multi-layer perceptron: one hidden layer of 100 units, at most 300 training
# iterations, fixed seed
model = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predicted labels for the test features
y_pred = model.predict(X_test_scaled)


  y = column_or_1d(y, warn=True)


In [22]:
# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.58


In [23]:
# Per-class precision / recall / F1 / support for the current predictions
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification Report:', class_report, sep='\n')

Classification Report:
              precision    recall  f1-score   support

       False       0.68      0.53      0.60       471
        True       0.50      0.65      0.56       335

    accuracy                           0.58       806
   macro avg       0.59      0.59      0.58       806
weighted avg       0.61      0.58      0.58       806



## Metric Evaluation

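- Imbalanced Classes: If the classes (up or down) are imbalanced, accuracy might not be a good metric. Precision, recall, and F1 score would be more informative. In the test set used above the split is 471 `False` vs. 335 `True`, so always predicting the majority class would already reach about 0.58 accuracy; the accuracies reported above should be read against that baseline.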

- Trading Strategy Impact: 

1. If false positives (predicting the stock will go up when it actually goes down) are more costly, you might prioritize precision. 

2. If false negatives (predicting the stock will go down when it actually goes up) are more costly, recall might be more important.