In [1]:
pip install yfinance numpy


Collecting yfinance
  Obtaining dependency information for yfinance from https://files.pythonhosted.org/packages/14/03/3c854ca3e02eedf614abba4b2e177c469bf3af58207fa30d5098c5d652fe/yfinance-0.2.37-py2.py3-none-any.whl.metadata
  Downloading yfinance-0.2.37-py2.py3-none-any.whl.metadata (11 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Obtaining dependency information for multitasking>=0.0.7 from https://files.pythonhosted.org/packages/3e/8a/bb3160e76e844db9e69a413f055818969c8acade64e1a9ac5ce9dfdcf6c1/multitasking-0.0.11-py3-none-any.whl.metadata
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.0.tar.gz (314 kB)
     ---------------------------------------- 0.0/314.6 kB ? eta -:--:--
     - -------------------------------------- 10.2/314.6 kB ? eta -:--:--
     ------------- ------------------------ 112.6/314.6 kB 1.3 MB/s eta 0:00:01
     --------------------- ---------------- 174.1/314.

In [2]:
import yfinance as yf

spy_data = yf.download("SPY", start="2015-01-01", end="2021-12-31")


[*********************100%%**********************]  1 of 1 completed


In [3]:
import numpy as np

target_variable = np.where(spy_data['Close'].shift(-1) > spy_data['Close'], 1, -1)


In [4]:
spy_data.isnull().sum() #checking for null values


Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [5]:
spy_data.dropna(inplace=True) #dropping the columns if there is a null value, even though the prev output told us theres no null value


In [6]:
spy_data.isnull().sum()


Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [7]:
duplicate_rows = spy_data.duplicated() #checking for duplicate rows


In [9]:
spy_data = spy_data[~duplicate_rows]


In [10]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
spy_data_scaled = scaler.fit_transform(spy_data)


In [11]:
#defining X features
X = spy_data[['Open', 'Close', 'High', 'Low', 'Volume']]

#defining y target variable
y = target_variable 

assert len(X) == len(y), "Number of samples in X and y must be equal"


In [12]:
from sklearn.model_selection import train_test_split

#Spliting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#checking the shapes of the resulting datasets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)


Training set shape: (1409, 5) (1409,)
Test set shape: (353, 5) (353,)


In [13]:
from sklearn.linear_model import LogisticRegression

#establashing the classifier
classifier = LogisticRegression()

#fitting the classifier on the training dataset
classifier.fit(X_train, y_train)


In [14]:
#evaluating the classifier on the test dataset
accuracy = classifier.score(X_test, y_test)
print("Accuracy on test set:", accuracy)


Accuracy on test set: 0.546742209631728


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [18]:
classifier1 = LogisticRegression()
classifier2 = DecisionTreeClassifier()

classifier1.fit(X_train, y_train)
classifier2.fit(X_train, y_train)

y_pred1 = classifier1.predict(X_test)
y_pred2 = classifier2.predict(X_test)



In [19]:
#calculating evaluation metrics
accuracy1 = accuracy_score(y_test, y_pred1)
accuracy2 = accuracy_score(y_test, y_pred2)

precision1 = precision_score(y_test, y_pred1)
precision2 = precision_score(y_test, y_pred2)

recall1 = recall_score(y_test, y_pred1)
recall2 = recall_score(y_test, y_pred2)

f1_score1 = f1_score(y_test, y_pred1)
f1_score2 = f1_score(y_test, y_pred2)



In [20]:
print("Classifier 1:")
print("Accuracy:", accuracy1)
print("Precision:", precision1)
print("Recall:", recall1)
print("F1-score:", f1_score1)

print("\nClassifier 2:")
print("Accuracy:", accuracy2)
print("Precision:", precision2)
print("Recall:", recall2)
print("F1-score:", f1_score2)


Classifier 1:
Accuracy: 0.546742209631728
Precision: 0.546742209631728
Recall: 1.0
F1-score: 0.706959706959707

Classifier 2:
Accuracy: 0.4730878186968839
Precision: 0.5177664974619289
Recall: 0.5284974093264249
F1-score: 0.523076923076923


### Conclusion 

In [None]:
#Classifier 1 has a higher accuracy (0.547) compared to Classifier 2 (0.473). 
#This means that Classifier 1 makes more correct predictions overall on the test dataset.

#However, Classifier 1 has a lower precision (0.547) compared to Classifier 2 (0.518). 
#Precision measures the proportion of true positive predictions among all positive predictions. 
#A higher precision indicates fewer false positives.

#Classifier 1 has a perfect recall (1.0), meaning it correctly identifies all positive samples. 
#On the other hand, Classifier 2 has a lower recall (0.528), indicating that it misses some positive samples.

#The F1-score, which considers both precision and recall, is higher for Classifier 1 (0.707) compared to Classifier 2 (0.523). 
#This suggests that Classifier 1 achieves a better balance between precision and recall.

#In conclusion, while Classifier 1 has a higher accuracy and F1-score, indicating overall better performance,
#it also has a lower precision compared to Classifier 2. 