<a href="https://colab.research.google.com/github/jmohsbeck1/jpmc_mle/blob/Mar.-29/Stock_Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# John Mohsbeck
# Stock Predictor for JPMC stock ticker: JPM
# Source Dr. Lee stock predictor

# Import libraries
import yfinance as yf
import pandas as pd

In [41]:
# Fetch JPM stock data function

def fetch_stock_data(tickers=["JPM"], start="2000-01-01", end="2021-12-31"):
    """
    Fetches stock data for the specified tickers and time period using the yfinance library.

    Each ticker is downloaded separately with ``yf.download``, tagged with a
    "Ticker" column, and the per-ticker frames are stacked into one DataFrame
    whose columns are in alphabetical order.

    Parameters:
    tickers (list or str): Stock tickers to download (default is ["JPM"]).
        A single ticker may also be passed as a plain string.
    start (str): The start date for fetching data in the format YYYY-MM-DD (default is "2000-01-01").
    end (str): The end date for fetching data in the format YYYY-MM-DD (default is "2021-12-31").

    Returns:
    data (pd.DataFrame): The stacked stock data with a fresh 0..n-1 index; the
        index returned by the download is moved into a regular column.
        An empty ticker list yields a DataFrame with no rows.
    """

    # A bare string would otherwise be iterated one character at a time.
    if isinstance(tickers, str):
        tickers = [tickers]

    # Download every ticker first and combine once at the end.
    frames = []
    for ticker in tickers:
        stock_data = yf.download(ticker, start=start, end=end)
        stock_data["Ticker"] = ticker
        frames.append(stock_data)

    if frames:
        # DataFrame.append is deprecated (and removed in pandas 2.0), so use
        # pd.concat. Sorting the columns afterwards keeps the alphabetical
        # column order the append-based version produced.
        data = pd.concat(frames).sort_index(axis=1)
    else:
        # pd.concat raises on an empty list; return an empty frame instead.
        data = pd.DataFrame()

    # Reset the index and return the final DataFrame
    return data.reset_index()

Fetching data and discovering insights

In [50]:
# Download the JPM price history used throughout the notebook and preview it.
stock_data = fetch_stock_data(
    tickers=["JPM"],
    start="2010-01-01",
    end="2023-3-27",
)
print(stock_data.head())

[*********************100%***********************]  1 of 1 completed
        Date  Adj Close      Close       High        Low       Open Ticker  \
0 2010-01-04  30.517252  42.849998  42.990002  41.669998  41.790001    JPM   
1 2010-01-05  31.108358  43.680000  43.840000  42.779999  42.790001    JPM   
2 2010-01-06  31.279287  43.919998  44.090000  43.310001  43.450001    JPM   
3 2010-01-07  31.898897  44.790001  45.119999  43.610001  43.790001    JPM   
4 2010-01-08  31.820559  44.680000  44.700001  44.080002  44.369999    JPM   

     Volume  
0  35460500  
1  41208300  
2  27729000  
3  44864700  
4  33110100  


  data = data.append(stock_data, sort=True)


Show the row where JPM's closing price is at its minimum: 28.38

In [51]:
# Print the full row for the day with the lowest closing price.
print(
    stock_data.loc[stock_data["Close"].idxmin()]
)

Date         2011-11-23 00:00:00
Adj Close              20.728109
Close                  28.379999
High                       29.15
Low                        28.33
Open                   29.110001
Ticker                       JPM
Volume                  44468500
Name: 478, dtype: object


Show the row where JPM's closing price is at its maximum: 171.78

In [52]:
# Print the full row for the day with the highest closing price.
print(
    stock_data.loc[stock_data["Close"].idxmax()]
)

Date         2021-10-22 00:00:00
Adj Close             165.302963
Close                 171.779999
High                  172.089996
Low                   169.699997
Open                  170.029999
Ticker                       JPM
Volume                   8817900
Name: 2972, dtype: object


Adding a Target Column: Decoding the Market’s Swings

In [53]:
def add_target_column(data, target_col="Target"):
    """
    Adds a target column labelling each row by the direction of its closing price
    relative to the previous row's closing price.

    A row is labelled "UP" when its "Close" is strictly greater than the "Close"
    of the row before it, and "DOWN" otherwise (so an unchanged close is "DOWN").
    The first row has no previous row to compare against and is dropped.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Note: the label is computed from the same row's "Close", so it describes a
    move that is already contained in that row's own price columns.

    Parameters:
    data (pd.DataFrame): The input DataFrame containing stock data; must have a "Close" column.
    target_col (str): The name of the target column to be added (default is "Target").

    Returns:
    data (pd.DataFrame): A copy of the input, minus its first row, with the new target column added.
    """

    # Difference between each close and the close on the preceding row
    # (NaN for the first row, which has nothing before it).
    price_diff = data["Close"].diff()

    # NaN > 0 evaluates to False, so the first row is temporarily "DOWN";
    # it is removed below.
    labels = price_diff.gt(0).map({True: "UP", False: "DOWN"})

    # Work on a copy so the caller's DataFrame does not gain extra columns.
    labelled = data.copy()
    labelled[target_col] = labels

    # Drop the first row by position rather than by the label 0, so frames
    # whose index does not start at 0 are handled as well.
    return labelled.iloc[1:]

In [54]:
# Label every row UP/DOWN against the previous close, then preview the result.
stock_data_with_target = add_target_column(data=stock_data)
print(stock_data_with_target.head())

        Date  Adj Close      Close       High        Low       Open Ticker  \
1 2010-01-05  31.108358  43.680000  43.840000  42.779999  42.790001    JPM   
2 2010-01-06  31.279287  43.919998  44.090000  43.310001  43.450001    JPM   
3 2010-01-07  31.898897  44.790001  45.119999  43.610001  43.790001    JPM   
4 2010-01-08  31.820559  44.680000  44.700001  44.080002  44.369999    JPM   
5 2010-01-11  31.713709  44.529999  45.189999  44.310001  45.119999    JPM   

     Volume Target  
1  41208300     UP  
2  27729000     UP  
3  44864700     UP  
4  33110100   DOWN  
5  31878700   DOWN  


In [55]:
# Rich (tabular) display of the first five labelled rows.
stock_data_with_target.head(n=5)

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Ticker,Volume,Target
1,2010-01-05,31.108358,43.68,43.84,42.779999,42.790001,JPM,41208300,UP
2,2010-01-06,31.279287,43.919998,44.09,43.310001,43.450001,JPM,27729000,UP
3,2010-01-07,31.898897,44.790001,45.119999,43.610001,43.790001,JPM,44864700,UP
4,2010-01-08,31.820559,44.68,44.700001,44.080002,44.369999,JPM,33110100,DOWN
5,2010-01-11,31.713709,44.529999,45.189999,44.310001,45.119999,JPM,31878700,DOWN


In [56]:
# Number of UP days versus DOWN days.
stock_data_with_target["Target"].value_counts()

UP      1682
DOWN    1646
Name: Target, dtype: int64

JPM stock UP: 50.5%

JPM stock DOWN: 49.5%

In [57]:
# Share of UP days versus DOWN days. `normalize` is a boolean flag, so pass it
# explicitly by keyword rather than relying on a truthy string.
stock_data_with_target["Target"].value_counts(normalize=True)

UP      0.505409
DOWN    0.494591
Name: Target, dtype: float64

Encoding and Preprocessing: Gearing Up for Machine Learning

In [58]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [59]:
def encode_and_preprocess(data, categorical_cols=None, numerical_cols=None, target_col="Target"):
    """
    Returns a preprocessed copy of the input DataFrame.

    Categorical columns are replaced by integer codes from a LabelEncoder,
    numerical columns are replaced by the output of a StandardScaler fitted on
    all rows of the frame, and the target column is moved to the last position.
    The DataFrame passed in is not modified.

    Parameters:
    data (pd.DataFrame): The input DataFrame containing stock data.
    categorical_cols (list): Names of the categorical columns to encode (default is None, meaning none).
    numerical_cols (list): Names of the numerical columns to standardize (default is None, meaning none).
    target_col (str): The name of the target column (default is "Target").

    Returns:
    data (pd.DataFrame): The preprocessed DataFrame, with the target column last.
    """

    # Operate on a private copy so the caller's frame stays as it was
    processed = data.copy()

    # Integer-encode the categorical columns; the encoder is re-fitted for each column
    if categorical_cols:
        encoder = LabelEncoder()
        for name in categorical_cols:
            processed[name] = encoder.fit_transform(processed[name])

    # Standardize the numerical columns together in a single fit
    if numerical_cols:
        scaled_values = StandardScaler().fit_transform(processed[numerical_cols])
        processed[numerical_cols] = scaled_values

    # Put every non-target column first, in its existing order, then the target
    ordered_columns = [name for name in processed.columns if name != target_col]
    ordered_columns.append(target_col)

    return processed[ordered_columns]

In [60]:
# Encode the ticker symbol and standardize the price/volume columns, then preview.
preprocessed_data = encode_and_preprocess(
    stock_data_with_target,
    categorical_cols=["Ticker"],
    numerical_cols=["Open", "High", "Low", "Close", "Adj Close", "Volume"],
)
print(preprocessed_data.head())

        Date  Adj Close     Close      High       Low      Open  Ticker  \
1 2010-01-05  -1.045125 -1.032403 -1.039541 -1.044060 -1.055611       0   
2 2010-01-06  -1.040806 -1.026088 -1.033023 -1.029988 -1.038248       0   
3 2010-01-07  -1.025149 -1.003196 -1.006173 -1.022023 -1.029303       0   
4 2010-01-08  -1.027128 -1.006090 -1.017121 -1.009544 -1.014045       0   
5 2010-01-11  -1.029828 -1.010037 -1.004348 -1.003438 -0.994314       0   

     Volume Target  
1  1.479616     UP  
2  0.512753     UP  
3  1.741887     UP  
4  0.898736   DOWN  
5  0.810408   DOWN  


Algorithm Harness: evaluate_classifiers


In [61]:
import time
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [62]:
from sklearn.preprocessing import LabelEncoder

def evaluate_classifiers(data, target_col="Target", test_size=0.2, random_state=42, exclude_columns=None, shuffle=True):
    """
    Trains and evaluates various classifiers on the input DataFrame, returning performance metrics and training time.

    Every column other than the target and the excluded columns is used as a
    feature. The target is label-encoded to integers, the data is split once
    into train and test sets, and each classifier is fitted on the train set
    and scored on the test set.

    Precision, recall and F1 are computed for the class encoded as 1. ROC AUC
    uses the second column of ``predict_proba``, so a two-class target is assumed.

    Parameters:
    data (pd.DataFrame): The preprocessed DataFrame containing stock data.
    target_col (str): The name of the target column (default is "Target").
    test_size (float): The proportion of the dataset to include in the test split (default is 0.2).
    random_state (int): A random seed for reproducibility (default is 42).
    exclude_columns (list): Column names to be excluded from the features (default is None).
        Any iterable of names, or a single name as a string, is accepted.
    shuffle (bool): Whether rows are shuffled before splitting (default is True, the
        train_test_split default). Pass False to keep row order for the split.
        NOTE(review): for time-ordered price data a shuffled split mixes later
        rows into the training set; consider shuffle=False.

    Returns:
    performance_table (pd.DataFrame): One row per classifier with its name, training time in seconds,
        and performance metrics, sorted by ROC AUC in descending order.
    """

    # Build the set of non-feature columns. Using a set lets exclude_columns be
    # a list, tuple or any other iterable; a lone string is treated as one name.
    if exclude_columns is None:
        excluded = set()
    elif isinstance(exclude_columns, str):
        excluded = {exclude_columns}
    else:
        excluded = set(exclude_columns)
    excluded.add(target_col)
    feature_columns = [col for col in data.columns if col not in excluded]

    X = data[feature_columns]

    # Encode the target before splitting so the encoder has seen every label
    # that can appear in either split.
    le = LabelEncoder()
    y = le.fit_transform(data[target_col])

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle
    )

    # Define the classifiers to be evaluated
    classifiers = [
        ("Logistic Regression", LogisticRegression(random_state=random_state)),
        ("Decision Tree", DecisionTreeClassifier(random_state=random_state)),
        ("Random Forest", RandomForestClassifier(random_state=random_state)),
        ("Support Vector Machine", SVC(random_state=random_state, probability=True)),
        ("K-Nearest Neighbors", KNeighborsClassifier()),
        ("XGBoost", XGBClassifier(random_state=random_state)),
        ("LightGBM", LGBMClassifier(random_state=random_state)),
    ]

    # Evaluate each classifier and store the results in a list
    results = []
    for name, classifier in classifiers:
        # perf_counter is a monotonic, high-resolution clock meant for timing;
        # only the fit call is timed.
        start_time = time.perf_counter()
        classifier.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        # Compute the performance metrics on the held-out test set
        y_pred = classifier.predict(X_test)
        y_proba = classifier.predict_proba(X_test)[:, 1]
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba)

        results.append([name, training_time, accuracy, precision, recall, f1, roc_auc])

    # Create a DataFrame from the results and sort it by the best AUC/ROC score
    performance_table = pd.DataFrame(
        results,
        columns=["Classifier", "Training Time", "Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC"],
    )
    performance_table = performance_table.sort_values(by="ROC AUC", ascending=False).reset_index(drop=True)

    return performance_table


In [64]:
# Score every classifier on the preprocessed data; the Date column is left out of the features.
performance_table = evaluate_classifiers(
    preprocessed_data,
    exclude_columns=["Date"],
)
performance_table

Unnamed: 0,Classifier,Training Time,Accuracy,Precision,Recall,F1 Score,ROC AUC
0,Logistic Regression,0.019745,0.761261,0.725248,0.859238,0.786577,0.855556
1,XGBoost,0.386345,0.693694,0.694051,0.718475,0.706052,0.781719
2,LightGBM,0.117631,0.6997,0.70317,0.715543,0.709302,0.770589
3,Random Forest,1.017904,0.647147,0.652299,0.665689,0.658926,0.720388
4,Decision Tree,0.058506,0.648649,0.655072,0.662757,0.658892,0.648301
5,Support Vector Machine,2.941879,0.56006,0.5553,0.706745,0.621935,0.565757
6,K-Nearest Neighbors,0.004466,0.531532,0.543544,0.530792,0.537092,0.55473
