<a href="https://colab.research.google.com/github/jjustinsong/stock-predictor/blob/main/random_forest_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import sklearn
print(sklearn.__version__)

1.3.2


In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

In [2]:
!pip install pandas_ta

Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pandas_ta
  Building wheel for pandas_ta (setup.py) ... [?25l[?25hdone
  Created wheel for pandas_ta: filename=pandas_ta-0.3.14b0-py3-none-any.whl size=218909 sha256=ed537b7e97cc4aabdcea7756a3aa73ddfbbef105068a8b522c2c2a5fa56837e5
  Stored in directory: /root/.cache/pip/wheels/69/00/ac/f7fa862c34b0e2ef320175100c233377b4c558944f12474cf0
Successfully built pandas_ta
Installing collected packages: pandas_ta
Successfully installed pandas_ta-0.3.14b0


In [3]:
import yfinance as yf
import requests
from bs4 import BeautifulSoup
import argparse
import datetime
import pandas_ta as ta

In [4]:
def fetch_news(ticker):
    stock = yf.Ticker(ticker)
    news = stock.news
    df = pd.DataFrame(news)
    df['datetime'] = pd.to_datetime(df['providerPublishTime'], unit='s')
    df = df.drop(columns=['uuid', 'publisher', 'link', 'type', 'thumbnail', 'relatedTickers', 'providerPublishTime'])
    return df

def fetch_indicators(ticker):
    stock = yf.Ticker(ticker)
    hist = stock.history(period="1mo", interval='5m')
    hist['RSI'] = ta.rsi(hist['Close'], length=14)

    # Using pandas_ta to calculate MACD
    macd = hist.ta.macd(close='Close', fast=12, slow=26, signal=9)
    hist = pd.concat([hist, macd], axis=1)

    hist = hist.reset_index()
    hist.rename(columns={'Datetime': 'datetime'}, inplace=True)
    return hist[['datetime', 'Close', 'RSI', 'MACD_12_26_9', 'MACDs_12_26_9', 'MACDh_12_26_9']]

def combine(ticker):
    news = fetch_news(ticker)
    indicators = fetch_indicators(ticker)

    news['datetime'] = news['datetime'].dt.tz_localize(None)
    indicators['datetime'] = indicators['datetime'].dt.tz_localize(None)

    combined_df = pd.merge_asof(news.sort_values('datetime'), indicators.sort_values('datetime'), on='datetime', direction='backward')
    return combined_df

In [5]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import yfinance as yf
import pandas_ta as ta

def fetch_indicators(ticker):
    stock = yf.Ticker(ticker)
    hist = stock.history(period="1mo", interval='5m')
    hist['RSI'] = ta.rsi(hist['Close'], length=14)

    # Using pandas_ta to calculate MACD
    macd = hist.ta.macd(close='Close', fast=12, slow=26, signal=9)
    hist = pd.concat([hist, macd], axis=1)

    hist = hist.reset_index()
    hist.rename(columns={'Datetime': 'datetime'}, inplace=True)

    # Calculate the label 'y' as 0 (decrease), 1 (increase), or 2 (neutral)
    hist['y'] = hist['Close'].diff(-1).apply(lambda x: 0 if x > 0 else (1 if x < 0 else 2))

    # Drop the last row as it has a NaN value for 'y'
    hist = hist.dropna()

    return hist[['datetime', 'Close', 'RSI', 'MACD_12_26_9', 'MACDs_12_26_9', 'MACDh_12_26_9', 'y']]

def fetch_news_api(ticker, api_key):
    # Ensure the date is formatted correctly
    date_str = (datetime.now() - timedelta(days=14)).strftime('%Y-%m-%d')
    url_base = f'https://newsapi.org/v2/everything?q={ticker}&from={date_str}&sortBy=publishedAt&language=en&apiKey={api_key}'

    news_data = []
    page = 1
    total_pages = 1  # Assume there's at least one page

    while page <= total_pages:
        url = f"{url_base}&page={page}"
        response = requests.get(url)
        response_json = response.json()

        # Check if this is the first request to determine the total pages available
        if page == 1:
            total_results = response_json.get('totalResults', 0)
            total_pages = (total_results // 20) + 1  # Assuming default pageSize is 20, adjust as per actual pageSize

        articles = response_json.get('articles', [])

        for article in articles:
            # Filter out headlines with non-ASCII characters
            if all(ord(char) < 128 for char in article['title']):
                news_data.append({
                    'datetime': pd.to_datetime(article['publishedAt']),
                    'headline': article['title']
                })

        page += 1  # Increment to fetch the next page

    df = pd.DataFrame(news_data)
    return df

def combine_with_api_news(ticker, api_key):
    api_news = fetch_news_api(ticker, api_key)  # Fetch news using the API
    indicators = fetch_indicators(ticker) # Fetch indicators using the function defined above

    api_news['datetime'] = api_news['datetime'].dt.tz_localize(None)
    indicators['datetime'] = indicators['datetime'].dt.tz_localize(None)

    combined_df = pd.merge_asof(api_news.sort_values('datetime'), indicators.sort_values('datetime'), on='datetime', direction='backward')
    return combined_df

# Example stock ticker
ticker = "AAPL"
api_key = "2021f3285fde4a68bdc8c53d9a103e00"

In [33]:
combine = combine_with_api_news(ticker, api_key)
combine

Unnamed: 0,datetime,headline,Close,RSI,MACD_12_26_9,MACDs_12_26_9,MACDh_12_26_9,y
0,2024-08-02 03:18:47,Apple Inc. (NASDAQ:AAPL) Shares Sold by Capita...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
1,2024-08-02 03:18:47,Apple Inc. (NASDAQ:AAPL) Shares Sold by Eos Ma...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
2,2024-08-02 03:18:47,Investments & Financial Planning LLC Increases...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
3,2024-08-02 03:18:48,Apple Inc. (NASDAQ:AAPL) Shares Sold by MontVu...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
4,2024-08-02 03:18:49,Apple Inc. (NASDAQ:AAPL) Shares Bought by Paul...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
...,...,...,...,...,...,...,...,...
340,2024-08-12 14:05:20,Dow Jones Falls 200 Points As Inflation Report...,216.619995,36.015020,-0.309908,-0.233355,-0.076553,1
341,2024-08-12 14:23:26,AI Could Help the iPhone 16 Drive an Apple Ren...,216.615005,37.877469,-0.303865,-0.265764,-0.038101,1
342,2024-08-12 16:09:00,"Big Tech earnings proved the AI hype is real, ...",217.550003,60.537040,-0.043584,-0.082543,0.038959,1
343,2024-08-12 18:50:00,"I inherited $50,000. Is there a single stock I...",217.550003,60.537040,-0.043584,-0.082543,0.038959,1


In [7]:
import torch
import torch.nn as nn

class SentimentAnalysisBidirectionalLSTMTemperature(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, n_layers, dropout, pretrained_embedding, init_temp=1.0):
        super(SentimentAnalysisBidirectionalLSTMTemperature, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, 3)

        self.temperature = nn.Parameter(torch.ones(1) * init_temp)

    def forward(self, x, hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        lstm_out = lstm_out[:, -1, :]
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.temperature_scale(out)

        return out, hidden

    def init_hidden(self, batch_size, device):
        h0 = torch.zeros((self.n_layers * 2, batch_size, self.hidden_dim)).to(device)
        c0 = torch.zeros((self.n_layers * 2, batch_size, self.hidden_dim)).to(device)
        hidden = (h0, c0)
        return hidden

    def temperature_scale(self, logits):
        temperature = self.temperature.unsqueeze(1).expand(logits.size(0), logits.size(1))
        return logits / temperature


In [8]:
import re
import numpy as np
import torchtext.vocab as vocab

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
glove = vocab.GloVe(name='6B', dim=100)

embedding_matrix = torch.load('glove_embeddings.pt')

sentiment_analyzer = SentimentAnalysisBidirectionalLSTMTemperature(
    embedding_dim=100,
    hidden_dim=256,
    n_layers=2,
    dropout=0.5,
    pretrained_embedding=embedding_matrix,
    init_temp=7.0
)

sentiment_analyzer.to(device)
sentiment_analyzer.load_state_dict(torch.load('combined_model_weights.pth', map_location=device))
sentiment_analyzer.eval()


def predict_text(text, model, max_length):
    def preprocess(s):
        # Remove all non-word characters (everything except numbers and letters)
        s = re.sub(r"[^\w\s]", '', s)
        # Replace all runs of whitespaces with one space
        s = re.sub(r"\s+", ' ', s)
        # replace digits with no space
        s = re.sub(r"\d", '', s)

        return s
    words = [preprocess(word) for word in text.lower().split()[:max_length]]
    word_indices = [glove.stoi[word] if word in glove.stoi else 0 for word in words]

    if len(word_indices) < max_length:
        word_indices.extend([0] * (max_length - len(word_indices)))

    inputs = torch.tensor(word_indices).unsqueeze(0).to(device)

    batch_size = inputs.size(0)
    h = model.init_hidden(batch_size, device)
    h = tuple([each.data for each in h])

    model.eval()
    with torch.no_grad():
        output, h = model(inputs, h)
        prediction = torch.softmax(output, dim=1).cpu().numpy()

    label_mapping = {2: 'Positive', 1: 'Neutral', 0: 'Negative'}
    predicted_class = np.argmax(prediction)
    predicted_probabilities = prediction[0][np.argmax(prediction)]

    return predicted_class, predicted_probabilities

.vector_cache/glove.6B.zip: 862MB [02:38, 5.42MB/s]                           
100%|█████████▉| 399999/400000 [00:19<00:00, 20885.38it/s]


In [34]:
data = combine.copy()

In [35]:
data

Unnamed: 0,datetime,headline,Close,RSI,MACD_12_26_9,MACDs_12_26_9,MACDh_12_26_9,y
0,2024-08-02 03:18:47,Apple Inc. (NASDAQ:AAPL) Shares Sold by Capita...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
1,2024-08-02 03:18:47,Apple Inc. (NASDAQ:AAPL) Shares Sold by Eos Ma...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
2,2024-08-02 03:18:47,Investments & Financial Planning LLC Increases...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
3,2024-08-02 03:18:48,Apple Inc. (NASDAQ:AAPL) Shares Sold by MontVu...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
4,2024-08-02 03:18:49,Apple Inc. (NASDAQ:AAPL) Shares Bought by Paul...,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
...,...,...,...,...,...,...,...,...
340,2024-08-12 14:05:20,Dow Jones Falls 200 Points As Inflation Report...,216.619995,36.015020,-0.309908,-0.233355,-0.076553,1
341,2024-08-12 14:23:26,AI Could Help the iPhone 16 Drive an Apple Ren...,216.615005,37.877469,-0.303865,-0.265764,-0.038101,1
342,2024-08-12 16:09:00,"Big Tech earnings proved the AI hype is real, ...",217.550003,60.537040,-0.043584,-0.082543,0.038959,1
343,2024-08-12 18:50:00,"I inherited $50,000. Is there a single stock I...",217.550003,60.537040,-0.043584,-0.082543,0.038959,1


In [36]:
label_mapping = {'Positive': 2, 'Neutral': 1, 'Negative': 0}

In [37]:
data['headline'] = data['headline'].astype(str).apply(predict_text, args=(sentiment_analyzer, 15))
data['headline'] = data['headline'].apply(lambda x: x[0])
data['headline'] = data['headline'].map(label_mapping)
data

Unnamed: 0,datetime,headline,Close,RSI,MACD_12_26_9,MACDs_12_26_9,MACDh_12_26_9,y
0,2024-08-02 03:18:47,0,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
1,2024-08-02 03:18:47,0,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
2,2024-08-02 03:18:47,2,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
3,2024-08-02 03:18:48,0,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
4,2024-08-02 03:18:49,1,218.279999,52.913267,-0.034663,-0.143003,0.108340,1
...,...,...,...,...,...,...,...,...
340,2024-08-12 14:05:20,0,216.619995,36.015020,-0.309908,-0.233355,-0.076553,1
341,2024-08-12 14:23:26,2,216.615005,37.877469,-0.303865,-0.265764,-0.038101,1
342,2024-08-12 16:09:00,2,217.550003,60.537040,-0.043584,-0.082543,0.038959,1
343,2024-08-12 18:50:00,1,217.550003,60.537040,-0.043584,-0.082543,0.038959,1


In [39]:
X = data.drop(['y', 'datetime'], axis=1)
y = data['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
y_train

Unnamed: 0,y
66,0
229,1
7,1
140,0
324,1
...,...
188,0
71,0
106,0
270,1


In [41]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [43]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.927536231884058
              precision    recall  f1-score   support

           0       0.95      0.92      0.94        39
           1       0.90      0.93      0.92        30

    accuracy                           0.93        69
   macro avg       0.93      0.93      0.93        69
weighted avg       0.93      0.93      0.93        69



In [44]:
import pickle
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf, f)