In [9]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from typing import List
from sklearn.model_selection import train_test_split
import numpy as np

# If you haven't already, you'll need to download
# these resources.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/levyvankempen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/levyvankempen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
skip_model = Word2Vec.load("../models/skipgram_model")

In [4]:
prediction_df = pd.read_csv('../data/prediction_data_aapl.csv')

In [10]:
#preprocess dataframe such that the text of column "content" is a list of strings instead of normal letters.

def preprocess_dataframe_content(df: pd.DataFrame) -> List[List[str]]:
    """
    Preprocesses each entry in the 'content' column of the given DataFrame by:
    - Lowercasing
    - Keeping only alphabetic characters
    - Removing stopwords
    - Lemmatizing
    - Filtering out words with length less than 3
    Tokenizes the preprocessed entry into a list of words.
    Returns a list of lists where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.

    Returns:
    - list: A list of lists, where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.
    """
    # Prepare lemmatizer and stopwords list
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Preprocess the content
    processed_content = []
    for content in df['content']:
        # Keep only alphabetic characters and lowercased
        tokens = re.sub('[^a-zA-Z\s]', '', content.lower().strip()).split()
        # Remove stopwords and short words, and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) >= 3]
        processed_content.append(tokens)

    return processed_content


In [11]:
#preprocess the prediction data
prep_pred_data = preprocess_dataframe_content(prediction_df)

In [12]:
#split training and testing data
train_data, test_data = train_test_split(prep_pred_data, test_size=0.2, random_state=42)

In [13]:
document_vectors = []

# Iterate through each document in your dataset
for document in prep_pred_data:
    # Initialize an empty vector for the document
    doc_vector = np.zeros(skip_model.vector_size)
    num_words = 0
    for word in document:
        if word in skip_model.wv:
            doc_vector += skip_model.wv[word]
            num_words += 1
    if num_words > 0:
        doc_vector /= num_words  # Take the average of word vectors in the document
    document_vectors.append(doc_vector)

In [17]:
document_vectors_dataframe = pd.DataFrame(document_vectors)

In [18]:
document_vectors_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.219284,0.258454,-0.008832,0.238671,0.114531,-0.436346,0.255302,0.519598,-0.398468,-0.242278,...,0.173818,0.032343,-0.032013,0.124080,0.351752,0.186652,0.265988,-0.098836,-0.033528,-0.049972
1,-0.170654,0.328163,-0.072513,0.078444,0.158198,-0.460393,0.215663,0.603253,-0.292452,-0.217356,...,0.176265,0.068080,-0.082649,0.051679,0.269260,0.038175,0.310259,-0.109412,-0.085126,-0.128862
2,-0.260273,0.221014,0.049317,0.140053,0.130579,-0.379285,0.205510,0.604875,-0.148124,-0.185212,...,0.236150,0.172791,0.043126,0.079747,0.350922,0.087955,0.199898,-0.060968,-0.070270,-0.179191
3,-0.224536,0.190621,-0.055414,0.174733,0.137548,-0.390338,0.212342,0.610712,-0.231648,-0.167329,...,0.168009,0.114189,-0.021752,-0.006575,0.361853,0.123401,0.229562,-0.063109,-0.043319,-0.066563
4,-0.222508,0.217237,-0.038972,0.191601,0.107342,-0.414207,0.234992,0.570198,-0.258476,-0.192762,...,0.195316,0.068823,-0.023486,0.035196,0.389060,0.106186,0.209795,-0.028456,-0.015501,-0.065398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18067,-0.308249,0.206434,-0.027573,0.162114,0.101535,-0.354843,0.104915,0.575371,-0.298096,-0.124537,...,0.346540,0.154407,0.014932,0.074463,0.347513,0.208616,0.168755,-0.065004,-0.051067,-0.126014
18068,-0.268064,0.172437,0.024467,0.118344,0.107451,-0.374387,0.146095,0.537757,-0.258261,-0.179940,...,0.281491,0.190843,0.017922,0.158135,0.351252,0.140699,0.176476,-0.080134,-0.053800,-0.121402
18069,-0.263031,0.203723,0.040793,0.165175,0.094010,-0.385438,0.161673,0.553989,-0.247641,-0.088284,...,0.247675,0.191282,0.047966,0.098987,0.344716,0.146527,0.224284,0.003793,-0.143006,-0.180063
18070,-0.359413,0.129657,0.160584,0.211500,0.128659,-0.513409,0.244522,0.515716,-0.260017,-0.261848,...,0.202842,0.078281,-0.066273,0.236228,0.373703,0.064427,0.214603,0.014857,-0.024403,-0.019017


In [19]:
prediction_df

Unnamed: 0.1,Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Date,stock_increase
0,49181,270698,AAPL,JPMorgan cautious ahead of Apple earnings,news,jpmorgan lift apple aapl target ahead tomorrow...,2020-01-28,Seeking Alpha,https://invst.ly/pnjv8,2068762,2020-01-28,1.0
1,49182,270699,AAPL,FAANG s Fall but Get Some Wall Street Love,news,kim khan investing com faang stock predictably...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068765,2020-01-28,1.0
2,49183,270700,AAPL,Wall Street tumbles as virus fuels economic worry,news,chuck mikolajczak new york reuters stock suffe...,2020-01-28,Reuters,https://www.investing.com/news/stock-market-ne...,2068311,2020-01-28,1.0
3,49184,270701,AAPL,Earnings Watch Apple and AMD to take earnings...,news,two best performing tech stock set report resu...,2020-01-28,MarketWatch,https://invst.ly/pnlbs,2068906,2020-01-28,1.0
4,49185,270702,AAPL,Day Ahead Top 3 Things to Watch for Jan 28,news,yasin ebrahim kim khan apple ready earnings in...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068907,2020-01-28,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
18067,69407,290924,AAPL,Waiting For Direction On The Markets,opinion,stock market difficult one trader investor ali...,2012-07-16,Cam Hui,https://www.investing.com/analysis/waiting-for...,129680,2012-07-16,1.0
18068,69408,290925,AAPL,Mid Year Update U S And Canadian Stock Marke...,opinion,tsx index leading canadian stock outperformed ...,2012-07-19,Baskin Financial Blog,https://www.investing.com/analysis/mid-year-up...,130056,2012-07-19,1.0
18069,69409,290926,AAPL,Summer Heat Scorches Europe And U S,opinion,europe flare summer heat continues summer heat...,2012-07-23,John Nyaradi,https://www.investing.com/analysis/summer-heat...,130439,2012-07-23,1.0
18070,69410,290927,AAPL,Apple Earnings Preview Quarterly Dip On Deck,opinion,last quarter apple aapl reported best quarter ...,2012-07-23,David Dyer,https://www.investing.com/analysis/apple-earni...,130458,2012-07-23,1.0


In [20]:
# Keep only the 'stock_increase' column from prediction_df
prediction_df = prediction_df[['stock_increase']]

# Merge the two DataFrames side by side
merged_df = pd.concat([prediction_df, document_vectors_dataframe], axis=1)

In [21]:
merged_df

Unnamed: 0,stock_increase,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,1.0,-0.219284,0.258454,-0.008832,0.238671,0.114531,-0.436346,0.255302,0.519598,-0.398468,...,0.173818,0.032343,-0.032013,0.124080,0.351752,0.186652,0.265988,-0.098836,-0.033528,-0.049972
1,1.0,-0.170654,0.328163,-0.072513,0.078444,0.158198,-0.460393,0.215663,0.603253,-0.292452,...,0.176265,0.068080,-0.082649,0.051679,0.269260,0.038175,0.310259,-0.109412,-0.085126,-0.128862
2,1.0,-0.260273,0.221014,0.049317,0.140053,0.130579,-0.379285,0.205510,0.604875,-0.148124,...,0.236150,0.172791,0.043126,0.079747,0.350922,0.087955,0.199898,-0.060968,-0.070270,-0.179191
3,1.0,-0.224536,0.190621,-0.055414,0.174733,0.137548,-0.390338,0.212342,0.610712,-0.231648,...,0.168009,0.114189,-0.021752,-0.006575,0.361853,0.123401,0.229562,-0.063109,-0.043319,-0.066563
4,1.0,-0.222508,0.217237,-0.038972,0.191601,0.107342,-0.414207,0.234992,0.570198,-0.258476,...,0.195316,0.068823,-0.023486,0.035196,0.389060,0.106186,0.209795,-0.028456,-0.015501,-0.065398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18067,1.0,-0.308249,0.206434,-0.027573,0.162114,0.101535,-0.354843,0.104915,0.575371,-0.298096,...,0.346540,0.154407,0.014932,0.074463,0.347513,0.208616,0.168755,-0.065004,-0.051067,-0.126014
18068,1.0,-0.268064,0.172437,0.024467,0.118344,0.107451,-0.374387,0.146095,0.537757,-0.258261,...,0.281491,0.190843,0.017922,0.158135,0.351252,0.140699,0.176476,-0.080134,-0.053800,-0.121402
18069,1.0,-0.263031,0.203723,0.040793,0.165175,0.094010,-0.385438,0.161673,0.553989,-0.247641,...,0.247675,0.191282,0.047966,0.098987,0.344716,0.146527,0.224284,0.003793,-0.143006,-0.180063
18070,1.0,-0.359413,0.129657,0.160584,0.211500,0.128659,-0.513409,0.244522,0.515716,-0.260017,...,0.202842,0.078281,-0.066273,0.236228,0.373703,0.064427,0.214603,0.014857,-0.024403,-0.019017


In [22]:
from sklearn.model_selection import train_test_split

# Features are all columns except 'stock_increase', and target is 'stock_increase'
X = merged_df.drop(columns=['stock_increase'])
y = merged_df['stock_increase']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report


In [25]:
clf = GaussianNB()
clf.fit(X_train, y_train)


In [26]:
y_pred = clf.predict(X_test)


In [27]:
# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print classification report
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 50.71%
              precision    recall  f1-score   support

         0.0       0.48      0.58      0.52      1705
         1.0       0.54      0.45      0.49      1910

    accuracy                           0.51      3615
   macro avg       0.51      0.51      0.51      3615
weighted avg       0.51      0.51      0.51      3615



In [28]:
merged_df["stock_increase"].value_counts()

1.0    9645
0.0    8427
Name: stock_increase, dtype: int64

In [29]:
y_train.value_counts()

1.0    7735
0.0    6722
Name: stock_increase, dtype: int64

In [30]:
y_test.value_counts()

1.0    1910
0.0    1705
Name: stock_increase, dtype: int64

In [31]:
from sklearn.model_selection import cross_val_score

gnb = GaussianNB()
scores = cross_val_score(gnb, X_train, y_train, cv=5, scoring='accuracy')

print(f"Cross-validated scores: {scores}")
print(f"Mean CV Accuracy: {scores.mean()} +/- {scores.std()}")

# Train on the full training set and test
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Cross-validated scores: [0.50760719 0.53596127 0.50778277 0.50397786 0.50224836]
Mean CV Accuracy: 0.511515491631634 +/- 0.012405642469305924
0.5070539419087137
              precision    recall  f1-score   support

         0.0       0.48      0.58      0.52      1705
         1.0       0.54      0.45      0.49      1910

    accuracy                           0.51      3615
   macro avg       0.51      0.51      0.51      3615
weighted avg       0.51      0.51      0.51      3615



In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# Splitting the data
X = merged_df.drop('stock_increase', axis=1)
y = merged_df['stock_increase']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest with Cross-Validation
rf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')

print(f"Cross-validated scores: {scores}")
print(f"Mean CV Accuracy: {scores.mean()} +/- {scores.std()}")

# Train on the full training set and test
rf.fit(X_train, y_train)
y_pred


Cross-validated scores: [0.52835408 0.5335408  0.5499827  0.53095815 0.53303355]
Mean CV Accuracy: 0.5351738571509903 +/- 0.00762668111301023


array([1., 0., 1., ..., 0., 0., 1.])

In [33]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.5070539419087137
              precision    recall  f1-score   support

         0.0       0.48      0.58      0.52      1705
         1.0       0.54      0.45      0.49      1910

    accuracy                           0.51      3615
   macro avg       0.51      0.51      0.51      3615
weighted avg       0.51      0.51      0.51      3615



In [34]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Create the pipeline
pipeline = Pipeline([
    ('rf', RandomForestClassifier())
])


In [35]:
from scipy.stats import randint

param_dist = {
    'rf__n_estimators': randint(10, 200),   # Number of trees in the forest
    'rf__max_features': ['auto', 'sqrt', 'log2'],  # Number of features for best split
    'rf__max_depth': randint(1, 20),   # Depth of the tree
    'rf__min_samples_split': randint(2, 10),   # Minimum number of samples to split an internal node
    'rf__min_samples_leaf': randint(1, 10),   # Minimum number of samples required to be at a leaf node
    'rf__bootstrap': [True, False],  # Method of selecting samples for training each tree
    'rf__class_weight': ['balanced', None]   # Weights associated with classes
}


In [36]:
from sklearn.model_selection import RandomizedSearchCV

# Use the random grid to search for best hyperparameters
rf_random = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [37]:
best_model = rf_random.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import accuracy_score
print("Test Accuracy:", accuracy_score(y_test, y_pred))


Test Accuracy: 0.5278008298755187
