In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from typing import List
from sklearn.model_selection import train_test_split
import numpy as np

# If you haven't already, you'll need to download
# these resources.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
skip_model = Word2Vec.load("../models/cbow_model_2min")

In [3]:
prediction_df = pd.read_csv('../data/prediction_data_aapl.csv')

In [4]:
#preprocess dataframe such that the text of column "content" is a list of strings instead of normal letters.

def preprocess_dataframe_content(df: pd.DataFrame) -> List[List[str]]:
    """
    Preprocesses each entry in the 'content' column of the given DataFrame by:
    - Lowercasing
    - Keeping only alphabetic characters
    - Removing stopwords
    - Lemmatizing
    - Filtering out words with length less than 3
    Tokenizes the preprocessed entry into a list of words.
    Returns a list of lists where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.

    Returns:
    - list: A list of lists, where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.
    """
    # Prepare lemmatizer and stopwords list
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Preprocess the content
    processed_content = []
    for content in df['content']:
        # Keep only alphabetic characters and lowercased
        tokens = re.sub('[^a-zA-Z\s]', '', content.lower().strip()).split()
        # Remove stopwords and short words, and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) >= 3]
        processed_content.append(tokens)

    return processed_content


In [5]:
#preprocess the prediction data
prep_pred_data = preprocess_dataframe_content(prediction_df)

In [6]:
#split training and testing data
train_data, test_data = train_test_split(prep_pred_data, test_size=0.2, random_state=42)

In [7]:
document_vectors = []

# Iterate through each document in your dataset
for document in prep_pred_data:
    # Initialize an empty vector for the document
    doc_vector = np.zeros(skip_model.vector_size)
    num_words = 0
    for word in document:
        if word in skip_model.wv:
            doc_vector += skip_model.wv[word]
            num_words += 1
    if num_words > 0:
        doc_vector /= num_words  # Take the average of word vectors in the document
    document_vectors.append(doc_vector)

In [8]:
document_vectors_dataframe = pd.DataFrame(document_vectors)

In [9]:
document_vectors_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.168224,-0.495679,0.324030,-0.242480,0.184591,0.374841,-0.036248,-0.332221,-0.599187,0.368172,...,0.116010,-0.646997,-0.111996,-0.299829,-0.443257,-0.242020,0.841042,-0.073041,-0.109485,-0.098358
1,-0.012605,-0.394349,0.097361,-0.225669,-0.199009,0.392812,-0.153641,0.342992,-0.346285,0.136275,...,0.252347,-0.564153,-0.362388,-0.179683,-0.026317,0.270566,0.617830,0.184016,-0.020996,-0.073160
2,-0.047816,-0.060572,0.057269,-0.000904,-0.292328,0.483402,-0.015738,0.226332,-0.287929,0.094340,...,0.144032,-0.367153,-0.108658,-0.241499,-0.246777,0.256733,0.619212,-0.057776,0.384899,-0.119208
3,-0.040624,-0.168761,0.058535,0.057410,-0.100838,0.292741,-0.004191,0.043423,-0.328297,0.279242,...,0.188750,-0.409938,-0.135549,-0.145407,-0.212148,0.148339,0.432560,-0.214075,0.019210,-0.120047
4,0.089348,-0.326622,0.131642,-0.150949,-0.131480,0.549980,0.038805,-0.056795,-0.331820,0.276895,...,0.125006,-0.406472,-0.159782,-0.189099,-0.325308,0.047118,0.610220,-0.052178,0.079585,-0.173855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18067,-0.095696,-0.171368,0.149107,-0.164140,-0.111491,0.243613,-0.036502,0.238864,-0.068185,0.404670,...,0.256503,-0.290045,-0.146078,-0.302595,-0.104610,0.118958,0.441293,0.070423,0.260583,0.016124
18068,-0.002159,-0.024052,-0.030515,0.008726,-0.373765,0.427209,-0.054720,0.070499,-0.352737,0.225969,...,0.260338,-0.230413,-0.302271,-0.414318,-0.309454,0.150445,0.448909,0.129060,0.384882,-0.178881
18069,0.001782,-0.150355,0.250220,-0.084058,-0.265737,0.535945,0.014501,-0.003547,-0.236052,0.155300,...,0.090240,-0.288605,-0.130172,-0.184711,-0.374830,0.127884,0.660305,-0.004154,0.464820,-0.106518
18070,0.287233,-0.275369,0.172296,0.050259,-0.349938,0.312524,0.019061,-0.355033,-0.593154,0.604696,...,0.115085,-0.467677,-0.249983,-0.779245,-0.383697,0.020691,0.649797,-0.141340,0.126768,-0.051663


In [10]:
prediction_df

Unnamed: 0.1,Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Date,stock_increase
0,49181,270698,AAPL,JPMorgan cautious ahead of Apple earnings,news,jpmorgan lift apple aapl target ahead tomorrow...,2020-01-28,Seeking Alpha,https://invst.ly/pnjv8,2068762,2020-01-28,1.0
1,49182,270699,AAPL,FAANG s Fall but Get Some Wall Street Love,news,kim khan investing com faang stock predictably...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068765,2020-01-28,1.0
2,49183,270700,AAPL,Wall Street tumbles as virus fuels economic worry,news,chuck mikolajczak new york reuters stock suffe...,2020-01-28,Reuters,https://www.investing.com/news/stock-market-ne...,2068311,2020-01-28,1.0
3,49184,270701,AAPL,Earnings Watch Apple and AMD to take earnings...,news,two best performing tech stock set report resu...,2020-01-28,MarketWatch,https://invst.ly/pnlbs,2068906,2020-01-28,1.0
4,49185,270702,AAPL,Day Ahead Top 3 Things to Watch for Jan 28,news,yasin ebrahim kim khan apple ready earnings in...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068907,2020-01-28,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
18067,69407,290924,AAPL,Waiting For Direction On The Markets,opinion,stock market difficult one trader investor ali...,2012-07-16,Cam Hui,https://www.investing.com/analysis/waiting-for...,129680,2012-07-16,1.0
18068,69408,290925,AAPL,Mid Year Update U S And Canadian Stock Marke...,opinion,tsx index leading canadian stock outperformed ...,2012-07-19,Baskin Financial Blog,https://www.investing.com/analysis/mid-year-up...,130056,2012-07-19,1.0
18069,69409,290926,AAPL,Summer Heat Scorches Europe And U S,opinion,europe flare summer heat continues summer heat...,2012-07-23,John Nyaradi,https://www.investing.com/analysis/summer-heat...,130439,2012-07-23,1.0
18070,69410,290927,AAPL,Apple Earnings Preview Quarterly Dip On Deck,opinion,last quarter apple aapl reported best quarter ...,2012-07-23,David Dyer,https://www.investing.com/analysis/apple-earni...,130458,2012-07-23,1.0


In [11]:
# Keep only the 'stock_increase' column from prediction_df
prediction_df = prediction_df[['stock_increase']]

# Merge the two DataFrames side by side
merged_df = pd.concat([prediction_df, document_vectors_dataframe], axis=1)

In [12]:
merged_df

Unnamed: 0,stock_increase,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
0,1.0,0.168224,-0.495679,0.324030,-0.242480,0.184591,0.374841,-0.036248,-0.332221,-0.599187,...,0.116010,-0.646997,-0.111996,-0.299829,-0.443257,-0.242020,0.841042,-0.073041,-0.109485,-0.098358
1,1.0,-0.012605,-0.394349,0.097361,-0.225669,-0.199009,0.392812,-0.153641,0.342992,-0.346285,...,0.252347,-0.564153,-0.362388,-0.179683,-0.026317,0.270566,0.617830,0.184016,-0.020996,-0.073160
2,1.0,-0.047816,-0.060572,0.057269,-0.000904,-0.292328,0.483402,-0.015738,0.226332,-0.287929,...,0.144032,-0.367153,-0.108658,-0.241499,-0.246777,0.256733,0.619212,-0.057776,0.384899,-0.119208
3,1.0,-0.040624,-0.168761,0.058535,0.057410,-0.100838,0.292741,-0.004191,0.043423,-0.328297,...,0.188750,-0.409938,-0.135549,-0.145407,-0.212148,0.148339,0.432560,-0.214075,0.019210,-0.120047
4,1.0,0.089348,-0.326622,0.131642,-0.150949,-0.131480,0.549980,0.038805,-0.056795,-0.331820,...,0.125006,-0.406472,-0.159782,-0.189099,-0.325308,0.047118,0.610220,-0.052178,0.079585,-0.173855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18067,1.0,-0.095696,-0.171368,0.149107,-0.164140,-0.111491,0.243613,-0.036502,0.238864,-0.068185,...,0.256503,-0.290045,-0.146078,-0.302595,-0.104610,0.118958,0.441293,0.070423,0.260583,0.016124
18068,1.0,-0.002159,-0.024052,-0.030515,0.008726,-0.373765,0.427209,-0.054720,0.070499,-0.352737,...,0.260338,-0.230413,-0.302271,-0.414318,-0.309454,0.150445,0.448909,0.129060,0.384882,-0.178881
18069,1.0,0.001782,-0.150355,0.250220,-0.084058,-0.265737,0.535945,0.014501,-0.003547,-0.236052,...,0.090240,-0.288605,-0.130172,-0.184711,-0.374830,0.127884,0.660305,-0.004154,0.464820,-0.106518
18070,1.0,0.287233,-0.275369,0.172296,0.050259,-0.349938,0.312524,0.019061,-0.355033,-0.593154,...,0.115085,-0.467677,-0.249983,-0.779245,-0.383697,0.020691,0.649797,-0.141340,0.126768,-0.051663


In [13]:
from sklearn.model_selection import train_test_split

# Features are all columns except 'stock_increase', and target is 'stock_increase'
X = merged_df.drop(columns=['stock_increase'])
y = merged_df['stock_increase']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report


In [15]:
clf = GaussianNB()
clf.fit(X_train, y_train)


In [16]:
y_pred = clf.predict(X_test)


In [17]:
# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print classification report
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 50.57%
              precision    recall  f1-score   support

         0.0       0.48      0.62      0.54      1705
         1.0       0.54      0.40      0.46      1910

    accuracy                           0.51      3615
   macro avg       0.51      0.51      0.50      3615
weighted avg       0.51      0.51      0.50      3615



In [18]:
merged_df["stock_increase"].value_counts()

1.0    9645
0.0    8427
Name: stock_increase, dtype: int64

In [19]:
y_train.value_counts()

1.0    7735
0.0    6722
Name: stock_increase, dtype: int64

In [20]:
y_test.value_counts()

1.0    1910
0.0    1705
Name: stock_increase, dtype: int64

In [21]:
from sklearn.model_selection import cross_val_score

gnb = GaussianNB()
scores = cross_val_score(gnb, X_train, y_train, cv=5, scoring='accuracy')

print(f"Cross-validated scores: {scores}")
print(f"Mean CV Accuracy: {scores.mean()} +/- {scores.std()}")

# Train on the full training set and test
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Cross-validated scores: [0.5038036  0.52593361 0.50605327 0.50328606 0.50501557]
Mean CV Accuracy: 0.5088184201171854 +/- 0.008611538239202049
0.50567081604426
              precision    recall  f1-score   support

         0.0       0.48      0.62      0.54      1705
         1.0       0.54      0.40      0.46      1910

    accuracy                           0.51      3615
   macro avg       0.51      0.51      0.50      3615
weighted avg       0.51      0.51      0.50      3615



In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# Splitting the data
X = merged_df.drop('stock_increase', axis=1)
y = merged_df['stock_increase']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest with Cross-Validation
rf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')

print(f"Cross-validated scores: {scores}")
print(f"Mean CV Accuracy: {scores.mean()} +/- {scores.std()}")

# Train on the full training set and test
rf.fit(X_train, y_train)
y_pred


Cross-validated scores: [0.54426003 0.52524205 0.52646143 0.53164995 0.53649256]
Mean CV Accuracy: 0.5328212035922041 +/- 0.006979795482774591


array([1., 0., 0., ..., 0., 0., 1.])

In [23]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.50567081604426
              precision    recall  f1-score   support

         0.0       0.48      0.62      0.54      1705
         1.0       0.54      0.40      0.46      1910

    accuracy                           0.51      3615
   macro avg       0.51      0.51      0.50      3615
weighted avg       0.51      0.51      0.50      3615



In [24]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Create the pipeline
pipeline = Pipeline([
    ('rf', RandomForestClassifier())
])


In [25]:
from scipy.stats import randint

param_dist = {
    'rf__n_estimators': randint(10, 200),   # Number of trees in the forest
    'rf__max_features': ['auto', 'sqrt', 'log2'],  # Number of features for best split
    'rf__max_depth': randint(1, 20),   # Depth of the tree
    'rf__bootstrap': [True, False],  # Method of selecting samples for training each tree
    'rf__class_weight': ['balanced', None]   # Weights associated with classes
}


In [26]:
from sklearn.model_selection import RandomizedSearchCV

# Use the random grid to search for best hyperparameters
rf_random = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist, n_iter=20, cv=5, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


10 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()

In [27]:
best_model = rf_random.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import accuracy_score
print("Test Accuracy:", accuracy_score(y_test, y_pred))


Test Accuracy: 0.5278008298755187


In [28]:
best_rf = best_model.named_steps['rf']

best_n_estimators = best_rf.n_estimators
best_max_features = best_rf.max_features
best_max_depth = best_rf.max_depth
best_bootstrap = best_rf.bootstrap
best_class_weight = best_rf.class_weight

print("Best n_estimators:", best_n_estimators)
print("Best max_features:", best_max_features)
print("Best max_depth:", best_max_depth)
print("Best bootstrap:", best_bootstrap)
print("Best class_weight:", best_class_weight)

Best n_estimators: 104
Best max_features: sqrt
Best max_depth: 4
Best bootstrap: False
Best class_weight: None
