In [19]:
#import packages
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from typing import List
from sklearn.model_selection import train_test_split
import numpy as np

# If you haven't already, you'll need to download these resources.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#import data
prediction_df = pd.read_csv('../data/prediction_data_aapl.csv')

In [3]:
#preprocess dataframe such that the text of column "content" is a list of strings instead of normal letters.

def preprocess_dataframe_content(df: pd.DataFrame) -> List[List[str]]:
    """
    Preprocesses each entry in the 'content' column of the given DataFrame by:
    - Lowercasing
    - Keeping only alphabetic characters
    - Removing stopwords
    - Lemmatizing
    - Filtering out words with length less than 3
    Tokenizes the preprocessed entry into a list of words.
    Returns a list of lists where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.

    Returns:
    - list: A list of lists, where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.
    """
    # Prepare lemmatizer and stopwords list
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Preprocess the content
    processed_content = []
    for content in df['content']:
        # Keep only alphabetic characters and lowercased
        tokens = re.sub('[^a-zA-Z\s]', '', content.lower().strip()).split()
        # Remove stopwords and short words, and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) >= 3]
        processed_content.append(tokens)

    return processed_content


In [4]:
#preprocess the prediction data
prep_pred_data = preprocess_dataframe_content(prediction_df)

In [13]:
#split training and testing data
train_data, test_data = train_test_split(prep_pred_data, test_size=0.2, random_state=42)

In [24]:
#Given a context, predict a word --> kun je doen met context naar increase/decrease maar lijkt me ver gezocht
cbow_model = Word2Vec(sentences = train_data,
                      vector_size = 100,
                      window = 5,
                      min_count = 1,
                      sg = 0) #sg=0 means Cbow, sg=1 means skipgram

In [17]:
#Given a word, predict the context --> given apple, predict what the news will say
skip_model = Word2Vec(sentences = train_data,
                      vector_size = 100,
                      window = 5,
                      min_count = 1,
                      sg = 1) #sg=0 means Cbow, sg=1 means skipgram

In [21]:
document_vectors = []

# Iterate through each document in your dataset
for document in prep_pred_data:
    # Initialize an empty vector for the document
    doc_vector = np.zeros(skip_model.vector_size)
    num_words = 0
    for word in document:
        if word in skip_model.wv:
            doc_vector += skip_model.wv[word]
            num_words += 1
    if num_words > 0:
        doc_vector /= num_words  # Take the average of word vectors in the document
    document_vectors.append(doc_vector)

In [25]:
skip_model.save("skipgram_model")
cbow_model.save("cbow_model")

In [8]:
print(skip_model.wv.most_similar("apple", topn = 10))
print(skip_model.wv.most_similar(positive=["apple"], negative = ['samsung'], topn = 10))

[('aapl', 0.6504042744636536), ('appleapple', 0.6361054182052612), ('sliver', 0.5944479703903198), ('ascendance', 0.5875864624977112), ('megacaps', 0.5815182328224182), ('appstore', 0.5808985233306885), ('ycharts', 0.577272355556488), ('briskly', 0.5772132277488708), ('unsavory', 0.5762864351272583), ('outset', 0.5758554935455322)]
[('aapl', 0.29290643334388733), ('insane', 0.24172966182231903), ('beast', 0.2255399227142334), ('ntnx', 0.2240850329399109), ('ihrt', 0.22397536039352417), ('grub', 0.22346189618110657), ('homework', 0.2212762087583542), ('labeled', 0.22048817574977875), ('rundid', 0.2199888527393341), ('afterhours', 0.2199639081954956)]


In [9]:
# embeddings_cbow = cbow_model.wv.vectors

In [23]:
document_vectors

[array([-0.21928381,  0.25845378, -0.00883241,  0.23867136,  0.11453084,
        -0.43634567,  0.2553025 ,  0.51959824, -0.39846763, -0.24227845,
        -0.3647916 , -0.44972462, -0.02763157,  0.14638367, -0.03822808,
        -0.0641061 , -0.06952988, -0.36282954, -0.18505616, -0.58905398,
         0.02388724,  0.18331388,  0.21231114,  0.09121317, -0.31665141,
         0.26943737, -0.29041156, -0.28799318, -0.21965028,  0.12664942,
         0.26775294, -0.0950312 ,  0.03812185, -0.25591418, -0.11740786,
         0.43293686,  0.13676709, -0.10697007, -0.26555406, -0.42788283,
         0.19778611, -0.00410906,  0.0958795 , -0.16982372,  0.24021791,
         0.08083994, -0.02381387,  0.00544007,  0.23985503,  0.31690703,
         0.09203141, -0.28809789, -0.12837397,  0.01757948, -0.43571526,
         0.26943335,  0.16930696, -0.17411237, -0.17299872,  0.0155137 ,
         0.192138  , -0.13193823, -0.00787482, -0.25750712, -0.3439882 ,
         0.16168323,  0.02364056,  0.17725136, -0.1