In [7]:
#import packages
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from typing import List

# Fetch the NLTK data packages this notebook relies on: 'stopwords' backs
# stopwords.words('english') and 'wordnet' backs WordNetLemmatizer.
# The calls are safe to re-run; NLTK reports already-present packages as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Load the prediction dataset from CSV. The path is relative, so it resolves
# against the notebook's working directory. Later cells read its 'content' column.
prediction_df = pd.read_csv('../data/prediction_data_aapl.csv')

In [6]:
# Preprocess the DataFrame so that each entry of the "content" column becomes a list of cleaned word tokens instead of one raw string.

def preprocess_dataframe_content(df: pd.DataFrame) -> List[List[str]]:
    """
    Tokenize and clean every entry in the 'content' column of the given DataFrame.

    Each string entry is processed as follows:
    - Lowercased
    - Every character that is not an ASCII letter or whitespace is deleted
      (deleted, not replaced by a space, so "well-known" becomes "wellknown")
    - Split on whitespace into words
    - NLTK English stopwords and words shorter than 3 characters are dropped
    - The remaining words are lemmatized with WordNetLemmatizer

    Entries that are not strings (for example the NaN that ``pd.read_csv``
    produces for empty cells) yield an empty token list instead of raising,
    so the result always has exactly one inner list per DataFrame row.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.

    Returns:
    - list: A list of lists, where each inner list holds the cleaned tokens of the
    corresponding row of the 'content' column (same order as the DataFrame).

    Raises:
    - KeyError: If the DataFrame has no 'content' column.
    """
    # Prepare lemmatizer and stopwords list once, outside the row loop
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Raw string avoids the invalid-escape warning for "\s"; compiled once for all rows
    non_alpha_pattern = re.compile(r'[^a-zA-Z\s]')

    processed_content = []
    for content in df['content']:
        # Missing / non-text cells cannot be lowercased; keep row alignment with an empty list
        if not isinstance(content, str):
            processed_content.append([])
            continue

        # Keep only alphabetic characters (lowercased); split() also discards edge whitespace
        words = non_alpha_pattern.sub('', content.lower()).split()
        # Remove stopwords and short words first, then lemmatize the survivors
        tokens = [
            lemmatizer.lemmatize(word)
            for word in words
            if word not in stop_words and len(word) >= 3
        ]
        processed_content.append(tokens)

    return processed_content


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Turn every 'content' entry of prediction_df into a list of cleaned tokens.
# The result (one token list per row) is the training corpus for the Word2Vec models below.
prep_pred_data = preprocess_dataframe_content(prediction_df)

In [10]:
# CBOW: given a context, predict a word --> could be used to go from context to
# increase/decrease, but that seems far-fetched.
cbow_model = Word2Vec(
    sentences=prep_pred_data,
    vector_size=500,
    window=5,
    min_count=1,
    sg=0,  # sg=0 means CBOW, sg=1 means skip-gram
)

In [11]:
# Skip-gram: given a word, predict the context --> given "apple", predict what the news will say.
skip_model = Word2Vec(
    sentences=prep_pred_data,
    vector_size=500,
    window=5,
    min_count=1,
    sg=1,  # sg=0 means CBOW, sg=1 means skip-gram
)

In [20]:
# Ten nearest neighbours of "iphone" in the CBOW embedding space, as (word, similarity) pairs.
print(cbow_model.wv.most_similar("iphone", topn = 10))
# Ten words closest to the vector "apple" minus "samsung" in the CBOW space.
print(cbow_model.wv.most_similar(positive=["apple"], negative = ['samsung'], topn = 10))

[('iphones', 0.6208415627479553), ('handset', 0.5964974761009216), ('smartphone', 0.5851163268089294), ('ipad', 0.5786470174789429), ('gadget', 0.5411442518234253), ('phone', 0.526934027671814), ('smartwatch', 0.5150953531265259), ('smartphones', 0.5022557377815247), ('galaxy', 0.49244457483291626), ('mac', 0.48796501755714417)]
[('moregood', 0.3535393476486206), ('trampling', 0.30653002858161926), ('gainshares', 0.296678751707077), ('facebook', 0.2835407555103302), ('considerurban', 0.27017369866371155), ('considerworkday', 0.26064935326576233), ('verge', 0.2545115053653717), ('nflx', 0.2507176697254181), ('foramerican', 0.24650248885154724), ('explosion', 0.240297332406044)]


In [21]:
# Ten nearest neighbours of "apple" in the skip-gram embedding space, as (word, similarity) pairs.
print(skip_model.wv.most_similar("apple", topn = 10))
# Ten words closest to the vector "apple" minus "samsung" in the skip-gram space.
print(skip_model.wv.most_similar(positive=["apple"], negative = ['samsung'], topn = 10))

[('aapl', 0.6430505514144897), ('briskly', 0.6250367760658264), ('ascendance', 0.6167294979095459), ('appleapple', 0.6146665811538696), ('reignite', 0.5957915186882019), ('sliver', 0.5943598747253418), ('weathered', 0.5898398756980896), ('cannibalized', 0.5845192670822144), ('advanded', 0.5828044414520264), ('nkeheadquartered', 0.5826593637466431)]
[('aapl', 0.29989495873451233), ('grub', 0.23178832232952118), ('biggie', 0.2272990494966507), ('insane', 0.2268953174352646), ('ignoring', 0.22467628121376038), ('ihrt', 0.2210230827331543), ('beast', 0.2204151749610901), ('bleed', 0.21781058609485626), ('judging', 0.21624860167503357), ('absurdly', 0.21575164794921875)]


In [22]:
# Raw CBOW embedding matrix: a 2-D float32 array with one row per vocabulary word
# (presumably in the same order as cbow_model.wv.index_to_key — confirm against gensim docs).
embeddings_cbow = cbow_model.wv.vectors
# A bare assignment displays nothing; end the cell with the value so a fresh
# Restart & Run All reproduces the array preview recorded below.
embeddings_cbow


array([[-1.3813889e+00, -8.2837516e-01, -4.9135578e-01, ...,
         9.5497245e-01,  1.3172935e+00,  9.5718837e-01],
       [-8.4077632e-01,  1.3270417e-01, -7.1590400e-01, ...,
         1.9882495e-02,  6.4561683e-01,  3.5624686e-01],
       [-3.1210819e-01, -1.3658292e+00,  2.1714839e-01, ...,
         7.0638359e-01,  1.5217949e-01,  7.7731621e-01],
       ...,
       [ 3.4957996e-03,  1.0397855e-02,  2.2640338e-03, ...,
        -1.4712235e-02, -1.5349389e-02,  5.7460245e-04],
       [ 5.9544896e-03,  4.7844159e-03,  3.3711873e-03, ...,
         5.7485951e-03, -7.9359729e-03,  9.6770925e-03],
       [ 9.7475816e-03, -1.0345847e-02, -6.2139700e-03, ...,
         4.5812638e-03, -8.1861659e-04,  5.8980617e-03]], dtype=float32)