In [1]:
#import packages
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from typing import List
from sklearn.model_selection import train_test_split
import numpy as np

# Download the NLTK resources this notebook relies on: the 'stopwords' corpus
# (read via stopwords.words) and 'wordnet' (used by WordNetLemmatizer).
# Re-running is harmless: NLTK reports the packages as already up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the prediction dataset. The path is relative to the notebook's working
# directory; the 'content' column of this frame is the text preprocessed below.
prediction_df = pd.read_csv('../data/prediction_data_aapl.csv')

In [3]:
# Preprocess the DataFrame so that each entry of the 'content' column becomes a list of cleaned word tokens instead of a raw string.

def preprocess_dataframe_content(df: pd.DataFrame) -> List[List[str]]:
    """
    Tokenizes and cleans every entry of the 'content' column of the given DataFrame.

    Each entry is processed in this order:
    - Lowercased and stripped of leading/trailing whitespace
    - Every character that is not an ASCII letter or whitespace is removed
    - Split on whitespace into tokens
    - Tokens that are English stopwords or shorter than 3 characters are dropped
      (both checks are applied to the token before lemmatization)
    - The remaining tokens are lemmatized with WordNetLemmatizer

    Entries that are not strings (e.g. NaN, which pandas uses for missing text)
    produce an empty token list, so the result always has one inner list per row
    and stays aligned with the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.

    Returns:
    - list: A list of lists, where each inner list holds the cleaned tokens of the
    corresponding entry in the 'content' column of the DataFrame.

    Raises:
    - KeyError: If the DataFrame has no 'content' column.
    """
    # Prepare lemmatizer and stopwords list
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Raw string: in a normal string literal '\s' is an invalid escape sequence
    # (a warning today, an error in future Python versions). Compiled once
    # because the pattern does not change between rows.
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

    # Preprocess the content
    processed_content = []
    for content in df['content']:
        if not isinstance(content, str):
            # Missing cells arrive as float NaN and have no .lower(); keep the
            # row (as an empty document) instead of crashing or shifting indices.
            processed_content.append([])
            continue

        # Keep only alphabetic characters and lowercased
        tokens = non_letter_pattern.sub('', content.lower().strip()).split()
        # Remove stopwords and short words, and lemmatize
        tokens = [
            lemmatizer.lemmatize(word)
            for word in tokens
            if word not in stop_words and len(word) >= 3
        ]
        processed_content.append(tokens)

    return processed_content


In [4]:
# Tokenize and clean every entry of prediction_df['content']. The result holds one
# token list per row and is the training corpus for the Word2Vec models below.
prep_pred_data = preprocess_dataframe_content(prediction_df)

## Create CBOW and skip-gram models with vector_size = 100

In [5]:
# CBOW: given a context, predict a word --> this could be used to go from context
# to increase/decrease, but that seems far-fetched to me.
# Trained on the preprocessed token lists; min_count = 1 keeps every word in the vocabulary.
cbow_model = Word2Vec(sentences = prep_pred_data,
                      vector_size = 100,
                      window = 5,
                      min_count = 1,
                      sg = 0) #sg=0 means Cbow, sg=1 means skipgram

In [6]:
# Skip-gram: given a word, predict the context --> given apple, predict what the news will say.
# Same corpus and hyperparameters as cbow_model; only the sg flag differs.
skip_model = Word2Vec(sentences = prep_pred_data,
                      vector_size = 100,
                      window = 5,
                      min_count = 1,
                      sg = 1) #sg=0 means Cbow, sg=1 means skipgram

In [7]:
# Persist both 100-dimensional models. The file names are relative, so the files
# are written to the notebook's current working directory.
skip_model.save("skip_model")
cbow_model.save("cbow_model")

## Create CBOW and skip-gram models with vector_size = 300

In [8]:
# Hyperparameters shared by both 300-dimensional models; only the sg flag
# (0 = CBOW, 1 = skip-gram) differs between them.
w2v_300_params = dict(
    sentences=prep_pred_data,
    vector_size=300,
    window=5,
    min_count=1,
)

# CBOW: given a context, predict a word --> this could be used to go from context
# to increase/decrease, but that seems far-fetched to me.
cbow_model_300 = Word2Vec(sg=0, **w2v_300_params)

# Skip-gram: given a word, predict the context --> given apple, predict what the news will say.
skip_model_300 = Word2Vec(sg=1, **w2v_300_params)

# Persist both models to the current working directory.
skip_model_300.save("skip_model_300")
cbow_model_300.save("cbow_model_300")

## Create CBOW and skip-gram models with min_count = 2 (vector_size = 300)

In [9]:
# min_count is the minimum number of times a word has to appear in the corpus to
# be included in the vocabulary. Both models share these hyperparameters; only the
# sg flag (0 = CBOW, 1 = skip-gram) differs between them.
w2v_2min_params = dict(
    sentences=prep_pred_data,
    vector_size=300,
    window=5,
    min_count=2,
)

# CBOW
cbow_model_2min = Word2Vec(sg=0, **w2v_2min_params)

# Skip-gram
skip_model_2min = Word2Vec(sg=1, **w2v_2min_params)

# Persist both models to the current working directory.
skip_model_2min.save("skip_model_2min")
cbow_model_2min.save("cbow_model_2min")

In [10]:
# Preview the first rows to inspect the available columns.
prediction_df.head()

Unnamed: 0.1,Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Date,stock_increase
0,49181,270698,AAPL,JPMorgan cautious ahead of Apple earnings,news,jpmorgan lift apple aapl target ahead tomorrow...,2020-01-28,Seeking Alpha,https://invst.ly/pnjv8,2068762,2020-01-28,1.0
1,49182,270699,AAPL,FAANG s Fall but Get Some Wall Street Love,news,kim khan investing com faang stock predictably...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068765,2020-01-28,1.0
2,49183,270700,AAPL,Wall Street tumbles as virus fuels economic worry,news,chuck mikolajczak new york reuters stock suffe...,2020-01-28,Reuters,https://www.investing.com/news/stock-market-ne...,2068311,2020-01-28,1.0
3,49184,270701,AAPL,Earnings Watch Apple and AMD to take earnings...,news,two best performing tech stock set report resu...,2020-01-28,MarketWatch,https://invst.ly/pnlbs,2068906,2020-01-28,1.0
4,49185,270702,AAPL,Day Ahead Top 3 Things to Watch for Jan 28,news,yasin ebrahim kim khan apple ready earnings in...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068907,2020-01-28,1.0


In [11]:
# Collect the distinct values of the 'provider' column (returned as a NumPy
# array) and display them as the cell output.
unique_values = prediction_df['provider'].unique()
unique_values

array(['Seeking Alpha', 'Investing.com', 'Reuters', 'MarketWatch', 'CNBC',
       'Zacks Investment Research', 'Brian Gilmartin', '247wallst',
       'Michael Kramer', 'Haris Anwar/Investing.com', 'Keith Fitz-Gerald',
       'Ed Moya', 'Fiona Cincotta', 'The Motley Fool',
       'Jesse Cohen/Investing.com', 'Marc Chandler', 'Lance Roberts',
       'Bloomberg', 'Cointelegraph', 'Jeffrey Halley', 'Tim Knight',
       'Charles Hugh Smith', 'Jani Ziedins', 'TipRanks', 'Brett Owens',
       'Michael Foster', 'Boris Schlossberg',
       'Pinchas Cohen/Investing.com', 'Bitcoinist', 'Craig Erlam',
       'Kathy Lien', 'JJ Kinahan', 'Jay Kaeppel', 'Michael Pento',
       'Ryan Mallory', 'Candy Matheson', 'David I. Kranzler',
       'Stephen Innes', 'Hale Stewart', 'Harry Dent', 'Matt Simpson',
       'Barani Krishnan/Investing.com', 'Matthew Weller', 'Adam Hamilton',
       'Thinknum', 'Juan Maldonado', 'Dan Flynn', 'London Capital Group',
       'Stephen McBride', 'Alexander Kuptsikevich', 'An

In [12]:
# Provider name to look up among the distinct 'provider' values.
name_to_check = "Bloomberg"

# Pick the message matching the membership result, then report it.
membership_message = (
    f"{name_to_check} is in the unique list."
    if name_to_check in unique_values
    else f"{name_to_check} is not in the unique list."
)
print(membership_message)

Bloomberg is in the unique list.
