In [1]:
import pandas as pd
import numpy as np

In [2]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from typing import List

import re
import time

### We are using this notebook to explore the articles with descriptive statistics and other general information

In [3]:
# Load the US equities news dataset from CSV into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
articles = pd.read_csv('../data/us_equities_news_dataset.csv')

In [4]:
# Show the loaded DataFrame (the last expression of a cell is rendered as its output).
articles

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,Gainers NIO NYSE NIO 14 Village Farms In...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249
3,221518,NIO,NIO NVAX among premarket gainers,news,Cemtrex NASDAQ CETX 85 after FY results \n...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039
4,221519,NIO,PLUG NIO among premarket gainers,news,aTyr Pharma NASDAQ LIFE 63 on Kyorin Pharm...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096
...,...,...,...,...,...,...,...,...,...
221508,443024,T,Crude And Steel Still In Sync,opinion,We have been reporting on the trade off betwee...,2012-10-04,Ivan Kitov,https://www.investing.com/analysis/crude-and-s...,138733
221509,443025,T,Forget AT T This Is The Telecom Stock You Sho...,opinion,It s the largest cell phone provider in the wo...,2012-05-30,StreetAuthority,https://www.investing.com/analysis/forget-at-t...,124829
221510,443026,T,Wall Street Exposed Part 3 How Dividends C...,opinion,Before we dicuss how the mechanism of dividend...,2012-07-16,Portfolio Cafe,https://www.investing.com/analysis/wall-street...,129651
221511,443027,T,Weighing The Week Ahead It s All About Jobs,opinion,From start to finish the coming week will hav...,2012-09-02,Jeff Miller,https://www.investing.com/analysis/weighing-th...,134926


In [5]:
# Inspect the Python type of the 'content' value stored under index label 0.
type(articles['content'].loc[0])

str

In [6]:
# Count the rows whose 'content' value is missing.
articles['content'].isnull().sum()

8

### Descriptive Statistics

The dataset contains 221,513 articles (rows). Every article entry has 9 columns; each article is uniquely identified by an id, and its text is stored in the content column. Moreover, there is a release_date column that records when the article was posted.

release_date still has to be converted to datetime, as it is originally of object dtype.

Missing values: there are 8 entries in the content column with missing values; we drop these.

In [7]:
# Keep only the rows that actually have article text; rows with a missing
# 'content' value are discarded (the original index labels are kept).
articles = articles.dropna(subset=['content'])

In [8]:
# Re-count missing 'content' values after the drop above.
articles['content'].isnull().sum()

0

Check. No more missing entries in the content column.

### Related Articles

How many of the articles contain something related to Apple (or its ticker, AAPL)?

In [10]:
def preprocess_dataframe_content(df: pd.DataFrame) -> List[List[str]]:
    """
    Tokenize and clean every entry of the 'content' column of the given DataFrame.

    Each entry is lowercased, every character that is neither an ASCII letter nor
    whitespace is removed (characters are deleted, not replaced by a space, so
    "don't" becomes "dont"), the text is split on whitespace, and tokens shorter
    than 3 characters are discarded.

    Entries that are not strings (e.g. missing values) produce an empty token list
    instead of raising, so the result always has exactly one inner list per row, in
    row order.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.

    Returns:
    - list: A list of lists, where each inner list holds the cleaned tokens of the
    corresponding entry in the 'content' column of the DataFrame.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    # Compiled once instead of once per row.
    non_letters = re.compile(r'[^a-zA-Z\s]')

    return [
        [word for word in non_letters.sub('', content.lower()).split() if len(word) >= 3]
        if isinstance(content, str)
        else []
        for content in df['content']
    ]

In [11]:
# Tokenize every article once; the result has one token list per row of `articles`,
# in the same order.
processed_content = preprocess_dataframe_content(articles)

In [12]:
# Train a Word2Vec model on the tokenized articles and report how long it took.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
apple_cbow_model = Word2Vec(
    sentences=processed_content,
    vector_size=500,  # dimensionality of the word vectors
    window=5,         # context window size
    min_count=1,      # keep every word, even those that occur only once
    sg=0,             # 0 selects CBOW (1 would select skip-gram)
    workers=8,        # number of training threads
)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f'Training took: {elapsed_time} seconds ')

Training took: 265.1552131175995 seconds 


In [13]:
# Persist the trained model to disk so that it does not have to be trained again.
# The path is relative to the notebook's working directory.
apple_cbow_model.save("../model/apple_cbow_model")

In [14]:
def filter_related_articles(processed_content: List[List[str]], articles_df: pd.DataFrame, model: Word2Vec, keywords: list, threshold: float = 0.5) -> pd.DataFrame:
    """
    Filters articles related to the provided keywords based on cosine similarity
    between the articles and the keywords using a trained Word2Vec model.

    Every article and the keyword set are each represented by the mean of the
    vectors of their in-vocabulary words; an article is kept when the cosine
    similarity between the two mean vectors is strictly greater than `threshold`.

    Parameters:
    - processed_content (List[List[str]]): Tokenized articles, one token list per
      article. Entry i must correspond to the i-th row (by position, not by index
      label) of `articles_df`, because matches are selected with `.iloc`.
    - articles_df (pd.DataFrame): DataFrame containing the articles.
    - model (Word2Vec): Trained Word2Vec model.
    - keywords (list): List of keywords related to the topic of interest. Keywords
      that are not in the model's vocabulary are ignored.
    - threshold (float): Cosine similarity threshold to consider an article as related.

    Returns:
    - pd.DataFrame: DataFrame containing the related articles. It is empty when none
      of the keywords is in the model's vocabulary or when no article passes the
      threshold.
    """
    word_vectors = model.wv
    # key_to_index is a dict, so each membership test is O(1). Testing against the
    # index_to_key list would scan the entire vocabulary for every single word.
    vocabulary = word_vectors.key_to_index

    # Calculate the vector representation for each keyword.
    keywords_vectors = [word_vectors[keyword] for keyword in keywords if keyword in vocabulary]

    # If no keyword is in the model's vocabulary, return an empty DataFrame.
    if not keywords_vectors:
        return pd.DataFrame(columns=articles_df.columns)

    # Calculate the average vector for keywords; its norm is loop-invariant.
    average_keyword_vector = np.mean(keywords_vectors, axis=0)
    keyword_norm = np.linalg.norm(average_keyword_vector)

    # A zero-length keyword vector has no direction, so no article can be similar
    # to it; bail out instead of dividing by zero for every article.
    if keyword_norm == 0:
        return articles_df.iloc[[]]

    # Positions (not index labels) of the related articles.
    related_indices = []

    for index, article in enumerate(processed_content):
        # Get the vectors of the words in the article that are in the model's vocabulary.
        article_vectors = [word_vectors[word] for word in article if word in vocabulary]

        # Skip articles with no words in the model's vocabulary.
        if not article_vectors:
            continue

        # Calculate the average vector for the article.
        average_article_vector = np.mean(article_vectors, axis=0)
        article_norm = np.linalg.norm(average_article_vector)

        # Cosine similarity is undefined for a zero vector; such an article can
        # never exceed the threshold, so skip it without dividing by zero.
        if article_norm == 0:
            continue

        # Calculate the cosine similarity between the article and the keywords.
        cosine_similarity = np.dot(average_keyword_vector, average_article_vector) / (keyword_norm * article_norm)

        # If the similarity is above the threshold, store the position of the article.
        if cosine_similarity > threshold:
            related_indices.append(index)

    # Return a DataFrame containing only the related articles.
    return articles_df.iloc[related_indices]

# Usage:
# Keywords must be lowercase: preprocess_dataframe_content lowercases every token
# before training, so an uppercase keyword such as 'AAPL' can never be in the
# model's vocabulary and would be silently ignored by filter_related_articles.
keywords = ['apple', 'aapl', 'iphone']
model = apple_cbow_model
related_articles_df = filter_related_articles(processed_content, articles, model, keywords, threshold=0.5)
