# **Initialization**

In [1]:
import pandas as pd
import re
import spacy
from collections import Counter
from textblob import TextBlob
# from google.colab import drive

from sklearn.feature_extraction.text import CountVectorizer
from numpy import array, log
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text

#### Mount google drive where the csv files are stored

In [2]:
# Mount Google Drive
# drive.mount('/content/drive', force_remount=True)

### Confirm if drive is mounted by listing the drive contents

In [3]:
# !ls "/content/drive/My Drive"

In [4]:
# news_data = pd.read_csv('/content/drive/My Drive/all_news_data/data.csv')
# traffic_data = pd.read_csv('/content/drive/My Drive/all_news_data/traffic.csv')
# domains_location_data = pd.read_csv('/content/drive/My Drive/all_news_data/domains_location.csv')

In [5]:
news_data = pd.read_csv('../data/data.csv')
traffic_data = pd.read_csv('../data/traffic.csv')
domains_location_data = pd.read_csv('../data/domains_location.csv')

In [6]:
news_data.columns

Index(['article_id', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'category', 'full_content'],
      dtype='object')

In [7]:
news_data.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
0,89541,,International Business Times,Paavan MATHEMA,UN Chief Urges World To 'Stop The Madness' Of ...,UN Secretary-General Antonio Guterres urged th...,https://www.ibtimes.com/un-chief-urges-world-s...,https://d.ibtimes.com/en/full/4496078/nepals-g...,2023-10-30 10:12:35.000000,UN Secretary-General Antonio Guterres urged th...,Nepal,UN Secretary-General Antonio Guterres urged th...
1,89542,,Prtimes.jp,,RANDEBOOよりワンランク上の大人っぽさが漂うニットとベストが新登場。,[株式会社Ainer]\nRANDEBOO（ランデブー）では2023年7月18日(火)より公...,https://prtimes.jp/main/html/rd/p/000000147.00...,https://prtimes.jp/i/32220/147/ogp/d32220-147-...,2023-10-06 04:40:02.000000,"RANDEBOO2023718()WEB2023 Autumn Winter \n""Nepa...",Nepal,
2,89543,,VOA News,webdesk@voanews.com (Agence France-Presse),UN Chief Urges World to 'Stop the Madness' of ...,UN Secretary-General Antonio Guterres urged th...,https://www.voanews.com/a/un-chief-urges-world...,https://gdb.voanews.com/01000000-0a00-0242-60f...,2023-10-30 10:53:30.000000,"Kathmandu, Nepal UN Secretary-General Antonio...",Nepal,
3,89545,,The Indian Express,Editorial,Sikkim warning: Hydroelectricity push must be ...,Ecologists caution against the adverse effects...,https://indianexpress.com/article/opinion/edit...,https://images.indianexpress.com/2023/10/edit-...,2023-10-06 01:20:24.000000,At least 14 persons lost their lives and more ...,Nepal,At least 14 persons lost their lives and more ...
4,89547,,The Times of Israel,Jacob Magid,"200 foreigners, dual nationals cut down in Ham...","France lost 35 citizens, Thailand 33, US 31, U...",https://www.timesofisrael.com/200-foreigners-d...,https://static.timesofisrael.com/www/uploads/2...,2023-10-27 01:08:34.000000,"Scores of foreign citizens were killed, taken ...",Nepal,


# **Data preprocessing**

In [8]:
# Function to preprocess source_name into source_id
def preprocess_source_id(source_name):
    # Convert to lowercase
    source_id = source_name.lower()
    # Replace spaces with hyphens
    source_id = source_id.replace(" ", "-")
    # Keep only alphanumeric characters, hyphens, brackets and full stops
    source_id = re.sub(r'[^\w\-.()]+', '', source_id)
    return source_id

# Apply the function to the source_name column to create the source_id column
news_data['source_id'] = news_data['source_name'].apply(preprocess_source_id)

In [9]:
# Preprocessing for author column missing values
news_data['author'].fillna('Unknown', inplace=True)

# Deal with full_content
news_data.dropna(subset=['full_content'], inplace=True)

# Since url_to_image is not important we can fill with placeholder
news_data['url_to_image'].fillna('http://example.com/placeholder.jpg', inplace=True)

# same with description column, use placeholder, unless otherwise important
news_data['description'].fillna('No description provided', inplace=True)

# Replace missing values in 'category' with 'Unknown'
news_data['category'].fillna('Unknown', inplace=True)

# Replace missing values in 'title' with 'No title provided'
news_data['title'].fillna('No title provided', inplace=True)

In [10]:
# Perform preprocessing and clean missing values on the domains_info dataset
domains_location_data.dropna(subset=['Country'], inplace=True)

### More Preprocessing, on date fields

In [11]:
# prompt: Using dataframe news_data: perform data preprocessing and cleaning

# Check for missing values
news_data.isnull().sum()

# Drop rows with missing values
news_data.dropna(inplace=True)

# Remove duplicate rows
news_data.drop_duplicates(inplace=True)

# Convert the 'published_at' column to datetime format
news_data['published_at'] = pd.to_datetime(news_data['published_at'], format='ISO8601')
# news_data['published_at'] = pd.to_datetime(news_data['published_at'])

# Extract the year from the 'published_at' column
news_data['year'] = news_data['published_at'].dt.year

# Group the data by year and calculate

ValueError: time data '2023-10-30 10:12:35.000000' does not match format 'ISO8601' (match)

In [None]:
news_data.head()

In [None]:
missing_values_news_data = news_data.isnull().sum()
print(missing_values_news_data)

In [None]:
missing_values_domains_location_data = domains_location_data.isnull().sum()
print(missing_values_domains_location_data)



In [12]:
missing_values_traffic_data = traffic_data.isnull().sum()
print(missing_values_traffic_data)

GlobalRank        0
TldRank           0
Domain            0
TLD               0
RefSubNets        0
RefIPs            0
IDN_Domain        0
IDN_TLD           0
PrevGlobalRank    0
PrevTldRank       0
PrevRefSubNets    0
PrevRefIPs        0
dtype: int64


In [13]:
domains_location_data.columns

Index(['SourceCommonName', 'location', 'Country'], dtype='object')

### Fix missing values

In [14]:
traffic_data.columns

Index(['GlobalRank', 'TldRank', 'Domain', 'TLD', 'RefSubNets', 'RefIPs',
       'IDN_Domain', 'IDN_TLD', 'PrevGlobalRank', 'PrevTldRank',
       'PrevRefSubNets', 'PrevRefIPs'],
      dtype='object')

In [15]:
news_data = news_data.dropna()
missing_values_news_data = news_data.isnull().sum()
print(missing_values_news_data)

article_id      0
source_id       0
source_name     0
author          0
title           0
description     0
url             0
url_to_image    0
published_at    0
content         0
category        0
full_content    0
dtype: int64


### Check outliers

In [16]:
# Check for missing values
news_data.isnull().all()

article_id      False
source_id       False
source_name     False
author          False
title           False
description     False
url             False
url_to_image    False
published_at    False
content         False
category        False
full_content    False
dtype: bool

In [17]:
len(news_data)

54889

In [18]:
domains_location_data.isnull().all()

SourceCommonName    False
location            False
Country             False
dtype: bool

In [19]:
domains_location_data.empty

False

In [20]:
news_data.columns

Index(['article_id', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'category', 'full_content'],
      dtype='object')

In [21]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54889 entries, 0 to 105374
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_id    54889 non-null  int64 
 1   source_id     54889 non-null  object
 2   source_name   54889 non-null  object
 3   author        54889 non-null  object
 4   title         54889 non-null  object
 5   description   54889 non-null  object
 6   url           54889 non-null  object
 7   url_to_image  54889 non-null  object
 8   published_at  54889 non-null  object
 9   content       54889 non-null  object
 10  category      54889 non-null  object
 11  full_content  54889 non-null  object
dtypes: int64(1), object(11)
memory usage: 5.4+ MB


In [22]:
news_data.describe()

Unnamed: 0,article_id
count,54889.0
mean,305940.568019
std,220864.360813
min,418.0
25%,105821.0
50%,271035.0
75%,469868.0
max,781308.0


# **NLP**

## Keyword Extraction with TF-IDF

In [108]:
def extract_keywords_custom_tfidf(text_data, max_features=10):
    # Use the 'article' column as the text data
    array_text = text_data.dropna().tolist()

     # Extend the stop words used by CountVectorizer
    stop_words = text.ENGLISH_STOP_WORDS.union(['said', 'new'])

    # Convert the set of stop words to a list
    stop_words = list(stop_words)

    # Initialize the CountVectorizer with stop_words set to 'english'
    vectorizer = CountVectorizer(max_features=max_features, stop_words=stop_words)
    
    # Fit the vectorizer on the text data
    tf = vectorizer.fit_transform([x.lower() for x in array_text])
    tf = tf.toarray()
    tf = log(tf + 1)

    # Compute IDF values
    df = pd.DataFrame(tf, columns=vectorizer.get_feature_names_out())
    # calculates the Inverse Document Frequency (IDF) for each word in the text data.
    idf = (len(array_text) / (df > 0).sum()).apply(log)

    # We are ready to multiply the TF and IDF values to get the TF-IDF values.
    tfidf = tf.copy()
    words = array(vectorizer.get_feature_names_out())

    for word in words:
        tfidf[:, words == word] = tfidf[:, words == word] * idf[word]

    keywords = []
    for j in range(tfidf.shape[0]):
        # Get the top 5 words with the highest TF-IDF values
        top_words = [words[i] for i in tfidf[j].argsort()[-5:][::-1]]
        keywords.append(top_words)

    return keywords

### Extract keywords for the first 10 and last 10 articles' title and content


In [109]:
# Extract keywords for the first 10 and last 10 articles' title and content
limited_news_data = pd.concat([news_data.head(10), news_data.tail(10)])
limited_news_data['title_keywords'] = extract_keywords_custom_tfidf(limited_news_data['title'])
limited_news_data['content_keywords'] = extract_keywords_custom_tfidf(limited_news_data['full_content'])

#### Check the added columns of keywords

In [112]:
limited_news_data.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content,title_keywords,content_keywords
0,89541,international-business-times,International Business Times,Paavan MATHEMA,UN Chief Urges World To 'Stop The Madness' Of ...,UN Secretary-General Antonio Guterres urged th...,https://www.ibtimes.com/un-chief-urges-world-s...,https://d.ibtimes.com/en/full/4496078/nepals-g...,2023-10-30 10:12:35.000000,UN Secretary-General Antonio Guterres urged th...,Nepal,UN Secretary-General Antonio Guterres urged th...,"[world, climate, chief, change, pay]","[world, people, million, state, israel]"
3,89545,the-indian-express,The Indian Express,Editorial,Sikkim warning: Hydroelectricity push must be ...,Ecologists caution against the adverse effects...,https://indianexpress.com/article/opinion/edit...,https://images.indianexpress.com/2023/10/edit-...,2023-10-06 01:20:24.000000,At least 14 persons lost their lives and more ...,Nepal,At least 14 persons lost their lives and more ...,"[world, pay, party, palestine, india]","[state, indian, government, people, world]"
6,89551,al-jazeera-english,Al Jazeera English,Kaushik Raj,Pro-Israel rallies allowed in India but Palest...,"India, the first non-Arab country to recognise...",https://www.aljazeera.com/news/2023/10/25/pro-...,https://www.aljazeera.com/wp-content/uploads/2...,2023-10-25 09:58:17.000000,"New Delhi, India Israels relentless bombing of...",Nepal,"India, the first non-Arab country to recognise...","[palestine, india, world, pay, party]","[israel, india, state, government, indian]"
7,89555,the-indian-express,The Indian Express,New York Times,No nation in the world is buying more planes t...,India's largest airlines have ordered nearly 1...,https://indianexpress.com/article/business/avi...,https://images.indianexpress.com/2023/11/igiai...,2023-11-02 05:48:58.000000,No nation in the world is buying as many airpl...,Nepal,Written by Alex Travelli and Hari Kumar No nat...,"[world, india, pay, party, palestine]","[india, million, indian, 000, world]"
12,89563,the-times-of-india,The Times of India,Durgesh Nandan Jha,PM Hasina’s war on terror gets daughter India’...,India News: NEW DELHI: India preferred Banglad...,https://timesofindia.indiatimes.com/india/pm-h...,"https://static.toiimg.com/thumb/msid-47529300,...",2023-11-02 01:12:47.000000,Ranked! Worlds most loved landmarks; Taj Mahal...,Nepal,NEW DELHI: India preferred Bangladesh over Nep...,"[india, world, pay, party, palestine]","[india, world, state, people, million]"


### Similarity of keywords between title and keywords in the news

In [125]:
# Convert the list of keywords for each article into a string
limited_news_data['title_keywords_str'] = limited_news_data['title_keywords'].apply(' '.join)
limited_news_data['content_keywords_str'] = limited_news_data['content_keywords'].apply(' '.join)

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Convert the keywords into TF-IDF vectors
title_tfidf = vectorizer.fit_transform(limited_news_data['title_keywords_str'])
content_tfidf = vectorizer.transform(limited_news_data['content_keywords_str'])

# Calculate the cosine similarity between the title and content keywords for each article
similarity_scores = cosine_similarity(title_tfidf, content_tfidf)

# Print the similarity scores
print("The similarity score between keywords in the title and content of the articles is as follows:")
for i, score in enumerate(similarity_scores):
    print(f"article {i+1}: {score[0]}")

The similarity score between keywords in the title and content of the articles is as follows:
article 1: 0.18886369489717672
article 2: 0.41171810764997363
article 3: 0.41171810764997363
article 4: 0.41171810764997363
article 5: 0.41171810764997363
article 6: 0.22109649138855314
article 7: 0.41171810764997363
article 8: 0.41171810764997363
article 9: 0.25158568922967073
article 10: 0.41171810764997363
article 11: 0.2761797570583058
article 12: 0.41171810764997363
article 13: 0.41171810764997363
article 14: 0.41171810764997363
article 15: 0.41171810764997363
article 16: 0.41171810764997363
article 17: 0.41171810764997363
article 18: 0.41171810764997363
article 19: 0.2526691720356441
article 20: 0.41171810764997363


## Categorize the title/content into known set of topic categories

In [17]:
import os
import shutil

# Define the path of the cache directory
cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "models")

# Define the name of the model directory to delete
model_dir = "sentence-transformers_all-MiniLM-L6-v2"

# Define the full path of the model directory
full_model_dir = os.path.join(cache_dir, model_dir)

# Delete the model directory
shutil.rmtree(full_model_dir, ignore_errors=True)

# Now, you can re-run your BERTopic code

In [19]:
# Import the necessary libraries
from bertopic import BERTopic

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/hilla/anaconda3/envs/10academyw0/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_17057/3998661163.py", line 11, in <module>
    topics, probs = topic_model.fit_transform(docs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hilla/anaconda3/envs/10academyw0/lib/python3.12/site-packages/bertopic/_bertopic.py", line 433, in fit_transform
    self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)
  File "/home/hilla/anaconda3/envs/10academyw0/lib/python3.12/site-packages/bertopic/_bertopic.py", line 3636, in _extract_topics
    self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hilla/anaconda3/envs/10academyw0/lib/python3.12/site-packages/bertopic/_bertopic.py", line 3835, in _c_tf_idf
    self.vectoriz

# Analyze topics and trends

In [157]:
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from collections import Counter

# Load the data
docs = news_data['content'].values.tolist()

# Create an instance of BERTopic
topic_model = BERTopic()

# Fit BERTopic to the news document and transform the news documents into topics
topics, _ = topic_model.fit_transform(docs)

# Add topics to the original dataframe
news_data['topic'] = topics

# Analyze the topics and trends
# Determine which websites reported the most diverse topics
website_topic_counts = news_data.groupby('source_name')['topic'].nunique().sort_values(ascending=False)
print("Websites with the most diverse topics:")
print(website_topic_counts)

# Plot a 2D scatter plot to visualize the trends
# Convert date to datetime format and extract date only, assuming 'date' is your date column
news_data['date'] = pd.to_datetime(news_data['date']).dt.date

# Count the number of each topic on each date
topic_counts = news_data.groupby(['date', 'topic']).size().reset_index(name='count')

# Plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=topic_counts, x='date', y='topic', size='count', legend=False, sizes=(20, 2000))
plt.xlabel('Date')
plt.ylabel('Topic')
plt.title('Topic Trends Over Time')
plt.show()

KeyboardInterrupt: 

# Categorization of Headlines into the following tags:

## The tags include:
1. Breaking News
2. Politics
3. World News
4. Business/Finance
5. Technology
6. Science
7. Health
8. Entertainment
9. Sports
10. Environment
11. Crime
12. Education
13. Weather
14. Other


In [45]:
# Define the tags and corresponding keywords
tags = {
    "Breaking News": ["breaking", "alert"],
    "Politics": ["politics", "government", "election"],
    "World News": ["world", "international"],
    "Business/Finance": ["business", "finance", "economy", "stock", "market"],
    "Technology": ["technology", "tech", "innovation", "digital"],
    "Science": ["science", "research", "discovery"],
    "Health": ["health", "medical", "disease"],
    "Entertainment": ["entertainment", "celebrity", "hollywood"],
    "Sports": ["sports", "game", "competition"],
    "Environment": ["environment", "climate", "nature"],
    "Crime": ["crime", "law"],
    "Education": ["education", "school", "learning"],
    "Weather": ["weather", "forecast", "climate"],
    "Other": []
}

In [49]:
# Function to categorize the headlines into tags
def categorize_headlines(headlines, tags):
    # Initialize an empty list to store the categories
    categories = []

    # Iterate through the headlines
    for headline in headlines:
        # Convert the headline to lowercase
        headline = headline.lower()

        # Initialize a list to store the tags for the headline
        headline_tags = []

        # Iterate through the tags
        for tag, keywords in tags.items():
            # Check if any keyword for the tag is present in the headline
            if any(keyword in headline for keyword in keywords):
                return tag

        # If no tags were found, assign the "Other" tag
        if not headline_tags:
            return "Other"

        # Add the tags for the headline to the categories list
        categories.append(headline_tags)

    return categories

In [50]:
# Categorize the headlines into tags
news_data['tags'] = categorize_headlines(news_data['title'], tags)


In [51]:
news_data.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content,tags
0,89541,international-business-times,International Business Times,Paavan MATHEMA,UN Chief Urges World To 'Stop The Madness' Of ...,UN Secretary-General Antonio Guterres urged th...,https://www.ibtimes.com/un-chief-urges-world-s...,https://d.ibtimes.com/en/full/4496078/nepals-g...,2023-10-30 10:12:35.000000,UN Secretary-General Antonio Guterres urged th...,Nepal,UN Secretary-General Antonio Guterres urged th...,World News
3,89545,the-indian-express,The Indian Express,Editorial,Sikkim warning: Hydroelectricity push must be ...,Ecologists caution against the adverse effects...,https://indianexpress.com/article/opinion/edit...,https://images.indianexpress.com/2023/10/edit-...,2023-10-06 01:20:24.000000,At least 14 persons lost their lives and more ...,Nepal,At least 14 persons lost their lives and more ...,World News
6,89551,al-jazeera-english,Al Jazeera English,Kaushik Raj,Pro-Israel rallies allowed in India but Palest...,"India, the first non-Arab country to recognise...",https://www.aljazeera.com/news/2023/10/25/pro-...,https://www.aljazeera.com/wp-content/uploads/2...,2023-10-25 09:58:17.000000,"New Delhi, India Israels relentless bombing of...",Nepal,"India, the first non-Arab country to recognise...",World News
7,89555,the-indian-express,The Indian Express,New York Times,No nation in the world is buying more planes t...,India's largest airlines have ordered nearly 1...,https://indianexpress.com/article/business/avi...,https://images.indianexpress.com/2023/11/igiai...,2023-11-02 05:48:58.000000,No nation in the world is buying as many airpl...,Nepal,Written by Alex Travelli and Hari Kumar No nat...,World News
12,89563,the-times-of-india,The Times of India,Durgesh Nandan Jha,PM Hasina’s war on terror gets daughter India’...,India News: NEW DELHI: India preferred Banglad...,https://timesofindia.indiatimes.com/india/pm-h...,"https://static.toiimg.com/thumb/msid-47529300,...",2023-11-02 01:12:47.000000,Ranked! Worlds most loved landmarks; Taj Mahal...,Nepal,NEW DELHI: India preferred Bangladesh over Nep...,World News


# Topic Modelling



## Preprocessing the text data

#### REDUCE THE SIZE OF THE DATASET FOR TESTING PURPOSES


In [23]:
fraction = 0.16

# Randomly sample the fraction of data
news_data_sample = news_data.sample(frac=fraction)

In [24]:
news_data_sample.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content
75120,296048,deadline,Deadline,Jesse Whittock,Sales Bonanza: British TV Show Exports Fetch R...,British TV shows fetched a record £1.85B ($2.2...,https://deadline.com/2023/11/british-tv-shows-...,https://deadline.com/wp-content/uploads/2021/0...,2023-11-13 01:01:00,British TV shows fetched a record £1.85B ($2.2...,COVID,British TV shows fetched a record £1.85B ($2.2...
89538,456681,abc-news,ABC News,JOE REEDY AP sports writer,Remaining schedule and Burrow's injury provide...,Even before Joe Burrow’s season-ending wrist i...,https://abcnews.go.com/Politics/wireStory/rema...,https://i.abcnewsfe.com/a/205733e5-ce8d-400d-b...,2023-11-18 18:46:46,Even before Joe Burrow's season-ending wrist i...,Jordan,Even before Joe Burrow's season-ending wrist i...
58837,140524,etf-daily-news,ETF Daily News,MarketBeat News,"Versor Investments LP Acquires Shares of 11,11...",Versor Investments LP purchased a new stake in...,https://www.etfdailynews.com/2023/11/04/versor...,https://www.americanbankingnews.com/wp-content...,2023-11-04 14:40:44,Versor Investments LP purchased a new stake in...,Technology,Versor Investments LP purchased a new stake in...
12422,115040,marketscreener.com,Marketscreener.com,Unknown,Equifax Board of Directors Declares Quarterly ...,"(marketscreener.com) \n\nATLANTA, Nov. 2, 2023...",https://www.marketscreener.com/quote/stock/EQU...,https://www.marketscreener.com/images/twitter_...,2023-11-02 16:26:03.000000,"ATLANTA, Nov. 2, 2023 /PRNewswire/ -- Equifax\...",Asia,"ATLANTA,Nov. 2, 2023/PRNewswire/ --Equifax® (N..."
80462,330617,rt,RT,RT,EU state preparing to shut border with Russia,Finland is considering closing border checkpoi...,https://www.rt.com/news/587254-finland-prepari...,https://mf.b37mrtl.ru/files/2023.11/article/65...,2023-11-14 13:56:11,Finland could soon close border checkpoints wi...,Health,Finland could soon close border checkpoints wi...


In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk

# Download the NLTK stopwords and WordNet lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/hilla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hilla/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# Step 1: Preprocess the text
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean_text(text):
    # check if the input is a string
    if not isinstance(text, str):
        return ""
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [27]:
doc_clean = [clean_text(doc).split() for doc in news_data_sample['title']]

## Topic Modelling using LDA

1. Preprocess the text: This involves removing punctuation, lowercasing, tokenizing, removing stop words, and stemming/lemmatizing.
2. Vectorize the text: Convert the documents into a document-term matrix, where each row represents a document and each column represents a term (word).
3. Apply LDA: Use the LatentDirichletAllocation class from sklearn.decomposition to fit the LDA model on the document-term matrix.

### Step 1: Preprocess the text

### Step 2: Vectorize the text

In [28]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum required occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             max_features=50000,               # max number of unique words
                            )

In [29]:
data_vectorized = vectorizer.fit_transform(news_data_sample['title'])

### Step 3: Create, Appy and Train LDA model

In [30]:
# Define the number of topics
num_topics = 10

# Create the LDA model
lda_model = LatentDirichletAllocation(n_components=num_topics,               # Number of topics
                                      max_iter=10,                           # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,                      # Random state
                                      batch_size=128,                        # n docs in each learning iter
                                      evaluate_every = -1,                   # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,                           # Use all available CPUs
                                     )

In [31]:
lda_output = lda_model.fit_transform(data_vectorized)

In [32]:
# Print the topics found by the LDA model
for index, topic in enumerate(lda_model.components_):
    print(f'Top 10 words for topic #{index}:')
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['chief', 'bought', 'director', 'reports', 'head', 'cup', 'financial', '000', 'world', 'india']


Top 10 words for topic #1:
['international', 'target', 'new', 'shares', 'sells', 'price', 'group', 'investment', 'nyse', 'nasdaq']


Top 10 words for topic #2:
['israeli', 'climate', 'health', 'russia', 'ukraine', 'hamas', 'war', 'gaza', 'says', 'israel']


Top 10 words for topic #3:
['acquires', 'sold', 'energy', 'wealth', 'capital', 'nasdaq', 'llc', 'management', 'nyse', 'shares']


Top 10 words for topic #4:
['coverage', 'hold', 'russian', 'security', 'trust', 'nyse', 'dividend', 'asset', 'stocknews', 'com']


Top 10 words for topic #5:
['chinese', 'increases', 'meeting', 'short', 'october', 'buys', 'set', 'stake', 'otcmkts', 'new']


Top 10 words for topic #6:
['black', 'update', 'llc', 'nysearca', 'advisors', 'etf', 'nasdaq', 'holdings', 'stock', 'million']


Top 10 words for topic #7:
['military', 'south', 'lon', 'data', 'time', 'canada', 'bank', 'plc', 'ye

### Step 4: Assign topics to documents

In [33]:
# Categorize the title into known set of topic categories
import numpy as np

topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
docnames = ["Doc" + str(i) for i in range(len(news_data_sample['title']))]
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
dominant_topic = np.argmax(df_document_topic.values, axis=1)
news_data_sample['dominant_topic'] = dominant_topic

In [34]:
news_data_sample.head()

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content,dominant_topic
75120,296048,deadline,Deadline,Jesse Whittock,Sales Bonanza: British TV Show Exports Fetch R...,British TV shows fetched a record £1.85B ($2.2...,https://deadline.com/2023/11/british-tv-shows-...,https://deadline.com/wp-content/uploads/2021/0...,2023-11-13 01:01:00,British TV shows fetched a record £1.85B ($2.2...,COVID,British TV shows fetched a record £1.85B ($2.2...,6
89538,456681,abc-news,ABC News,JOE REEDY AP sports writer,Remaining schedule and Burrow's injury provide...,Even before Joe Burrow’s season-ending wrist i...,https://abcnews.go.com/Politics/wireStory/rema...,https://i.abcnewsfe.com/a/205733e5-ce8d-400d-b...,2023-11-18 18:46:46,Even before Joe Burrow's season-ending wrist i...,Jordan,Even before Joe Burrow's season-ending wrist i...,4
58837,140524,etf-daily-news,ETF Daily News,MarketBeat News,"Versor Investments LP Acquires Shares of 11,11...",Versor Investments LP purchased a new stake in...,https://www.etfdailynews.com/2023/11/04/versor...,https://www.americanbankingnews.com/wp-content...,2023-11-04 14:40:44,Versor Investments LP purchased a new stake in...,Technology,Versor Investments LP purchased a new stake in...,3
12422,115040,marketscreener.com,Marketscreener.com,Unknown,Equifax Board of Directors Declares Quarterly ...,"(marketscreener.com) \n\nATLANTA, Nov. 2, 2023...",https://www.marketscreener.com/quote/stock/EQU...,https://www.marketscreener.com/images/twitter_...,2023-11-02 16:26:03.000000,"ATLANTA, Nov. 2, 2023 /PRNewswire/ -- Equifax\...",Asia,"ATLANTA,Nov. 2, 2023/PRNewswire/ --Equifax® (N...",4
80462,330617,rt,RT,RT,EU state preparing to shut border with Russia,Finland is considering closing border checkpoi...,https://www.rt.com/news/587254-finland-prepari...,https://mf.b37mrtl.ru/files/2023.11/article/65...,2023-11-14 13:56:11,Finland could soon close border checkpoints wi...,Health,Finland could soon close border checkpoints wi...,3


In [79]:
### BERTopic modelling

## Topic Modelling using BERTopic

### Step 1: Preprocess the text for BERTopic

- The difference in preprocessing for BERTopic from LDA is we don't need to split the text into words

In [134]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk

# Download the NLTK stopwords and WordNet lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/hilla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hilla/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [135]:
# Step 1: Preprocess the text
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean_text(text):
    # check if the input is a string
    if not isinstance(text, str):
        return ""
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

#### The difference in preprocessing from LDA is in this line of code, we don't need to split the text into words

In [136]:
doc_clean = [clean_text(doc) for doc in news_data_sample['title']]

In [137]:
from bertopic import BERTopic

# Create an instance of BERTopic
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)

In [138]:
# Fit BERTopic to the news and transform the news into topics
topics, probs = topic_model.fit_transform(doc_clean)

2024-04-10 21:00:32,069 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/275 [00:00<?, ?it/s]

2024-04-10 21:01:16,418 - BERTopic - Embedding - Completed ✓
2024-04-10 21:01:16,418 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-10 21:01:26,999 - BERTopic - Dimensionality - Completed ✓
2024-04-10 21:01:27,000 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-10 21:01:33,426 - BERTopic - Cluster - Completed ✓
2024-04-10 21:01:33,430 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-10 21:01:33,796 - BERTopic - Representation - Completed ✓


In [139]:
# Get an overview of the topics
topic_overview = topic_model.get_topic_info()

In [140]:
topic_model.get_topic(0)   # Select the most frequent topic

[('ukraine', 0.07475591815522657),
 ('russia', 0.05374129870118792),
 ('russian', 0.04930142161778385),
 ('ukrainian', 0.0340223053777114),
 ('eu', 0.03240442770154239),
 ('minister', 0.02462194611238183),
 ('foreign', 0.023255089150752105),
 ('putin', 0.018998309692766627),
 ('nato', 0.017985486273167376),
 ('drone', 0.017360945573887447)]

In [141]:
### Attributes of the BERTopic model

In [142]:
topic_model.topics_[:10]

[68, 6, 21, 71, 40, -1, -1, 40, -1, 10]

### Visualizing the topics

In [143]:
topic_model.visualize_topics()

# Model the events that the news articles are written about

## Model the events that the articles are covering

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from collections import Counter
import spacy

In [146]:
# Load Spacy's English model
nlp = spacy.load('en_core_web_sm')


In [148]:
# Preprocess the data
news_data_sample['title'] = news_data_sample['title'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x) if not token.is_stop and not token.is_punct and not token.like_num]))


In [149]:
# Extract features from the text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_data_sample['title'])


In [150]:
# Cluster the articles
clustering = AgglomerativeClustering(n_clusters=500).fit(X.toarray())

In [151]:
# Add the cluster labels to the dataframe
news_data_sample['cluster'] = clustering.labels_

In [152]:
# Label the clusters
cluster_labels = []
for cluster in range(500):
    # Get the articles in this cluster
    articles = news_data_sample[news_data_sample['cluster'] == cluster]['title']

    # Combine the articles into one long string
    text = ' '.join(articles)

    # Get the most common words in this text
    words = Counter(text.split())
    most_common_words = words.most_common(5)

    # Use the most common words as the cluster label
    label = ' '.join([word for word, _ in most_common_words])
    cluster_labels.append(label)
    

In [153]:
news_data_sample['event'] = news_data_sample['cluster'].apply(lambda x: cluster_labels[x])

In [158]:
news_data_sample.head(20)

Unnamed: 0,article_id,source_id,source_name,author,title,description,url,url_to_image,published_at,content,category,full_content,cluster,event
537,90319,rt,RT,RT,India walk tightrope Israel Palestine conflict,India has to walk a tightrope on the Israel-Pa...,https://www.rt.com/india/585959-india-mahatma-...,https://mf.b37mrtl.ru/files/2023.10/article/65...,2023-10-28 02:59:45.000000,A Saudi Arabian prince recently asked the peop...,Oman,A Saudi Arabian prince recently asked the peop...,463,Israel Palestine conflict India walk
90822,476902,etf-daily-news,ETF Daily News,MarketBeat News,Quadrant Capital Group LLC Sells share Public ...,Quadrant Capital Group LLC trimmed its positio...,https://www.etfdailynews.com/2023/11/19/quadra...,https://www.americanbankingnews.com/wp-content...,2023-11-19 14:52:41,Quadrant Capital Group LLC trimmed its positio...,Real estate,Quadrant Capital Group LLC trimmed its positio...,95,Storage NYSE Inc. Public PSA
77031,318713,npr,NPR,Steve Inskeep,patient doctor Gaza hospital catch Israel camp...,Israeli ground forces are battling Hamas on th...,https://www.npr.org/2023/11/13/1212589319/pati...,https://media.npr.org/include/images/facebook-...,2023-11-13 10:10:37,Israeli ground forces are battling Hamas on th...,Israel,Israeli forces are battling Hamas on the stree...,467,Gaza hospital Israel Hospital strike
73526,274770,npr,NPR,Alana Wise,Veterans want ceasefire Gaza,NPR spoke to veterans ahead of the Saturday ho...,https://www.npr.org/2023/11/11/1212429107/some...,https://media.npr.org/include/images/facebook-...,2023-11-11 12:13:05,American flags are on display at the World War...,Afghanistan,A fireball erupts during Israeli bombardment i...,25,Gaza ceasefire UN warn call
75407,299257,etf-daily-news,ETF Daily News,MarketBeat News,SunCar Technology Group NASDAQ SDA share Gap $,SunCar Technology Group Inc. (NASDAQ:SDA – Get...,https://www.etfdailynews.com/2023/11/13/suncar...,https://www.americanbankingnews.com/wp-content...,2023-11-13 16:46:53,SunCar Technology Group Inc. (NASDAQ:SDA – Get...,Technology,SunCar Technology Group Inc. (NASDAQ:SDA–Get F...,186,Gap share $ NYSE NASDAQ
105059,773393,globenewswire,GlobeNewswire,"Magnite, Inc.",Magnite Research Highlights Opportunity brand ...,69% of TV viewers in Indonesia watch ad-suppor...,https://www.globenewswire.com/news-release/202...,http://example.com/placeholder.jpg,2023-11-29 04:00:00,"JAKARTA, Indonesia, Nov. 28, 2023 (GLOBE NEWSW...",Indonesia,"JAKARTA, Indonesia, Nov. 28, 2023 (GLOBE NEW...",17,$ say India year New
42753,62024,international-business-times,International Business Times,AFP News,heist Sea Shrimp Bandits terrorize Ecuador Far...,From robberies on land to hold-ups at sea by r...,https://www.ibtimes.com/heists-sea-shrimp-band...,https://d.ibtimes.com/en/full/4490992/shrimper...,2023-10-06 04:57:30.000000,From robberies on land to hold-ups at sea by r...,Ecuador,From robberies on land to hold-ups at sea by r...,17,$ say India year New
90389,504520,etf-daily-news,ETF Daily News,MarketBeat News,Corporación América Airports NYSE CAAP share G...,Corporación América Airports S.A. (NYSE:CAAP –...,https://www.etfdailynews.com/2023/11/18/corpor...,https://www.americanbankingnews.com/wp-content...,2023-11-18 13:36:45,Corporación América Airports S.A. (NYSE:CAAP –...,Europe,Corporación América Airports S.A. (NYSE:CAAP–G...,186,Gap share $ NYSE NASDAQ
46292,72258,deadline,Deadline,Rosy Cordero,Duncan Crabtree Ireland Latest Delay negotiati...,While walking the picket line Thursday outside...,https://deadline.com/2023/10/duncan-crabtree-i...,https://deadline.com/wp-content/uploads/2023/1...,2023-10-12 19:29:37.000000,While walking the picket line Thursday outside...,Ireland,While walking the picket line Thursday outside...,111,strike Ireland Healthcare Northern UAW
94543,522763,globenewswire,GlobeNewswire,Trifork Holding AG,23/2023・reporting transaction person discharge...,Company announcement no. 23 / 2023 Schindelle...,https://www.globenewswire.com/news-release/202...,https://ml-eu.globenewswire.com/Resource/Downl...,2023-11-20 18:42:00,Company announcement no. 23 / 2023\r\nSchindel...,Startups,Company announcement no. 23 / 2023 Schindelleg...,17,$ say India year New


In [159]:
# The number of events covered in the data
num_events = news_data_sample['event'].nunique()
print(f"Number of events: {num_events}")

Number of events: 500


In [162]:
# Analyse the news sites that report events the earliest
news_data_sample['date'] = pd.to_datetime(news_data_sample['published_at'])  # Ensure the date column is in datetime format
earliest_reporting = news_data_sample.sort_values('date').groupby('event').first()['source_name']
print("News sites that reported each event the earliest:")
print(earliest_reporting)

News sites that reported each event the earliest:
event
$ Edwards Lifesciences NYSE EW              ETF Daily News
$ Fund NYSE Dividend Monthly                ETF Daily News
$ Meta Platforms Inc. NASDAQ                ETF Daily News
$ New Low year hit                                     CNA
$ say India year New                                    RT
                                               ...        
way Israel level pave war                 Business Insider
well poverty kill Amazon rainforest       Business Insider
woman insult strike sexual assault        Business Insider
work teacher practice Remote Tools      The Indian Express
worker protest Bangladesh wage labor              BBC News
Name: source_name, Length: 500, dtype: object


In [164]:
# Events with the highest reporting
event_counts = news_data_sample['event'].value_counts()
print("Events with the highest reporting:")
print(event_counts)

Events with the highest reporting:
$ say India year New                                1709
new New say reveal bring                             208
support join Israel political benefit                100
Israel Gaza Hamas war say                             92
China Asia central say asian                          83
                                                    ... 
upward adjustment expectation                          2
palliative Osun IPAC suspend deputy                    2
Australia apologize thalidomide tragedy survivor       2
Innofactor Oyj Omien osakkeiden hankinta               2
Ignitis Group confirm winner MW                        2
Name: event, Length: 500, dtype: int64


In [166]:
# The correlation between news sites reporting events?
# Create a matrix where each cell represents the number of common events reported by a pair of news sites
correlation_matrix = pd.crosstab(news_data_sample['source_name'], news_data_sample['event'])
correlation_matrix = correlation_matrix.T.dot(correlation_matrix)
print("Correlation matrix of news sites reporting events:")
print(correlation_matrix)

Correlation matrix of news sites reporting events:
event                                 $ Edwards Lifesciences NYSE EW  \
event                                                                  
$ Edwards Lifesciences NYSE EW                                     9   
$ Fund NYSE Dividend Monthly                                      36   
$ Meta Platforms Inc. NASDAQ                                       9   
$ New Low year hit                                                72   
$ say India year New                                              96   
...                                                              ...   
way Israel level pave war                                          0   
well poverty kill Amazon rainforest                                0   
woman insult strike sexual assault                                 0   
work teacher practice Remote Tools                                 0   
worker protest Bangladesh wage labor                               0   

event       