In [12]:
import pandas as pd
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from textblob import TextBlob
import torch
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import string
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/huiyang.han/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
# Load the data
test_data = pd.read_csv('data/processed_articles.csv')
test_data['content'] = test_data['title'] + test_data['body']
test_data = test_data.drop(columns=['title','body'])
test_data['date'] = pd.to_datetime(test_data['date'])
test_data = test_data.sort_values(by='date')
test_data['content'] = test_data['content'].apply(lambda x: ' '.join(x.split(',')))

In [14]:
# create a list of stop words
stop_words = set(stopwords.words('english'))
stop_words_list = ['aviva','company','shenzhen','kate','euros','emirate','dhabi','metre','asia','europe','shanghai','nichola','roger','msci','iran','states','italy','united','china','usa','us','america','american','americans','chinese','china','russia','russian','putin','vladimir','trump','donald','biden','joe','ukraine','ukrainian','ukrainians','ukraines','ukraine','say','jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec','mon','tue','wed','thu','fri','sat','sun','monday','tuesday','wednesday','thursday','friday','saturday','sunday','january','february','march','april','may','june','july','august','september','october','november','december','today','yesterday','tomorrow','week','month','year','time','day','weekend','morning','afternoon','evening','night','news','new','news']
list_numbers = ['eoi','name','houthi','uae','euro','yen','instead','liga','len','nhl','one','two','three','four','five','six','seven','eight','nine','ten','hundred','thousand','million','billion','trillion','first','second','third','eighted','series','hong','kong','new','york','los','angeles','san','francisco','las','vegas','san','diego','san','jose']
country_list = ['myanmar','robert','lebanon','iivi','william','zalando','olympic','country','world','africa','China', 'United States of America', 'United Kingdom', 'France', 'Germany', 'Japan', 'Russia', 'Australia', 'Canada', 'India', 'Brazil', 'Italy', 'Spain', 'South Korea', 'Mexico', 'Netherlands', 'Switzerland', 'Sweden', 'Norway', 'Denmark', 'Finland', 'Greece', 'Ireland', 'Portugal', 'Poland', 'Ukraine', 'Romania', 'Belgium', 'Austria', 'Turkey', 'Saudi Arabia', 'United Arab Emirates', 'Iran', 'Iraq', 'Israel', 'Egypt', 'South Africa', 'Argentina', 'Venezuela', 'Thailand', 'Malaysia', 'Singapore', 'Indonesia', 'Philippines', 'Pakistan', 'Bangladesh', 'Nigeria', 'Kenya', 'Tanzania', 'Uganda']
list_append = ["whose", "german", "saidsign", "ceo","exar","chos","sme", "vietnam", "gsk", "mori", "queen", "threeyear","would", "come", "also", "could", "edit", "include","pitch", "Britain", "Indian", "collin", "koo", "skorea", "men", "koo", "hub","bbva", "korea", "inc", "btp", "ntpcs", "telecom", "omi","jen","andre", "spac", "sabadell","faa", "unicredit", "city", "georgia", "puma", "philip", "england", "tokyo", "announce", "safrica", "andrea"]
stop_words.update(stop_words_list)
stop_words.update(list_numbers)
stop_words.update(country_list)
stop_words.update(list_append)
test_data['content'] = test_data['content'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in (stop_words)]))
test_data['content'] = test_data['content'].apply(lambda x: ' '.join([word for word in x.split() if word.isalpha()]))
test_data['content'] = test_data['content'].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 2]))
test_data['content'] = test_data['content'].apply(lambda x: ' '.join([word for word in x.split() if word != 'nan']))
test_data['content_tokens'] = test_data['content'].apply(lambda x: x.split())

In [15]:
# delete the date with less than 60 samples, output the data which can be used for the recollection
start_date = '2021-01-01'
end_date = '2021-12-31'
test_data = test_data[(test_data['date'] >= start_date) & (test_data['date'] <= end_date)]
test_data
sample_counts = test_data['date'].value_counts()

# find the dates with more than 60 samples and keep them as the test data
valid_dates = sample_counts[sample_counts >= 30].index
test_data = test_data[test_data['date'].isin(valid_dates)]
test_data

Unnamed: 0,date,content,content_tokens
36018,2021-01-01,bomb cyclone batter alaskas aleutian island hu...,"[bomb, cyclone, batter, alaskas, aleutian, isl..."
36085,2021-01-01,pakistan inflation rate easkarachi pakistan pa...,"[pakistan, inflation, rate, easkarachi, pakist..."
36084,2021-01-01,australias batsman must grind foil india planm...,"[australias, batsman, must, grind, foil, india..."
36083,2021-01-01,saint clear end isolation ahead liverpool clas...,"[saint, clear, end, isolation, ahead, liverpoo..."
36082,2021-01-01,infectious covid variant find florida state of...,"[infectious, covid, variant, find, florida, st..."
...,...,...,...
36652,2021-12-30,french town record covid case hinder drivemeau...,"[french, town, record, covid, case, hinder, dr..."
36618,2021-12-30,booster slash omicron safrican studycape town ...,"[booster, slash, omicron, safrican, studycape,..."
36619,2021-12-30,dow close still poise big annual gainsummarydo...,"[dow, close, still, poise, big, annual, gainsu..."
36614,2021-12-30,insight woman force change indian iphone plant...,"[insight, woman, force, change, indian, iphone..."


In [16]:
# Define parameter search space
vector_sizes = [70, 80, 90]
windows = [ 6,7, 8]
min_counts = [2,3]  
Ns = [10,15,20]

best_silhouette_params = None
best_silhouette_score = -1

best_ch_params = None
best_ch_score = -1

for vector_size in vector_sizes:
    for window in windows:
        for min_count in min_counts:
            for N in Ns:
                model = Word2Vec(sentences=test_data['content_tokens'], vector_size=vector_size, window=window, min_count=min_count, workers=8)
                
                corpus = test_data['content'].tolist()
                vectorizer = TfidfVectorizer()
                tfidf_matrix = vectorizer.fit_transform(corpus)
                word2tfidf = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))
            
                test_data['important_words'] = test_data['content_tokens'].apply(
                    lambda tokens: sorted(
                        set(tokens),  # Remove duplicates
                        key=lambda x: word2tfidf.get(x, 0),  # Rank by TF-IDF score
                        reverse=False  # Lower TF-IDF means higher frequency in this case
                    )[:N]
                )
                
                def text_to_vector(text, model, word2tfidf):
                    vectors = []
                    weights = []
                    for word in text:
                        if word in model.wv:
                            vectors.append(model.wv[word])
                            weights.append(word2tfidf.get(word, 1.0))
                    if not vectors:
                        return np.zeros(model.vector_size)
                    vectors = np.array(vectors)
                    weights = np.array(weights) / sum(weights)
                    return np.average(vectors, axis=0, weights=weights)
                
                test_data['vector'] = test_data['important_words'].apply(lambda x: text_to_vector(x, model, word2tfidf))
                
                X = np.array(test_data['vector'].tolist())
                num_clusters = 2
                kmeans = KMeans(n_clusters=num_clusters, random_state=42)
                labels = kmeans.fit_predict(X)
                test_data['cluster_label'] = labels
                
                if len(set(labels)) > 1:
                    silhouette = silhouette_score(X, labels)
                    ch_score = calinski_harabasz_score(X, labels)
                    
                    if silhouette > best_silhouette_score: 
                        best_silhouette_score = silhouette
                        best_silhouette_params = (vector_size, window, min_count, N)
                    
                    if ch_score > best_ch_score:
                        best_ch_score = ch_score
                        best_ch_params = (vector_size, window, min_count, N)
                        
print(f"Best Silhouette parameters: Vector Size={best_silhouette_params[0]}, Window={best_silhouette_params[1]}, Min Count={best_silhouette_params[2]}, N={best_silhouette_params[3]}")
print(f"Best Calinski-Harabasz parameters: Vector Size={best_ch_params[0]}, Window={best_ch_params[1]}, Min Count={best_ch_params[2]}, N={best_ch_params[3]}")

Best Silhouette parameters: Vector Size=70, Window=6, Min Count=3, N=20
Best Calinski-Harabasz parameters: Vector Size=90, Window=8, Min Count=3, N=20
