In [None]:
# !pip install emoji

In [None]:
import pandas as pd
import numpy as np
import os
import re
import string
import nltk
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# !pip uninstall certifi -y
# !pip install --no-cache-dir certifi==2025.7.14

from textacy import preprocessing as tprep
from tqdm import tqdm
tqdm.pandas()


pd.options.display.max_rows = 15


In [None]:
# Article DataFrame
# NOTE(review): absolute, user-specific path — this cell will not run on another machine.
article_df = pd.read_csv('/Users/amalkurian/Desktop/Dissertation/Bias Detection/Deliverables/matching_articles04.csv')
print(f'List all the columns{list(article_df.columns)}')
# Events DataFrame
# NOTE(review): absolute, user-specific path — see the note on the article CSV.
events_df = pd.read_csv('/Users/amalkurian/Desktop/Dissertation/Bias Detection/diffbot-export-tigray-war.csv')
print(f'List all the columns{list(events_df.columns)}')

In [None]:
article_df['Actions'][0]

In [None]:
article_df['Load_Date'] = pd.to_datetime(article_df['Load_Date'])

In [None]:
import hashlib

def generate_event_id(row):
    """Return a deterministic SHA-256 hex digest identifying an event row.

    The digest is taken over the row's ``date_str``, ``title`` and ``author``
    values, formatted as text and joined with underscores, encoded as UTF-8.
    """
    key_parts = (row['date_str'], row['title'], row['author'])
    fingerprint = "_".join(f"{part}" for part in key_parts)
    hasher = hashlib.sha256()
    hasher.update(fingerprint.encode('utf-8'))
    return hasher.hexdigest()


events_df['event_id'] = events_df.progress_apply(generate_event_id, axis=1)



In [None]:
events_df.tail()

In [None]:
import spacy
from tqdm.auto import tqdm

nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Run the spaCy pipeline ``nlp`` on ``text`` and return its named entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in document order.
    """
    pairs = []
    for span in nlp(text).ents:
        pairs.append((span.text, span.label_))
    return pairs

# Events Dataframe: Extracting Entities using Spacy
# Character count has to be less than 1 million to avoid memory issues

def safe_extract(text):
    """Extract entities from ``text`` only when it is safe to do so.

    Non-string values and strings of one million characters or more yield an
    empty list; everything else is passed to ``extract_entities``.
    """
    if not isinstance(text, str):
        return []
    if len(text) >= 1_000_000:
        return []
    return extract_entities(text)


events_df['entities'] = events_df['text'].progress_apply(safe_extract)
# Explode the entities For Events Dataframe: one row per (event_id, entity)
events_entities_long = events_df[['event_id', 'entities']].explode('entities')
# Extract entity text and label.
# An event with no entities (safe_extract returned []) explodes to a single NaN
# row. Building a two-column DataFrame from a mix of tuples and NaN fails, so each
# cell is split explicitly and non-tuple cells become NaN in both columns.
events_entities_long['entity_text'] = events_entities_long['entities'].map(
    lambda ent: ent[0] if isinstance(ent, (tuple, list)) else np.nan
)
events_entities_long['label'] = events_entities_long['entities'].map(
    lambda ent: ent[1] if isinstance(ent, (tuple, list)) else np.nan
)


In [None]:
events_df.columns

In [None]:
# Collapse the long entity table back to one row per event: every entity text and
# every label of an event is gathered into a list, in row order.
grouped_entities = (events_entities_long
        .groupby('event_id')
        .agg({
            'entity_text': lambda x: list(x),
            'label': lambda x: list(x)
        })
        .rename(columns={
            'entity_text': 'entities_Group', 'label': 'labels_Group'})
        )

# Attach the grouped lists to events_df; events without a group get NaN (left join).
# NOTE(review): re-running this cell merges the same column names again, and
# pandas will then suffix the clashing columns with _x / _y.
events_df = events_df.merge(grouped_entities, on='event_id', how='left')


In [None]:
# NOTE(review): no visible cell creates a 'label_Group_y' column (the merge before
# this adds 'labels_Group'), so this rename looks like a leftover from an earlier
# run; DataFrame.rename silently ignores labels that are not present.
events_df = events_df.rename(columns = {'label_Group_y': 'label_Group'})

In [None]:
# Format the date extracted into DateTime format %Y-%m-%d
# NOTE(review): not re-runnable as is — after this cell 'date_str' is datetime64,
# so the .str calls below raise on a second execution.

# Drop a single leading 'd' character from each raw date string, if present.
events_df['date_str'] = (
    events_df['date_str']
    .str.replace(r'^d','',regex=True)
)

# Keep only the text before the first 'T' (presumably the date part of a timestamp).
events_df['date_str'] = events_df['date_str'].str.split('T').str[0] # str is an accessor for string operations on Series
# errors='coerce': values that do not match %Y-%m-%d become NaT instead of raising.
events_df['date_str'] = pd.to_datetime(events_df['date_str'], format='%Y-%m-%d', errors='coerce')
events_df['date_str'].head()

In [None]:
from collections import Counter
import ast

def flatten_entities(col):
    """Collect the entity texts of interest from a column of entity lists.

    Args:
        col: iterable whose items are lists of ``(text, label)`` pairs; items
            that are not lists (e.g. NaN) are ignored.

    Returns:
        A flat list with the text of every two-element pair whose label is one
        of ORG, GPE, PERSON, NORP or LOC, in encounter order (duplicates kept).
    """
    wanted_labels = {"ORG", "GPE", "PERSON", "NORP", "LOC"}
    return [
        pair[0]
        for cell in col
        if isinstance(cell, list)  # skip NaNs or other non-list cells
        for pair in cell
        if isinstance(pair, (list, tuple)) and len(pair) == 2 and pair[1] in wanted_labels
    ]

# Flatten the entities and count occurrences for Article Dataframe
# The CSV stores each entity list as its string repr, so parse it back. Only
# strings are parsed: missing values (NaN) and already-parsed lists pass through
# untouched, which keeps the cell re-runnable (literal_eval raises on non-strings).
article_df['entities'] = article_df['entities'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)
all_entities = flatten_entities(article_df['entities'])

# Flatten the entities and count occurrences for Events Dataframe
all_events_entities = flatten_entities(events_df['entities'])

# entity text -> number of mentions across the whole corpus
entity_counter = Counter(all_entities)
entity_events_counter = Counter(all_events_entities)

In [None]:
print(article_df['entities'].head(5))

In [None]:
# Entities mentioned at least `threshold` times, as (entity, count) pairs ordered
# from most to least frequent — first for the articles, then for the events.
threshold = 10
raw_ethiopia_entities = [
    (entity, count)
    for entity, count in entity_counter.most_common()
    if count >= threshold
]
raw_events_entities = [
    (entity, count)
    for entity, count in entity_events_counter.most_common()
    if count >= threshold
]

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

def mapping_events_entities(raw_entities):
    """Map every raw entity name to a canonical name shared by its near-duplicates.

    Names are lower-cased, embedded with the 'all-MiniLM-L6-v2' sentence
    transformer and grouped by complete-linkage agglomerative clustering on
    cosine distance (distance threshold 0.3). Within each cluster the name with
    the highest corpus frequency becomes the canonical form.

    Args:
        raw_entities: sequence of ``(entity, count)`` pairs. Entries without a
            count are given a frequency of 1.

    Returns:
        A list of ``(lower_cased_entity, canonical_entity)`` tuples, one per
        input entry and in input order.
    """
    events_entities = [e[0].lower() for e in raw_entities]

    # Total frequency per lower-cased name ("Ethiopia" and "ETHIOPIA" add up).
    frequency = Counter()
    for raw, name in zip(raw_entities, events_entities):
        frequency[name] += raw[1] if len(raw) > 1 else 1

    unique_entities = list(frequency)  # first-seen order, no duplicates
    if len(unique_entities) < 2:
        # Nothing to cluster, and AgglomerativeClustering rejects fewer than two
        # samples: every name is its own canonical form.
        return [(ent, ent) for ent in events_entities]

    model = SentenceTransformer('all-MiniLM-L6-v2')

    print('Encoding Entities...')
    events_embeddings = model.encode(unique_entities, show_progress_bar = True)
    events_similarity_matrix = cosine_similarity(events_embeddings)

    clustering = AgglomerativeClustering(
        n_clusters = None,
        metric = 'precomputed',
        linkage = 'complete',
        distance_threshold = 0.3)

    # Cosine distance. Floating-point rounding can push values a hair below zero
    # (e.g. on the diagonal), so clip them; float64 is what the clusterer gets.
    events_distance_matrix = np.clip(1 - events_similarity_matrix, 0.0, None).astype(np.float64)
    # labels[i] is the cluster id of unique_entities[i]
    labels = clustering.fit_predict(events_distance_matrix)

    events_cluster_df = pd.DataFrame({'entity': unique_entities, 'cluster': labels})
    events_canonical_entities = {}

    for _cluster_id, group in events_cluster_df.groupby('cluster'):
        # Choose the most common name as canonical, using the real corpus counts.
        # Ties keep the name that appeared first in raw_entities.
        canonical = max(group['entity'], key=frequency.__getitem__)
        for ent in group['entity']:
            events_canonical_entities[ent] = canonical

    return [(ent, events_canonical_entities[ent]) for ent in events_entities]


# Generate the mapping for events entities
mapping_lookup_events_entities = mapping_events_entities(raw_events_entities)
for original, canonical in mapping_lookup_events_entities[:500]:
    print(f"{original} → {canonical}")

In [None]:

def extract_keywords(text):
    """Extract multi-word noun phrases and non-stop-word verb lemmas from ``text``.

    Args:
        text: string to run through the spaCy pipeline ``nlp``.

    Returns:
        A tuple ``(keyphrases, verbs)`` of two lists: the lower-cased noun chunks
        that contain more than one whitespace-separated word, and the lemmas of
        tokens tagged VERB that are not NLTK English stop words. Both lists are
        de-duplicated; their order is unspecified.
    """
    doc = nlp(text)
    keyphrases = {
        chunk.text.lower()
        for chunk in doc.noun_chunks
        if len(chunk.text.split()) > 1  # Only consider multi-word phrases
    }

    # Build the stop-word set once per call instead of once per token, and use a
    # set so each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    verbs = {
        token.lemma_
        for token in doc
        if token.pos_ == 'VERB' and token.lemma_ not in stop_words
    }
    return list(keyphrases), list(verbs)

# Fill 'Key_Phrases' and 'Actions' from each article's cleaned text; non-string
# cells (e.g. NaN) get two empty lists.
article_df[['Key_Phrases', 'Actions']] = article_df['cleaned_content'].progress_apply(lambda x: pd.Series(extract_keywords(x) if isinstance(x, str) else ([], [])))


In [None]:
article_df['Actions'][0]

In [None]:


raw_ethiopia_entities

In [None]:
mapping_lookup_articles_entities = mapping_events_entities(raw_ethiopia_entities)
for original, canonical in mapping_lookup_articles_entities[:500]:
    print(f"{original} → {canonical}")

In [None]:
mapped_articles_entities_dictionary = dict(mapping_lookup_articles_entities)
mapped_events_entities_dictionary = dict(mapping_lookup_events_entities)

In [None]:
# Normalize the entities in the articles and events Dataframe
# !pip install rapidfuzz

from rapidfuzz import process

def normalize_entity(ent, entity_mapping, threshold=90): # for spelling mistakes for canonical entities
    """Return the canonical form of ``ent`` according to ``entity_mapping``.

    Args:
        ent: entity text. Non-strings and blank strings are returned unchanged.
        entity_mapping: dict from lower-cased entity name to canonical name.
        threshold: minimum rapidfuzz score (0-100) a fuzzy match needs to be
            accepted.

    Returns:
        The canonical name of the exact (case-insensitive) match, otherwise that
        of the best fuzzy match scoring at least ``threshold``, otherwise ``ent``
        itself.
    """
    if not isinstance(ent, str) or not ent.strip():
        return ent
    if not entity_mapping:
        return ent

    key = ent.lower()
    # Exact hit: no fuzzy search needed.
    if key in entity_mapping:
        return entity_mapping[key]

    # score_cutoff enforces `threshold`; extractOne returns None when no
    # candidate reaches it, so unrelated names are no longer forced onto the
    # nearest mapping key.
    best = process.extractOne(key, entity_mapping.keys(), score_cutoff=threshold)
    if best is None:
        return ent
    match = best[0]
    return entity_mapping[match]


# Events Entities Normalization
normalized_events_entities = [normalize_entity(ent,mapped_events_entities_dictionary) for ent,_ in raw_events_entities]
normalized_events_entities[:50]  # Display first 50 normalized entities
# Normalize the entities in the articles 
normalized_articles_entities = [normalize_entity(ent,mapped_articles_entities_dictionary) for ent,_ in raw_ethiopia_entities]
normalized_articles_entities[:50]  # Display first 50 normalized entities

In [None]:
# Canonicalise every extracted event entity, row by row; NaN entity texts are
# returned unchanged by normalize_entity.
events_entities_long['normalized'] = events_entities_long.progress_apply(lambda x: normalize_entity(x['entity_text'], mapped_events_entities_dictionary), axis=1)

In [None]:

# Explode the entities: one row per (article_id, entity)
entities_long = article_df[['article_id', 'entities']].explode('entities')

# Extract entity text and label.
# An article whose entity list is empty or missing explodes to a NaN row.
# Building a two-column DataFrame from a mix of tuples and NaN fails, so each
# cell is split explicitly and non-tuple cells become NaN in both columns.
entities_long['entity_text'] = entities_long['entities'].map(
    lambda ent: ent[0] if isinstance(ent, (tuple, list)) else np.nan
)
entities_long['label'] = entities_long['entities'].map(
    lambda ent: ent[1] if isinstance(ent, (tuple, list)) else np.nan
)
entities_long.head()



In [None]:
# Canonicalise every extracted article entity, row by row; NaN entity texts are
# returned unchanged by normalize_entity.
entities_long['normalized'] = entities_long.progress_apply(lambda x: normalize_entity(x['entity_text'], mapped_articles_entities_dictionary), axis = 1)

In [None]:
# Entity labels treated as actors and as places.
actor_types = ['PERSON', 'ORG', 'NORP']
geo_types = ['GPE', 'LOC']

# One set of normalised actor names per article.
actor_df = (
    entities_long[entities_long['label'].isin(actor_types)]
    .groupby('article_id')['normalized']
    .apply(set)
    .reset_index(name='actor_entities')
)
# One set of normalised place names per article.
geo_df = (
    entities_long[entities_long['label'].isin(geo_types)]
    .groupby('article_id')['normalized']
    .apply(set)
    .reset_index(name='geo_entities')
)

# Outer join keeps articles that have only actors or only places.
merged_df = actor_df.merge(geo_df, on='article_id', how='outer')
# Attach both sets to the article table; articles without entities get NaN.
article_df = article_df.merge(merged_df, on='article_id', how='left')


In [None]:
# Entity labels treated as actors and as places.
actor_types = ['PERSON', 'ORG', 'NORP']
geo_types = ['GPE', 'LOC']

# One set of normalised actor names per event.
actor_df = (
    events_entities_long[events_entities_long['label'].isin(actor_types)]
    .groupby('event_id')['normalized']
    .apply(set)
    .reset_index(name='actor_entities')
)

# One set of normalised place names per event.
geo_df = (
    events_entities_long[events_entities_long['label'].isin(geo_types)]
    .groupby('event_id')['normalized']
    .apply(set)
    .reset_index(name='geo_entities')
)

# Outer join keeps events that have only actors or only places.
merged_df = actor_df.merge(geo_df, on='event_id', how='outer')

# Attach both sets to the event table; events without entities get NaN.
events_df = events_df.merge(merged_df, on='event_id', how='left')


In [None]:
article_df.columns

In [None]:
events_df.columns 


In [None]:
# Filter events for each article based on the date range

from datetime import timedelta


def filter_events_for_article(Article_date):
    """List the ids of events dated close to an article.

    An event qualifies when its ``date_str`` lies between 27 days before and
    7 days after ``Article_date``, both bounds included. Events are read from
    the module-level ``events_df``.

    Returns:
        A list of ``event_id`` values, in ``events_df`` row order.
    """
    window_start = Article_date - timedelta(days=27)
    window_end = Article_date + timedelta(days=7)
    in_window = events_df['date_str'].between(window_start, window_end)
    return events_df.loc[in_window, 'event_id'].tolist()


# Make sure the article date is datetime64 before doing date arithmetic on it.
article_df['Load_Date'] = pd.to_datetime(article_df['Load_Date'])
article_df['Load_Date'].head()

# For every article, the list of event ids whose date falls in its time window.
article_df['matching_events'] = article_df['Load_Date'].progress_apply(filter_events_for_article)


In [None]:
events_df.columns

In [None]:
article_df.columns

# Entity Similarity Scores

In [None]:
def safe_set(x):
    """Coerce ``x`` into a set of entities, whatever form it arrives in.

    Handles:
        * ``set`` — returned as is; ``frozenset`` / ``list`` / ``tuple`` — converted;
        * missing scalars (``None``, NaN, ``pd.NA``, ``NaT``) and other falsy
          values — empty set;
        * strings — parsed as a Python literal when possible: a collection
          literal becomes a set of its items, a quoted string becomes a
          one-element set. Anything else (unparsable text such as
          ``"{a, b}"``, or a numeric literal) is split on commas after
          removing surrounding braces;
        * any other value — wrapped in a one-element set.
    """
    if isinstance(x, set):
        return x
    if isinstance(x, (frozenset, list, tuple)):
        return set(x)
    # pd.isna on a list-like returns an array, whose truth value is ambiguous,
    # so only ask it about scalars.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return set()
    if isinstance(x, str):
        text = x.strip()
        if not text:
            return set()
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (set, frozenset, list, tuple, dict)):
            return set(parsed)
        if isinstance(parsed, str):
            # A quoted literal is one entity, not a bag of characters.
            return {parsed} if parsed else set()
        # Fallback: split by commas if parsing fails or yields a bare scalar
        return set(e.strip() for e in text.strip('{}').split(',') if e.strip())
    if not x:
        return set()
    return set([x])

def compute_entity_similarity(article_actors, article_geos, event_actors, event_geos):
    """Score the entity overlap between an article and an event.

    Each argument is coerced with ``safe_set``. For actors and for places the
    score is |intersection| / |union| of the article and event sets, or 0 when
    the event set is empty.

    Returns:
        ``(actor_score, geo_score)``.
    """
    def _overlap(article_side, event_side):
        # No event entities means nothing to match against.
        if not event_side:
            return 0
        return len(article_side & event_side) / len(event_side | article_side)

    # Sanitize inputs
    actor_sets = (safe_set(article_actors), safe_set(event_actors))
    geo_sets = (safe_set(article_geos), safe_set(event_geos))

    return _overlap(*actor_sets), _overlap(*geo_sets)

# One record per (article, candidate event) pair with its entity-overlap scores.
similarity_scores = []

# Convert events_df into a quick lookup dict for speed
event_lookup = {
    row['event_id']: row
    for _, row in events_df.iterrows()
}

for _, article_row in article_df.iterrows():
    article_id = article_row['article_id']
    a_actors = article_row['actor_entities']
    a_geos = article_row['geo_entities']

    # Get pre-filtered event IDs for this article
    matching_events = article_row.get('matching_events', [])

    for event_id in matching_events:
        # Defensive check: skip if event_id not in lookup
        if event_id not in event_lookup:
            continue

        event_row = event_lookup[event_id]
        e_actors = event_row['actor_entities']
        e_geos = event_row['geo_entities']

        # Compute similarity (the per-pair debug print was removed: it emitted
        # one output line for every article-event pair)
        actor_score, geo_score = compute_entity_similarity(a_actors, a_geos, e_actors, e_geos)
        similarity_scores.append({
            'article_id': article_id,
            'event_id': event_id,
            'actor_score': actor_score,
            'geo_score': geo_score,
            'combined_score': (actor_score + geo_score) / 2  # or weighted
        })



In [None]:
to_find_combined_score = pd.DataFrame(similarity_scores)

In [None]:
to_find_combined_score

In [None]:
# With Cosine Similarity

from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize
import nltk
import torch
import pandas as pd

#nltk.download('punkt')

model = SentenceTransformer("BAAI/bge-base-en-v1.5")  # or 'bge-large-en-v1.5'

# article_df['cleaned_content']
# article_df['matching_events'] = list of event_ids within same time window
# events_df['event_id'], events_df['text']


# Event embedding index (for fast lookup): event_id -> title embedding tensor
events_df['title'] = events_df['title'].fillna('').astype(str)
# Encode all titles in one call so batch_size=32 takes effect; encoding one title
# per call runs the model once per event. Rows of the result follow the input
# order, so zipping them with the ids keeps every pair aligned.
# NOTE(review): batched encoding may differ from one-by-one encoding by
# floating-point noise.
if len(events_df):
    event_title_embeddings = model.encode(
        events_df['title'].tolist(),
        convert_to_tensor=True, batch_size=32, show_progress_bar=False
    )
    events_id_to_vector = dict(zip(events_df['event_id'], event_title_embeddings))
else:
    events_id_to_vector = {}

# --- Function to embed and average article sentences
def embed_article_sentences(text):
    """Embed ``text`` as the mean of its sentence embeddings.

    Args:
        text: text to embed. Non-string values (e.g. a NaN title) are treated
            as empty text instead of crashing the sentence tokenizer.

    Returns:
        A 1-D tensor of the model's embedding size: the mean over the sentence
        embeddings, or all zeros when there is no sentence to embed.
    """
    sentences = sent_tokenize(text) if isinstance(text, str) else []
    if not sentences:
        # NOTE(review): created on the default (CPU) device — confirm this
        # matches the device of the encoded event tensors.
        return torch.zeros(model.get_sentence_embedding_dimension())
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True, batch_size=32, show_progress_bar=False)
    return torch.mean(sentence_embeddings, dim=0) # gets you a single embedding for the whole paragraph by finding the mean

# compare only against its matching_events
article_match_scores = []
article_match_ids = []

for idx, row in article_df.iterrows():
    matching_event_ids = row.get('matching_events', [])
    if not isinstance(matching_event_ids, (list, tuple)):
        matching_event_ids = []

    # Keep only the events that have a vector, and remember exactly those ids.
    # Scores and ids must stay the same length and order, otherwise the later
    # multi-column explode mismatches or fails.
    known_event_ids = [eid for eid in matching_event_ids if eid in events_id_to_vector]

    if not known_event_ids:
        article_match_scores.append(None)
        article_match_ids.append(None)
        continue

    article_vec = embed_article_sentences(row['title'])

    # create a 2D matrix for cosine similarity, it won't work on a list
    event_tensor = torch.stack([events_id_to_vector[eid] for eid in known_event_ids])

    # Cosine similarity
    sim_scores = util.cos_sim(article_vec, event_tensor).squeeze(0)  # Shape: [num_events]

    article_match_scores.append(sim_scores.tolist())
    article_match_ids.append(known_event_ids)

# Attach to DataFrame
article_df['match_score_Title'] = article_match_scores
article_df['event_similarity_id_Title'] = article_match_ids


In [None]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Fill NaNs
# article_df['title'] = article_df['title'].fillna('').astype(str)
# events_df['title'] = events_df['title'].fillna('').astype(str)

# # Keep mapping from event_id to title
# event_id_to_title = events_df.set_index('event_id')['title'].to_dict()

# # TF-IDF vectorizer
# vectorizer = TfidfVectorizer(stop_words='english')

# # Combine all relevant titles for fitting
# all_titles = list(article_df['title'].unique()) + list(events_df['title'].unique())
# vectorizer.fit(all_titles)

# # 1️⃣ Embed article titles
# article_tfidf = vectorizer.transform(article_df['title'].tolist())

# # Prepare storage
# match_scores = []
# match_ids = []

# # 2️⃣ Loop over articles
# for idx, row in article_df.iterrows():
#     matching_event_ids = row.get('matching_events', [])
#     if not matching_event_ids:
#         match_scores.append(None)
#         match_ids.append(None)
#         continue

#     # Get titles for the matching events
#     event_titles = [event_id_to_title[eid] for eid in matching_event_ids if eid in event_id_to_title]
#     if not event_titles:
#         match_scores.append(None)
#         match_ids.append(None)
#         continue

#     event_tfidf = vectorizer.transform(event_titles)

#     # Cosine similarity
#     article_vec = article_tfidf[idx]
#     sim_scores = cosine_similarity(article_vec, event_tfidf).flatten()
    
#     match_scores.append(sim_scores.tolist())
#     match_ids.append([matching_event_ids[i] for i, _ in enumerate(event_titles)])

# # Attach to DataFrame
# article_df['match_score_Title'] = match_scores
# article_df['event_similarity_id_Title'] = match_ids

# print(article_df.head())


In [None]:
# article_df = article_df.drop(columns = {'match_score_Title', 'event_similarity_id_Title'})

In [None]:
article_df.columns

In [None]:
# Explode article_df lists into one row per event_id and score
# NOTE(review): `exploded` is not referenced by any later visible cell; the cell
# that builds `exploded_article` repeats this work with a different column name.
exploded = article_df.explode(['event_similarity_id_Title', 'match_score_Title']).rename(columns={
    'event_similarity_id_Title': 'event_id',
    'match_score_Title': 'match_score'
})


In [None]:
to_find_combined_score

In [None]:
# Explode the lists into rows for clean merging: one row per (article, event)
# with that event's title-similarity score.
# NOTE(review): the temporary name 'event_id_2' presumably avoids clashing with
# an 'event_id' column already present in article_df — confirm against the CSV.
exploded_article = article_df.explode(['event_similarity_id_Title', 'match_score_Title']).rename(columns={
    'event_similarity_id_Title': 'event_id_2',
    'match_score_Title': 'match_score'
})

In [None]:
# exploded_article = exploded_article.rename(columns = {
#     'event_id': 'event_id_old',
# })

# exploded_article.columns

In [None]:
# Give the exploded id column its final name.
# NOTE(review): if exploded_article already has an 'event_id' column this
# produces two columns with the same label — check the column list shown below.
exploded_article = exploded_article.rename(columns = {
    'event_id_2': 'event_id',
})

exploded_article.columns

In [None]:
# # Keep only the first occurrence of each column name
# exploded_article = exploded_article.loc[:, exploded_article.columns.duplicated()]

# print(exploded_article.head())



In [None]:
to_find_combined_score.columns

In [None]:
cols = exploded_article.columns.tolist()
# Only rename when an older 'match_score' column sits next to the freshly
# exploded one. With a single 'match_score', renaming it would remove the column
# the following merge selects; with none, list.index would raise. Guarding also
# makes the cell safe to re-run.
if cols.count('match_score') > 1:
    first_idx = cols.index('match_score')
    # Rename the first one
    cols[first_idx] = 'match_score_old'
    exploded_article.columns = cols


In [None]:


# Merge on both article_id and event_id: add each pair's title-similarity score
# ('match_score') to its entity scores. Pairs without a title score keep NaN
# (left join).
merged_df = pd.merge(
    to_find_combined_score,
    exploded_article[['article_id', 'event_id', 'match_score']],
    on=['article_id', 'event_id'],
    how='left'
)

print(merged_df.head(2))


In [None]:
exploded_article.columns

In [None]:
merged_df

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import math

# Convert to datetime if needed
article_df['Load_Date'] = pd.to_datetime(article_df['Load_Date'])
events_df['date_str'] = pd.to_datetime(events_df['date_str'])

# Params from the paper
# alpha_*: score at zero distance; lambda_: decay rate; max_*_days: number of
# days used to normalise the distance in each direction.
# NOTE(review): 30 / 10 days here differ from the 27 / 7 day window used when
# selecting candidate events — confirm that is intentional.
alpha_h = 1.0  # for historical
alpha_p = 0.8  # for predictive
lambda_ = 0.8
max_past_days = 30
max_future_days = 10

# Convert events_df to a dict for fast lookup
event_date_lookup = dict(zip(events_df['event_id'], events_df['date_str']))

# Function to calculate temporal proximity 
# Distance from 0 is considered over negative or positive value
def compute_exp_temporal_score(pub_date, event_date):
    """Exponentially decaying closeness between an article and an event date.

    The day difference is ``event_date - pub_date``. A positive difference
    (event after the article, "predictive") is scaled by ``max_future_days`` and
    weighted by ``alpha_p``; otherwise ("historical") the absolute difference is
    scaled by ``max_past_days`` and weighted by ``alpha_h``. The score is
    ``alpha * exp(-lambda_ * scaled_distance)``, rounded to 4 decimals.
    """
    delta_days = (event_date - pub_date).days
    is_predictive = delta_days > 0

    amplitude = alpha_p if is_predictive else alpha_h
    horizon = max_future_days if is_predictive else max_past_days
    distance = delta_days if is_predictive else abs(delta_days)

    decay = distance / horizon
    return round(amplitude * np.exp(-lambda_ * decay), 4)

def compute_log_temporal_score(pub_date, event_date, past_horizon=27, future_horizon=7):
    """Logarithmically decaying closeness between an article and an event date.

    The score is ``1 - log(d + 1) / log(H)`` clamped at 0, where ``d`` is the
    absolute day difference and ``H`` the horizon for the direction of the
    difference. It is 1.0 for a same-day event and reaches 0.0 at ``d = H - 1``.

    Two defects of the previous version are fixed here:
      * the sign test ran on the absolute difference, so the direction was lost
        and every non-zero difference used one horizon;
      * the formula returned 0 for every pair inside the horizon and negative
        values beyond it.
    NOTE(review): the formula and the direction-to-horizon mapping (event after
    the article -> ``future_horizon``) are a reconstruction that mirrors the
    27-days-back / 7-days-ahead candidate window — confirm against the paper.

    Args:
        pub_date: article publication date.
        event_date: event date.
        past_horizon: horizon in days for events on or before the article date.
        future_horizon: horizon in days for events after the article date.

    Returns:
        A score in [0, 1], rounded to 4 decimals.
    """
    signed_days = (event_date - pub_date).days  # Positive if event after article
    H = future_horizon if signed_days > 0 else past_horizon
    if H <= 1:
        # log(H) would be zero or undefined; no meaningful decay scale.
        return 0.0

    delta_days = abs(signed_days)
    score = 1.0 - math.log(delta_days + 1) / math.log(H)
    return round(max(score, 0.0), 4)

# Apply to all article-event pairs
def compute_temporal_scores(row):
    """Compute both temporal scores for every candidate event of an article row.

    Reads the row's ``Load_Date`` and ``matching_events``. Event dates come from
    ``event_date_lookup``, and events with a missing date are skipped.

    Returns:
        ``{'exp': {event_id: score}, 'log': {event_id: score}}``.
    """
    pub_date = row['Load_Date']
    scores = {'exp': {}, 'log': {}}

    for eid in row.get('matching_events', []):
        event_date = event_date_lookup.get(eid)
        if pd.isna(event_date):
            continue
        scores['exp'][eid] = compute_exp_temporal_score(pub_date, event_date)
        scores['log'][eid] = compute_log_temporal_score(pub_date, event_date)

    return scores

# Add temporal proximity scores to article_df
article_df['temporal_scores'] = article_df.apply(compute_temporal_scores, axis=1)


In [None]:
merged_df.columns

In [None]:
# Step 1: Merge the temporal_scores into the main df
merged_with_temp = merged_df.merge(article_df[['article_id', 'temporal_scores']], on='article_id', how='left')



In [None]:
def extract_exp(row):
    """Return the exponential temporal score of the row's own event.

    Looks up ``row['event_id']`` in the ``'exp'`` dictionary of
    ``row['temporal_scores']``; NaN when either is absent.
    """
    exp_scores = row['temporal_scores'].get('exp', {})
    event_id = row['event_id']
    if event_id in exp_scores:
        return exp_scores[event_id]
    return np.nan

merged_with_temp['temporal_score_exp'] = merged_with_temp.apply(extract_exp, axis=1)

In [None]:
merged_with_temp = merged_with_temp.drop(columns=['temporal_scores'])
merged_with_temp

In [None]:
merged_with_temp['temporal_score_exp'].max()

In [None]:
merged_with_temp.columns

In [None]:
import itertools


def normalize_score(score, min_val, max_val):
    """Min-max scale ``score`` using ``min_val`` and ``max_val``.

    Returns 0 when ``max_val`` is not strictly greater than ``min_val``.
    """
    if max_val > min_val:
        span = max_val - min_val
        return (score - min_val) / span
    return 0

def combined_score(row, weights, min_vals, max_vals):
    """Weighted sum of a row's min-max normalised entity, temporal and text scores.

    Args:
        row: mapping with ``combined_score``, ``temporal_score_exp`` and
            ``match_score``; a missing key counts as 0.
        weights: weights under the keys ``semantic``, ``entity``, ``temporal``.
        min_vals, max_vals: normalisation bounds under the keys
            ``entity_similarity``, ``temporal_exp_score``, ``text_similarity``.
    """
    def _scaled(column, bounds_key):
        return normalize_score(row.get(column, 0), min_vals[bounds_key], max_vals[bounds_key])

    entity_part = _scaled('combined_score', 'entity_similarity')
    temporal_part = _scaled('temporal_score_exp', 'temporal_exp_score')
    semantic_part = _scaled('match_score', 'text_similarity')

    total = weights['semantic'] * semantic_part
    total = total + weights['entity'] * entity_part
    total = total + weights['temporal'] * temporal_part
    return total


def classify_match(score, threshold=0.45):
    """Label a final score "Valid" when it reaches ``threshold``, else "Invalid"."""
    if score >= threshold:
        return "Valid"
    return "Invalid"


# Relative importance of each signal in the final score (sums to 1.0).
weights = {
    'semantic': 0.4,
    'entity': 0.4,
    'temporal': 0.2
}

# Force the three score columns to numeric; unparsable values become NaN.
cols_to_numeric = ['match_score', 'combined_score', 'temporal_score_exp']
for col in cols_to_numeric:
    merged_with_temp[col] = pd.to_numeric(merged_with_temp[col], errors='coerce')


# Column-wide minima and maxima used for min-max normalisation.
min_vals = {
    'text_similarity': merged_with_temp['match_score'].min(),
    'entity_similarity': merged_with_temp['combined_score'].min(),
    'temporal_exp_score': merged_with_temp['temporal_score_exp'].min()
}

max_vals = {
    'text_similarity': merged_with_temp['match_score'].max(),
    'entity_similarity': merged_with_temp['combined_score'].max(),
    'temporal_exp_score': merged_with_temp['temporal_score_exp'].max()
}

# Weighted, normalised score for every article-event pair. A pair with a NaN
# component gets a NaN final score, which classify_match labels "Invalid".
merged_with_temp['final_scores'] = merged_with_temp.progress_apply(
    lambda row: combined_score(row, weights, min_vals, max_vals), axis=1)

merged_with_temp['match_Labels'] = merged_with_temp['final_scores'].progress_apply(classify_match)

In [None]:
# For each article_id, find the max final_scores (broadcast back to every row)
max_scores = merged_with_temp.groupby('article_id')['final_scores'].transform('max')

# Filter rows where final_scores == max for that article_id
# Ties keep several rows per article; rows with a NaN final score never compare
# equal and are dropped.
filtered_df = merged_with_temp[merged_with_temp['final_scores'] == max_scores].copy()
filtered_df

In [None]:
filtered_df['match_Labels'].value_counts()

In [None]:
def deduplicate_keep_order(lst):
    """Return the items of ``lst`` without repeats, in first-occurrence order.

    Items must be hashable.
    """
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(lst))

# One row per article: each listed column becomes the order-preserving list of
# its distinct values among that article's best-scoring rows.
combined_df = (
    filtered_df
    .groupby('article_id')
    .agg({
        column: (lambda values: deduplicate_keep_order(list(values)))
        for column in [
            'event_id',
            'actor_score',
            'geo_score',
            'combined_score',
            'match_score',
            'temporal_score_exp',
            'final_scores',
            'match_Labels',
        ]
    })
    .reset_index()
)



In [None]:
combined_df.columns

In [None]:
combined_df['match_Labels'].value_counts()

In [None]:
combined_df.columns

In [None]:
# Merge and bring over match_labels and any other columns from combined_df
# NOTE(review): when article_df already carries columns with the same names as
# combined_df, pandas suffixes them _x / _y and a plain 'match_Labels' column
# will not exist — later cells reference both suffixed and unsuffixed names, so
# verify on a fresh kernel.
article_df_merged = article_df.merge(combined_df, on='article_id', how='left')
# Fill 'match_labels' with 'Invalid' where it's NaN (i.e., no match from combined_df)
article_df_merged['match_Labels'] = article_df_merged['match_Labels'].fillna('Invalid')
# The label arrives as a list such as ['Valid']; stringify it and strip the
# brackets and quotes to leave the bare label.
article_df_merged['match_Labels'] = article_df_merged['match_Labels'].astype(str).str.strip("[]").str.strip("'")

In [None]:
article_df_merged.columns

In [None]:
# Remove the duplicated columns left behind by merges of identically named columns.
# NOTE(review): these _x / _y names only exist if article_df already contained
# the score columns before the merge; drop() raises KeyError if any is missing —
# verify on a fresh kernel.
article_df_merged  = article_df_merged.drop(columns=['match_Labels_x',
       'actor_entities_y', 'geo_entities_y', 'event_id_x', 'actor_score_x',
       'geo_score_x', 'combined_score_x', 'match_score_x',
       'temporal_score_exp_x', 'final_scores_y', 'match_Labels_y'
])

article_df_merged.columns

In [None]:
article_df_merged['match_Labels'].value_counts()

In [None]:
article_df_merged.to_csv("article_event_matches_mergedIII.csv", index=False)

In [None]:
events_df.to_csv("events03_df.csv", index=False)

In [None]:
# article_df = pd.read_csv('article_event_matches_mergedII.csv')
article_df.columns

In [None]:
article_df = (
    article_df.rename(columns={c: c[:-2] for c in article_df.columns if c.endswith("_y")})
)

In [None]:
article_df_merged.columns

In [None]:
events_df = pd.read_csv('events03_df.csv')
events_df.columns

In [None]:
article_df_merged['event_id_y'][0]

In [None]:
# def get_event_id_from_max_index(row):
#     idx = int(row['max_score_index'])
#     events_ids = row['event_similarity_id']
#     if events_ids and isinstance(events_ids, list) and  0 <= idx < len(events_ids):
#         return events_ids[idx]
#     else:
#         return None

# article_df_merged['Top_event_id'] = article_df_merged.apply(get_event_id_from_max_index, axis=1)
# # Create the mapping dictionary once
event_id_to_title = dict(zip(events_df['event_id'], events_df['title']))

# Define a function that maps only if match_label is Valid
def map_titles_if_valid(row):
    """Return the titles of a row's matched events, for valid matches only.

    Returns:
        A list with the title of every id in ``row['event_id_y']`` (``None`` for
        ids missing from ``event_id_to_title``) when ``row['match_Labels']`` is
        'Valid' and the ids are held in a list; otherwise ``None``.
    """
    if row['match_Labels'] != 'Valid':
        return None
    event_ids = row['event_id_y']
    if not isinstance(event_ids, list):
        return None
    return list(map(event_id_to_title.get, event_ids))

article_df_merged['Top_event_titles'] = article_df_merged.apply(map_titles_if_valid, axis=1)



In [None]:
article_df_merged