In [13]:
import pandas as pd
import numpy as np
import os
import re
import json
import string
import nltk
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textacy import preprocessing as tprep
from tqdm import tqdm
tqdm.pandas()


pd.options.display.max_rows = 15


In [14]:
# Article Dataframe
article_df = pd.read_csv('/Users/amalkurian/Desktop/Dissertation/Bias Detection/Data/matching_articles04.csv')
print(f'List all the columns{list(article_df.columns)}')

List all the columns['doc_id', 'doc_id.1', 'doc_id.2', 'doc_id.3', 'doc_id.4', 'title', 'author', 'source', 'content', 'topic', 'url', 'keywords', 'query', 'cleaned_content', 'entities', 'article_id', 'entities_Group', 'labels_Group', 'Key_Phrases', 'Actions', 'Load_Date', 'Language', 'Person', 'matching_events', 'character_count', 'word_count', 'processed_text', 'processed_tokens_split', 'num_tokens', 'story_chain', 'match_score_faiss', 'temporal_scores', 'temporal_exp_score', 'temporal_log_score', 'match_score_Title', 'event_similarity_id_Title', 'match_label_x', 'match_label_y', 'match_label', 'actor_entities', 'geo_entities', 'entity_similarity', 'final_scores', 'max_score_index', 'match_Labels', 'Top_event_id', 'Top_event_title']


In [15]:
# Events Datafrane
events_df = pd.read_csv('/Users/amalkurian/Desktop/Dissertation/Bias Detection/Data/timeline_of_tirgay_conflict/Sheet1-Table 1.csv')
print(f'List all the columns{list(events_df.columns)}')

List all the columns['Date', 'Major events', 'Details', 'NYT N1 (Titles)', 'NYT N2', 'NYT N3', 'Label', 'Indicator']


In [16]:
events_df.columns

Index(['Date', 'Major events', 'Details', 'NYT N1 (Titles)', 'NYT N2',
       'NYT N3', 'Label', 'Indicator'],
      dtype='object')

In [17]:
# Event ID
import hashlib

def encoded_event_id(row):
    raw_string = f"{row['Date']}+{row.name}"
    encoded_id = hashlib.sha256(raw_string.encode('utf-8'))
    return (encoded_id).hexdigest()

events_df['event_id'] = events_df.progress_apply(encoded_event_id, axis=1) # applying row by row 

100%|██████████| 760/760 [00:00<00:00, 16757.46it/s]


In [18]:
events_df['Date'] = pd.to_datetime(events_df['Date'], format='%A, %B %d, %Y').dt.strftime('%Y-%m-%d')
events_df['Date'].head()

0    2020-11-01
1    2020-11-02
2    2020-11-03
3    2020-11-04
4    2020-11-05
Name: Date, dtype: object

In [19]:
events_df['Date'] = pd.to_datetime(events_df['Date'], format='%Y-%m-%d')

In [20]:
#!python -m spacy download en_core_web_trf


In [21]:
# !pip install spacy-transformers
# !pip install --force-reinstall certifi


In [48]:

import spacy

nlp = spacy.load('en_core_web_trf')
ruler = nlp.add_pipe("entity_ruler", before = "ner")
patterns = [{"label": "ORG", "pattern": "TPLF"}, {"label": "ORG", "pattern": "GOVT"}, {"label": "ORG", "pattern": "ENDF"}]
ruler.add_patterns(patterns)

def extract_entities(text):
    doc = nlp(text)
    entity_texts = [ent.text for ent in doc.ents]
    entity_labels = [ent.label_ for ent in doc.ents]
    return entity_texts, entity_labels


def safe_text(text):
    if isinstance(text, str):
        return extract_entities(text)
    return ([], [])


events_df[['events_entities', 'entities_labels']] = events_df['Major events'].progress_apply(safe_text).apply(pd.Series) # returns a tuple convert it into a SERIES for seperate columns

100%|██████████| 760/760 [00:05<00:00, 145.75it/s]


In [49]:
def extract_keywords(text):
    if not isinstance(text, str):
        return [], []
    
    doc = nlp(text)
    keyphrases = set()
    for chunk in doc.noun_chunks: # Nouns in the text
        if len(chunk.text.split())>1:
            keyphrases.add(chunk.text.lower())

    verbs = {token.lemma_ for token in doc if token.pos == 'VERB' and token.lemma_ not in stopwords.words('english')}
    return list(keyphrases), list(verbs)

events_df[['KeyPhrases', 'Actions']] = events_df['Major events'].progress_apply(extract_keywords).apply(pd.Series)

100%|██████████| 760/760 [00:06<00:00, 120.30it/s]


In [50]:
# Extract the matching articles for the said events 

from datetime import timedelta

#events_df['Date'] = pd.to_datetime(events_df['Date'])
article_df['Load_Date'] = pd.to_datetime(article_df['Load_Date'])

def filter_articles_for_events(events_date):
    lower_date = events_date - timedelta(days=27)
    upper_date = events_date + timedelta(days=7)
    
    filtered_articles = article_df[article_df['Load_Date'].between(lower_date, upper_date)]
    return filtered_articles['article_id'].tolist()

mask = events_df['Major events'].notna() # non empty rows only

events_df.loc[mask, 'matching_articles'] = events_df.loc[mask, 'Date'].progress_apply(filter_articles_for_events) # mask is the Major Events loc accesses the respective Dates

100%|██████████| 31/31 [00:01<00:00, 30.62it/s]


In [51]:
from itertools import chain

raw_entities_in_events = list(chain.from_iterable(events_df['events_entities']))
raw_entities_in_events 

['TPLF',
 'Northern Command',
 'Govt',
 'Abiy',
 'Tigray',
 'Ethiopian',
 'Tigray',
 'ENDF',
 'Ethiopian National Defence Forces',
 'Mekelle',
 'Tigray',
 'Federal',
 'TPLF',
 'ENDF',
 'Tigray',
 'Western',
 'Wolkayit',
 'Tigray',
 'TPLF',
 'Mekelle',
 'TPLF',
 'Tigray',
 'Alamata',
 'Korem',
 'Kobo',
 'Amhara',
 'TPLF',
 'TPLF',
 'Dessie',
 'Amhara Region',
 'TPLF',
 'Kombolcha',
 'TPLF',
 'Oromo Liberation Army',
 'Shewa',
 'around 190 km',
 'Addis Ababa',
 'Federal Government',
 '6-month',
 'Abiy',
 'ENDF',
 'Chifra',
 'Afar Region',
 'ENDF',
 'Shewa Robit',
 '220 k',
 'Addis Ababa',
 'ENDF',
 'Dessie',
 'Kombolcha',
 'Legesse Tulu',
 'Government Communication Service',
 'TPLF',
 'Weldiya',
 'Kobo',
 'Lalibela',
 'Hara',
 'Robit',
 'Sanqa',
 'Sirinka',
 'Hamusit',
 'Estaysh',
 'Ahun Tegen',
 'Dilb',
 'Kul Mesk',
 'Ethiopian',
 'Tigray',
 'Ethiopian',
 'TPLF',
 'Federal Government',
 'TPLF',
 'TPLF',
 'Kobo',
 'Ethiopia',
 'Eritrea',
 'Sheraro',
 'Ethiopian-Eritrean',
 'TPLF',
 'Nort

In [52]:
# a Lookup dictionary for Normalization of events entities

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from collections import Counter

def mapping_entity_events(raw_entities_in_events):
    entities_in_events = [e.lower() for e in raw_entities_in_events]
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # Encode the events entities

    events_embeddings = model.encode(entities_in_events, show_progress_bar = True)
    events_similarity_matrix = cosine_similarity(events_embeddings)

    clustering = AgglomerativeClustering(
    n_clusters = None,
    metric = 'precomputed',
    linkage = 'complete',
    distance_threshold = 0.3)

    events_distance_matrix = 1 - events_similarity_matrix
    events_distance_matrix = events_distance_matrix.astype(np.float64)
    labels = clustering.fit_predict(events_distance_matrix)

    events_cluster_df = pd.DataFrame({'entity': entities_in_events, 'cluster': labels})
    canonical_lookup_dict = {}
    for cluster_id, group in events_cluster_df.groupby('cluster'):
        canonical = Counter(group['entity']).most_common(1)[0][0]
        for ent in group['entity']:
            canonical_lookup_dict[ent] = canonical

    return canonical_lookup_dict

mapping_entities_to_canonical = mapping_entity_events(raw_entities_in_events=raw_entities_in_events)


Batches: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


In [53]:
mapping_entities_to_canonical['sirinka']='sri lanka'

In [54]:
mapping_entities_to_canonical

{'kobo': 'kobo',
 'kombolcha': 'kobo',
 'pretoria': 'pretoria',
 'south africa': 'pretoria',
 'shewa': 'shewa',
 'shewa robit': 'shewa',
 'ethiopian': 'ethiopian',
 'ethiopia': 'ethiopian',
 'amhara': 'amhara',
 'amhara region': 'amhara',
 'eritrea': 'eritrea',
 'ethiopian-eritrean': 'eritrea',
 'dessie': 'dessie',
 'mekelle': 'mekelle',
 'federal': 'federal government',
 'federal government': 'federal government',
 'african union': 'african union',
 'abiy': 'abiy',
 'tigray': 'tigray',
 'almata': 'almata',
 'sanqa': 'sanqa',
 'ahun tegen': 'ahun tegen',
 'au': 'au',
 'sirinka': 'sri lanka',
 'endf': 'endf',
 'tplf': 'tplf',
 'adwa': 'adwa',
 'hamusit': 'hamusit',
 'korem': 'korem',
 'shire': 'shire',
 'kul mesk': 'kul mesk',
 'oromo liberation army': 'oromo liberation army',
 'chifra': 'chifra',
 'western': 'western',
 'afar region': 'afar region',
 'lalibela': 'lalibela',
 'estaysh': 'estaysh',
 'robit': 'robit',
 'axum': 'axum',
 'legesse tulu': 'legesse tulu',
 '220 k': '220 k',
 '

In [55]:
from rapidfuzz import process

def normalize_entity(ent, entity_to_mapping, threshold=90):
    if not isinstance(ent, str):
        return ent
    match, score, _ = process.extractOne(
        ent.lower(), entity_to_mapping.keys() # validate the similarity between the ent and the dict keys() 
    )
    return entity_to_mapping[match] if match and score >= threshold else ent # checks whether the similarity score is >= 90 for assigning the resp canonical value


def normalize_entities_list(entities_list, entity_mapping):
    normalized = []
    for ent in entities_list:
        normalized_ent = normalize_entity(ent, entity_mapping)
        normalized.append(normalized_ent)
    return normalized


events_df['normalized_entities'] = events_df['events_entities'].apply(lambda ents: normalize_entities_list(ents, mapping_entities_to_canonical))


In [56]:
events_df['Combined_entities'] = events_df.apply(lambda row: (row['normalized_entities'] or []) + (row['KeyPhrases'] or []), axis=1)

In [57]:
def split_entities(entities, labels):
    actors = []
    geos = []
    for ent, lab in zip(entities, labels):
        if lab not in {'GPE', 'LOC'}:
            actors.append(ent)
        elif lab in {'GPE', 'LOC'}:
            geos.append(ent)

    return actors, geos

# pd.Series converts the list that is returned from the method to a Series format
events_df[['actor_entities', 'geo_entities']] = events_df.apply(lambda row: pd.Series(split_entities(row['normalized_entities'], row['entities_labels'])), axis=1)

In [60]:
import ast
def safe_literal_eval(x):
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    try:
        return ast.literal_eval(x)
    except Exception:
        return []

article_df['actor_entities'] = article_df['actor_entities'].apply(safe_literal_eval)


In [62]:

article_df['geo_entities'] = article_df['geo_entities'].apply(safe_literal_eval)

In [63]:
# convert to list
def force_list(obj):
    if isinstance(obj, set):
        return list(obj)
    elif isinstance(obj, list):
        return obj
    else:
        return []

article_df['actor_entities'] = article_df['actor_entities'].apply(force_list)

In [66]:

article_df['Key_Phrases'] = article_df['Key_Phrases'].apply(safe_literal_eval)


def clean_key_phrases(lst):
    if not isinstance(lst, list):
        return lst
    return [phrase.replace('\n', ' ').strip() for phrase in lst]

article_df['Key_Phrases'] = article_df['Key_Phrases'].apply(clean_key_phrases)


In [67]:
def combined_entities_list(row):
    combined = []
    for col in ['actor_entities', 'geo_entities']:
        val = row[col]
        if isinstance(val, list):
            combined.extend(val)

    return combined

article_df['Combined_entities'] = article_df.progress_apply(combined_entities_list, axis=1)

100%|██████████| 3322/3322 [00:00<00:00, 20762.89it/s]


In [68]:
events_df['Combined_entities'] = events_df.progress_apply(combined_entities_list, axis=1)

  0%|          | 0/760 [00:00<?, ?it/s]

100%|██████████| 760/760 [00:00<00:00, 17891.98it/s]


In [69]:
events_df.columns

Index(['Date', 'Major events', 'Details', 'NYT N1 (Titles)', 'NYT N2',
       'NYT N3', 'Label', 'Indicator', 'event_id', 'events_entities',
       'entities_labels', 'KeyPhrases', 'Actions', 'matching_articles',
       'normalized_entities', 'Combined_entities', 'actor_entities',
       'geo_entities'],
      dtype='object')

# Entity Similarity Score

In [70]:
def entity_overlap_score(event_text, candidate_text):
    event_text = set(event_text)
    candidate_text = set(candidate_text)
    if not event_text or not candidate_text:
        return 0
    return len(event_text & candidate_text) / len(event_text | candidate_text)

In [71]:
text_a = ' '.join(article_df['Combined_entities'][2408]) if isinstance(article_df['Combined_entities'][2408], list) else str(article_df['Combined_entities'][2408])
text_a

'tplf tigray the government of ethiopia apo group government international relations sudan ai east load-date'

In [72]:

matches = []

for idx_a, row_a in events_df.iterrows():
    text_a = ' '.join(row_a['Combined_entities']) if isinstance(row_a['Combined_entities'], list) else str(row_a['Combined_entities'])
    candidate_article_ids = row_a.get('matching_articles', [])
    
    if not isinstance(candidate_article_ids, list) or not candidate_article_ids:
        continue  
    
    # Filter article_df to only candidates in this list
    candidates = article_df[article_df['article_id'].isin(candidate_article_ids)].copy() # dataframe in one row of events_df that was passed
    
    if candidates.empty:
        continue  # Defensive check
    
    # Prepare choices for rapidfuzz (list of texts)
    # candidates['combined_entities_keyphrases'] is the whole dataframe that was present in the matching_articles
    candidates['joined_entities'] = candidates['Combined_entities'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x)).tolist() # to list is to connect a bunch of strings into a list
    
    candidates['similarity_score'] = candidates['joined_entities'].apply(
        lambda x: entity_overlap_score(text_a, x)
    )



    # best_match = process.extractOne(
    #     query=text_a, # query is 1 string
    #     choices=choices, # choices are many strings
    #     scorer=fuzz.token_set_ratio
    # )
    
    # Collect all event-article matches (not just best)
    for art_idx, art_row in candidates.iterrows():
        matches.append({
            'event_index': row_a['event_id'],
            'event_title': row_a.get('Major events', ''),
            'article_index': art_idx,
            'article_id': art_row['article_id'],
            'article_title': art_row.get('title', ''),
            'similarity_score': art_row['similarity_score']
        })

match_df = pd.DataFrame(matches)

In [73]:
match_df = pd.DataFrame(matches)

In [74]:
match_df

Unnamed: 0,event_index,event_title,article_index,article_id,article_title,similarity_score
0,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),4,4,ethiopia pm orders riposte after 'attack' on a...,0.615385
1,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),6,6,african bloc wants ceasefire as ethiopia 's ti...,0.551724
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),7,7,"ethiopia 's tigray conflict worsens, refugees ...",0.551724
3,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),121,121,ethiopia pm downplays war fears while hundreds...,0.571429
4,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),147,147,ethiopian army vows to conclude tigray offensi...,0.615385
...,...,...,...,...,...,...
5598,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3189,3189,"treasonous, barbaric attacks on endf exposed a...",0.500000
5599,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3217,3217,ethiopia 's tplf rebels agree to disarm,0.407407
5600,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3221,3221,ethiopia : tplf fighters advised to move to li...,0.478261
5601,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3271,3271,confusion prevails over withdrawal of eritrean...,0.407407


# Semantic Similarity Calculation

In [75]:
match_df['event_index'][0]

'22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b6492b620c9387ca9443c'

In [76]:
filtered = match_df[
    (match_df['article_id'] == 4) & 
    (match_df['event_index'] == '22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b6492b620c9387ca9443c')
]

total_count = len(filtered)
duplicate_count = filtered.duplicated().sum()

print(f"Total occurrences of article_id == 4: {total_count}")
print(f"Number of duplicate rows for article_id == 4: {duplicate_count}")


Total occurrences of article_id == 4: 1
Number of duplicate rows for article_id == 4: 0


# Similarity Score Title

In [77]:
from nltk.tokenize import sent_tokenize
import torch
from sentence_transformers import util


model = SentenceTransformer('BAAI/bge-base-en-v1.5')

# convert the article_title to a vector for further processing by creating a lookup dictionary
article_id_to_vector_title = {
    row['article_id']: model.encode(row['title'], convert_to_tensor=True, batch_size=64, show_progress_bar=True)
    for _, row in article_df.iterrows()}

print('Article_Title Vectorization Completed...\n')

def embed_events_texts(text):
    if not isinstance(text, str):
        return torch.zeros(model.get_sentence_embedding_dimension())
    sentences = sent_tokenize(text)
    if not sentences:
        return torch.zeros(model.get_sentence_embedding_dimension())
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True, batch_size=32, show_progress_bar=False)
    return torch.mean(sentence_embeddings, dim=0) # gets you a single embedding for the whole paragraph by finding the mean

article_match_scores = []
article_match_ids = []
scores_list = []

for idx, row in events_df.iterrows():
    events_embeddings = embed_events_texts(row['Major events'])


    matching_article_ids = row.get('matching_articles', [])

    if not isinstance(matching_article_ids, list) or not matching_article_ids:
        article_match_scores.append(None)
        article_match_ids.append(None)
        continue

    article_matched_vectors_1D = [article_id_to_vector_title[aid] for aid in matching_article_ids if aid in article_id_to_vector_title]

    if not article_matched_vectors_1D:
        article_match_scores.append(None)
        article_match_ids.append(None)
        continue

    article_matched_vector_mtx = torch.stack(article_matched_vectors_1D)

    sim_scores = util.cos_sim(events_embeddings, article_matched_vector_mtx).squeeze(0)

    scores_list = sim_scores.tolist()

    article_match_scores.append(scores_list)
    article_match_ids.append(matching_article_ids)


events_df['similarity_scores_Title'] = article_match_scores
events_df['article_sim_id_Title'] = article_match_ids


    


Batches: 100%|██████████| 1/1 [00:03<00:00,  3.39s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.75it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.08it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.64it/s]
Batches: 1

Article_Title Vectorization Completed...



In [78]:
exploded_article = events_df.explode(['similarity_scores_Title', 'article_sim_id_Title'])

In [80]:
exploded_article

Unnamed: 0,Date,Major events,Details,NYT N1 (Titles),NYT N2,NYT N3,Label,Indicator,event_id,events_entities,entities_labels,KeyPhrases,Actions,matching_articles,normalized_entities,Combined_entities,actor_entities,geo_entities,similarity_scores_Title,article_sim_id_Title
0,2020-11-01,,,,,,,,7ab68b158e4a104599df68a6818f7e478e2bb1e7c02934...,[],[],[],[],,[],[],[],[],,
1,2020-11-02,,,,,,,,62483e46a81b946a2527bec808a423a151e39976337399...,[],[],[],[],,[],[],[],[],,
2,2020-11-03,TPLF (rebel) attacked Northern Command (Govt.),,Dozens Killed in Ethiopia in Schoolyard Massacre,,,,,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,"[TPLF, Northern Command, Govt]","[ORG, ORG, ORG]",[northern command],[],"[4, 6, 7, 121, 147, 293, 513, 575, 589, 650, 6...","[tplf, northern command, govt]","[tplf, northern command, govt]","[tplf, northern command, govt]",[],0.558744,4
2,2020-11-03,TPLF (rebel) attacked Northern Command (Govt.),,Dozens Killed in Ethiopia in Schoolyard Massacre,,,,,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,"[TPLF, Northern Command, Govt]","[ORG, ORG, ORG]",[northern command],[],"[4, 6, 7, 121, 147, 293, 513, 575, 589, 650, 6...","[tplf, northern command, govt]","[tplf, northern command, govt]","[tplf, northern command, govt]",[],0.495786,6
2,2020-11-03,TPLF (rebel) attacked Northern Command (Govt.),,Dozens Killed in Ethiopia in Schoolyard Massacre,,,,,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,"[TPLF, Northern Command, Govt]","[ORG, ORG, ORG]",[northern command],[],"[4, 6, 7, 121, 147, 293, 513, 575, 589, 650, 6...","[tplf, northern command, govt]","[tplf, northern command, govt]","[tplf, northern command, govt]",[],0.445129,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,2022-11-26,,,,,,,,c4c5f410f55a64c4aa2d67c6130a08f2c6da33b09e8721...,[],[],[],[],,[],[],[],[],,
756,2022-11-27,,,,,,,,d7c37e51f1d250f78100377850c61c5bdf6aed3eaba864...,[],[],[],[],,[],[],[],[],,
757,2022-11-28,,,,,,,,406562c863142969a17377973cef5bf37f1037a3c4c7a3...,[],[],[],[],,[],[],[],[],,
758,2022-11-29,,,,,,,,22663429b5ef1d463bb8e388e2b3373232b7b9ff9fbdbf...,[],[],[],[],,[],[],[],[],,


In [81]:
match_df

Unnamed: 0,event_index,event_title,article_index,article_id,article_title,similarity_score
0,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),4,4,ethiopia pm orders riposte after 'attack' on a...,0.615385
1,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),6,6,african bloc wants ceasefire as ethiopia 's ti...,0.551724
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),7,7,"ethiopia 's tigray conflict worsens, refugees ...",0.551724
3,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),121,121,ethiopia pm downplays war fears while hundreds...,0.571429
4,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),147,147,ethiopian army vows to conclude tigray offensi...,0.615385
...,...,...,...,...,...,...
5598,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3189,3189,"treasonous, barbaric attacks on endf exposed a...",0.500000
5599,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3217,3217,ethiopia 's tplf rebels agree to disarm,0.407407
5600,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3221,3221,ethiopia : tplf fighters advised to move to li...,0.478261
5601,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3271,3271,confusion prevails over withdrawal of eritrean...,0.407407


In [82]:
exploded_article = events_df[['event_id', 'article_sim_id_Title', 'similarity_scores_Title' ]]

In [83]:
exploded_article['article_sim_id_Title'] = exploded_article['article_sim_id_Title'].apply(lambda x: x if isinstance(x, list) else [])
exploded_article['similarity_scores_Title'] = exploded_article['similarity_scores_Title'].apply(lambda x: x if isinstance(x, list) else [])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exploded_article['article_sim_id_Title'] = exploded_article['article_sim_id_Title'].apply(lambda x: x if isinstance(x, list) else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exploded_article['similarity_scores_Title'] = exploded_article['similarity_scores_Title'].apply(lambda x: x if isinstance(x, list) else [])


In [84]:
match_df

Unnamed: 0,event_index,event_title,article_index,article_id,article_title,similarity_score
0,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),4,4,ethiopia pm orders riposte after 'attack' on a...,0.615385
1,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),6,6,african bloc wants ceasefire as ethiopia 's ti...,0.551724
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),7,7,"ethiopia 's tigray conflict worsens, refugees ...",0.551724
3,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),121,121,ethiopia pm downplays war fears while hundreds...,0.571429
4,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),147,147,ethiopian army vows to conclude tigray offensi...,0.615385
...,...,...,...,...,...,...
5598,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3189,3189,"treasonous, barbaric attacks on endf exposed a...",0.500000
5599,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3217,3217,ethiopia 's tplf rebels agree to disarm,0.407407
5600,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3221,3221,ethiopia : tplf fighters advised to move to li...,0.478261
5601,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3271,3271,confusion prevails over withdrawal of eritrean...,0.407407


In [85]:
exploded_article = exploded_article.explode(['article_sim_id_Title', 'similarity_scores_Title'])


In [86]:
exploded_article=exploded_article[['event_id', 'article_sim_id_Title', 'similarity_scores_Title']]


In [87]:
exploded_article = exploded_article.rename(columns={'article_sim_id_Title':'article_id', 'similarity_scores_Title': 'match_score'})

In [88]:
exploded_article.columns

Index(['event_id', 'article_id', 'match_score'], dtype='object')

In [89]:
import numpy as np

exploded_article['article_id'] = exploded_article['article_id'].replace('None', np.nan)


In [90]:
exploded_article['article_id'] = pd.to_numeric(exploded_article['article_id'], errors='coerce')


In [91]:
exploded_article[exploded_article['article_id'].notna()]

Unnamed: 0,event_id,article_id,match_score
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,4.0,0.558744
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,6.0,0.495786
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,7.0,0.445129
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,121.0,0.501652
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,147.0,0.449562
...,...,...,...
741,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,3189.0,0.550233
741,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,3217.0,0.681637
741,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,3221.0,0.627186
741,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,3271.0,0.528243


In [92]:
exploded_article.columns

Index(['event_id', 'article_id', 'match_score'], dtype='object')

In [93]:
match_df = match_df.rename(columns={'event_index':'event_id'})

In [94]:
merged = pd.merge(
    match_df,
    exploded_article[['event_id', 'article_id', 'match_score']],
    on=['event_id', 'article_id'],
    how='left'
)

# Step 3: Combine similarity scores into lists by grouping on event_id and article_id
combined = merged.groupby(['event_id', 'article_id', 'article_index', 'event_title', 'article_title'], dropna=False).agg({
    'similarity_score': list,
    'match_score': lambda x: list(x.dropna())  # avoid NaNs in the list
}).reset_index()

# If you want, you can drop the separate lists now:
# combined = combined.drop(columns=['similarity_score', 'match_score'])


In [95]:
combined

Unnamed: 0,event_id,article_id,article_index,event_title,article_title,similarity_score,match_score
0,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,4,4,Abiy declared a military offensive with state ...,ethiopia pm orders riposte after 'attack' on a...,[0.3076923076923077],[0.6526452898979187]
1,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,6,6,Abiy declared a military offensive with state ...,african bloc wants ceasefire as ethiopia 's ti...,[0.27586206896551724],[0.6211974620819092]
2,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,7,7,Abiy declared a military offensive with state ...,"ethiopia 's tigray conflict worsens, refugees ...",[0.27586206896551724],[0.606262743473053]
3,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,9,9,Abiy declared a military offensive with state ...,thousands are fleeing into sudan to escape dea...,[0.2857142857142857],[0.5164929628372192]
4,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,121,121,Abiy declared a military offensive with state ...,ethiopia pm downplays war fears while hundreds...,[0.2857142857142857],[0.5230430364608765]
...,...,...,...,...,...,...,...
5598,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3199,3199,ENDF captured Almata and Korem,ethiopia 's tplf rebels call for 'full partici...,[0.44],[0.43784910440444946]
5599,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3265,3265,ENDF captured Almata and Korem,tplf calls civilians to arms to defend themsel...,[0.4230769230769231],[0.4977000057697296]
5600,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3267,3267,ENDF captured Almata and Korem,tplf denounces death of 65 people in drone att...,[0.4074074074074074],[0.45708972215652466]
5601,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3309,3309,ENDF captured Almata and Korem,"eu, us officials call for banking, telecoms re...",[0.4444444444444444],[0.47569239139556885]


In [96]:
events_df.columns

Index(['Date', 'Major events', 'Details', 'NYT N1 (Titles)', 'NYT N2',
       'NYT N3', 'Label', 'Indicator', 'event_id', 'events_entities',
       'entities_labels', 'KeyPhrases', 'Actions', 'matching_articles',
       'normalized_entities', 'Combined_entities', 'actor_entities',
       'geo_entities', 'similarity_scores_Title', 'article_sim_id_Title'],
      dtype='object')

In [97]:
match_df

Unnamed: 0,event_id,event_title,article_index,article_id,article_title,similarity_score
0,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),4,4,ethiopia pm orders riposte after 'attack' on a...,0.615385
1,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),6,6,african bloc wants ceasefire as ethiopia 's ti...,0.551724
2,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),7,7,"ethiopia 's tigray conflict worsens, refugees ...",0.551724
3,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),121,121,ethiopia pm downplays war fears while hundreds...,0.571429
4,22b49f7df1d984b298b3cefd48edcd10bdfcce69ae4b64...,TPLF (rebel) attacked Northern Command (Govt.),147,147,ethiopian army vows to conclude tigray offensi...,0.615385
...,...,...,...,...,...,...
5598,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3189,3189,"treasonous, barbaric attacks on endf exposed a...",0.500000
5599,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3217,3217,ethiopia 's tplf rebels agree to disarm,0.407407
5600,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3221,3221,ethiopia : tplf fighters advised to move to li...,0.478261
5601,50d180485eb6d2eb0bbd059fe2ee2a01175ccc5c4e410c...,Signing ceremony of the declaration of the sen...,3271,3271,confusion prevails over withdrawal of eritrean...,0.407407


### TEMPORAL Promiximity Scores

In [98]:
article_df['Load_Date'] = pd.to_datetime(article_df['Load_Date'])

# From a research paper 
alpha_h = 1.0  # for historical
alpha_p = 0.8  # for predictive
lambda_ = 0.8
max_past_days = 30
max_future_days = 10


article_id_to_date_lookup = dict(zip(article_df['article_id'], article_df['Load_Date']))


def compute_exp_temporal_scores(events_date, pub_date):
    delta_days = (pub_date - events_date).days # Positive if Article after Event (Historic)
    if delta_days < 0:
        decay = abs(delta_days)/max_past_days
        score = alpha_h * np.exp(-lambda_ * decay)
    else:
        decay = delta_days / max_future_days
        score = alpha_p * np.exp(-lambda_ * decay)

    return round(score, 4)


def compute_temporal_scores(row):
    event_date = row['Date']
    exp_scores = {}
    matching_article_ids = row.get('matching_articles', [])
    if not isinstance(matching_article_ids, list):
        return []
        

    for aid in matching_article_ids:
        article_date = article_id_to_date_lookup.get(aid)

        if pd.notna(article_date): 
            score = compute_exp_temporal_scores(event_date, article_date)
            exp_scores[aid] = score
    return exp_scores

events_df['temporal_scores'] = events_df.progress_apply(compute_temporal_scores, axis=1)





  0%|          | 0/760 [00:00<?, ?it/s]

100%|██████████| 760/760 [00:00<00:00, 2364.98it/s]


In [99]:
events_df['temporal_scores']

0                                                     []
1                                                     []
2      {4: 0.7385, 6: 0.457, 7: 0.457, 121: 0.495, 14...
3      {4: 0.8, 6: 0.495, 7: 0.495, 9: 0.457, 121: 0....
4                                                     []
                             ...                        
755                                                   []
756                                                   []
757                                                   []
758                                                   []
759                                                   []
Name: temporal_scores, Length: 760, dtype: object

In [100]:
events_df['temporal_scores'].apply(type).value_counts()


temporal_scores
<class 'list'>    729
<class 'dict'>     31
Name: count, dtype: int64

In [101]:
exploded_scores = pd.DataFrame(
    [
        (eid, aid, score)
        for eid, scores in events_df[['event_id', 'temporal_scores']].dropna().values
        if isinstance(scores, dict)  # only process dicts here
        for aid, score in scores.items()
    ],
    columns=['event_id', 'article_id', 'temporal_score']
)


In [102]:
# Merge combined similarity scores with temporal scores on event_id and article_id
final_df = pd.merge(
    combined,
    exploded_scores,
    on=['event_id', 'article_id'],
    how='left'  # keep all from combined, add temporal_score where available
)


In [103]:
final_df['temporal_score'] = final_df['temporal_score'].apply(lambda x: [x] if pd.notnull(x) else [])


In [104]:
final_df

Unnamed: 0,event_id,article_id,article_index,event_title,article_title,similarity_score,match_score,temporal_score
0,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,4,4,Abiy declared a military offensive with state ...,ethiopia pm orders riposte after 'attack' on a...,[0.3076923076923077],[0.6526452898979187],[0.8]
1,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,6,6,Abiy declared a military offensive with state ...,african bloc wants ceasefire as ethiopia 's ti...,[0.27586206896551724],[0.6211974620819092],[0.495]
2,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,7,7,Abiy declared a military offensive with state ...,"ethiopia 's tigray conflict worsens, refugees ...",[0.27586206896551724],[0.606262743473053],[0.495]
3,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,9,9,Abiy declared a military offensive with state ...,thousands are fleeing into sudan to escape dea...,[0.2857142857142857],[0.5164929628372192],[0.457]
4,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,121,121,Abiy declared a military offensive with state ...,ethiopia pm downplays war fears while hundreds...,[0.2857142857142857],[0.5230430364608765],[0.5363]
...,...,...,...,...,...,...,...,...
5598,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3199,3199,ENDF captured Almata and Korem,ethiopia 's tplf rebels call for 'full partici...,[0.44],[0.43784910440444946],[0.4868]
5599,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3265,3265,ENDF captured Almata and Korem,tplf calls civilians to arms to defend themsel...,[0.4230769230769231],[0.4977000057697296],[0.4999]
5600,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3267,3267,ENDF captured Almata and Korem,tplf denounces death of 65 people in drone att...,[0.4074074074074074],[0.45708972215652466],[0.7261]
5601,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3309,3309,ENDF captured Almata and Korem,"eu, us officials call for banking, telecoms re...",[0.4444444444444444],[0.47569239139556885],[0.4868]


In [105]:
# OPTIONAL

# from collections import defaultdict

# event_sim_scores = defaultdict(list)

# for match in matches:
#     event_sim_scores[match['event_index']].append(match['similarity_score'])

# events_df['entity_similarity_scores'] = events_df.index.map(lambda idx: event_sim_scores.get(idx, [])) # Retrieves only index of each row in events_df

In [106]:
final_df

Unnamed: 0,event_id,article_id,article_index,event_title,article_title,similarity_score,match_score,temporal_score
0,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,4,4,Abiy declared a military offensive with state ...,ethiopia pm orders riposte after 'attack' on a...,[0.3076923076923077],[0.6526452898979187],[0.8]
1,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,6,6,Abiy declared a military offensive with state ...,african bloc wants ceasefire as ethiopia 's ti...,[0.27586206896551724],[0.6211974620819092],[0.495]
2,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,7,7,Abiy declared a military offensive with state ...,"ethiopia 's tigray conflict worsens, refugees ...",[0.27586206896551724],[0.606262743473053],[0.495]
3,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,9,9,Abiy declared a military offensive with state ...,thousands are fleeing into sudan to escape dea...,[0.2857142857142857],[0.5164929628372192],[0.457]
4,01e9c5a67f4b1fe82a00c388518a50c1a7988d61a2032d...,121,121,Abiy declared a military offensive with state ...,ethiopia pm downplays war fears while hundreds...,[0.2857142857142857],[0.5230430364608765],[0.5363]
...,...,...,...,...,...,...,...,...
5598,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3199,3199,ENDF captured Almata and Korem,ethiopia 's tplf rebels call for 'full partici...,[0.44],[0.43784910440444946],[0.4868]
5599,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3265,3265,ENDF captured Almata and Korem,tplf calls civilians to arms to defend themsel...,[0.4230769230769231],[0.4977000057697296],[0.4999]
5600,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3267,3267,ENDF captured Almata and Korem,tplf denounces death of 65 people in drone att...,[0.4074074074074074],[0.45708972215652466],[0.7261]
5601,fdc393e569fcbbd52fdb89aba5c2cc3776123435fa4545...,3309,3309,ENDF captured Almata and Korem,"eu, us officials call for banking, telecoms re...",[0.4444444444444444],[0.47569239139556885],[0.4868]


### Combined Score of Matched events



In [107]:
def convert_list_elements_to_float(lst):
    if isinstance(lst, list):
        return float(lst[0])
    else:
        return lst  # if it's not a list, just return as is

# Apply to columns
final_df['similarity_score'] = final_df['similarity_score'].apply(convert_list_elements_to_float)
final_df['match_score'] = final_df['match_score'].apply(convert_list_elements_to_float)
final_df['temporal_score'] = final_df['temporal_score'].apply(convert_list_elements_to_float)


In [108]:
import itertools

def normalize_score(score, min_value, max_value):
    return (score - min_value)/(max_value - min_value) if max_value > min_value else 0


def combined_score(row, weights, min_vals, max_vals):
    entity_ = normalize_score(row.get('similarity_score', 0), min_vals['entity_similarity'], max_vals['entity_similarity'])
    temporal_ = normalize_score(row.get('temporal_score', 0), min_vals['temporal_exp_score'], max_vals['temporal_exp_score'])
    semantic_ = normalize_score(row.get('match_score', 0), min_vals['text_similarity'], max_vals['text_similarity'])
        

    combined = (
        weights['semantic'] * semantic_ + 
        weights['entity'] * entity_ +
        weights['temporal'] * temporal_
    )

    return combined


def classify_match(score, threshold=0.4):
    return "Valid" if score >= threshold else "Invalid"


weights = {
    'semantic': 0.4,
    'entity': 0.4,
    'temporal': 0.2
}


min_vals = {
    'text_similarity': final_df['match_score'].min(),
    'entity_similarity': final_df['similarity_score'].min(),
    'temporal_exp_score': final_df['temporal_score'].min()
}

max_vals = {
    'text_similarity': final_df['match_score'].max(),
    'entity_similarity': final_df['similarity_score'].max(),
    'temporal_exp_score': final_df['temporal_score'].max()
}

final_df['final_scores'] = final_df.progress_apply(
    lambda row: combined_score(row, weights, min_vals, max_vals), axis=1
)
final_df['match_Labels'] = final_df['final_scores'].progress_apply(classify_match)

100%|██████████| 5603/5603 [00:00<00:00, 22222.74it/s]
100%|██████████| 5603/5603 [00:00<00:00, 235598.20it/s]


In [109]:
final_df['match_Labels'].value_counts()


match_Labels
Valid      3249
Invalid    2354
Name: count, dtype: int64

In [110]:
def custom_aggregator(group):
    if 'Valid' in group['match_Labels'].values:
        filtered = group[group['match_Labels'] == 'Valid'].drop_duplicates(subset='article_id')
        label = 'Valid'
    else:
        filtered = group.drop_duplicates(subset='article_id')
        label = 'Invalid'
    
    return pd.Series({
        'article_id': list(filtered['article_id']),
        'article_title': list(filtered['article_title']),
        'match_score': list(filtered['match_score']),
        'similarity_score': list(filtered['similarity_score']),
        'temporal_score': list(filtered['temporal_score']),
        'final_scores': list(filtered['final_scores']),
        'match_Labels': label,
    })

grouped_df = final_df.groupby('event_id').apply(custom_aggregator).reset_index()


  grouped_df = final_df.groupby('event_id').apply(custom_aggregator).reset_index()


In [111]:
grouped_df['match_Labels'].value_counts()

match_Labels
Valid    31
Name: count, dtype: int64

In [112]:
merged_events_df = events_df.merge(grouped_df, on='event_id', how='left')


In [113]:
merged_events_df.columns

Index(['Date', 'Major events', 'Details', 'NYT N1 (Titles)', 'NYT N2',
       'NYT N3', 'Label', 'Indicator', 'event_id', 'events_entities',
       'entities_labels', 'KeyPhrases', 'Actions', 'matching_articles',
       'normalized_entities', 'Combined_entities', 'actor_entities',
       'geo_entities', 'similarity_scores_Title', 'article_sim_id_Title',
       'temporal_scores', 'article_id', 'article_title', 'match_score',
       'similarity_score', 'temporal_score', 'final_scores', 'match_Labels'],
      dtype='object')

In [115]:
merged_events_df['match_Labels'] = merged_events_df['match_Labels'].fillna('Invalid')


In [116]:
merged_events_df.to_csv('matched_events_with_groupedscore.csv', index=False)
