In [1]:
# General use
import re
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import uuid
from src.lib.utils.path_finder import PROJECT_DIRECTORY
from src.lib.utils.config import config

# Notebook behavior
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
seed = config["SEED"]  # Replicability: passed as random_state to every sampling step in this notebook

In [None]:
# Load the raw export; identifier and CAMEO code columns are read as strings
raw_csv_path = PROJECT_DIRECTORY.joinpath("data/raw/bq-results-20240316-113951-1710589244204.csv")
string_columns = ["GLOBALEVENTID", "MentionIdentifier", "QuadClass", "CAMEOEventRoot", "CAMEOEventBase", "CAMEOEvent"]
df = pd.read_csv(raw_csv_path, dtype=dict.fromkeys(string_columns, str))

# Tag every row with a freshly generated UUID and use it as the index
row_ids = pd.Index([uuid.uuid4() for _ in range(len(df))], name="uuid")
df = df.set_index(row_ids)
df.shape
df.head()

In [None]:
# Read the cleaned documents table
docs_path = PROJECT_DIRECTORY.joinpath("data/cleaned/cleaned_docs.parquet")
docs = pd.read_parquet(docs_path)
docs.shape
docs.head()

In [None]:
# Check for duplicate events
# Initial assumption: GLOBALEVENTID should be unique
# Any count above 1 in the output means the same event ID appears on several rows
df["GLOBALEVENTID"].value_counts()

In [None]:
# Check where duplicates are coming from
# Findings: these are events that get multiple first-reports, hence multiple documents are associated with each of them
event_id_counts = df["GLOBALEVENTID"].value_counts()
ids_ = event_id_counts[event_id_counts>1].index.tolist()  # event IDs that occur on more than one row

# For every duplicated event and every column: does the column take more than one distinct value?
duplicated_rows = df[df["GLOBALEVENTID"].isin(ids_)]
tmp = duplicated_rows.pivot_table(index="GLOBALEVENTID", aggfunc=lambda ser: ser.nunique()>1)

# Columns that vary within at least one duplicated event, with the number of events affected
varying_per_column = tmp.sum()
varying_per_column[varying_per_column>0]

In [None]:
# Separate event-relevant from reporting-relevant attributes
# Reporting attributes are the columns that `tmp` (the pivot built in the duplicate check) flags as
# taking more than one value for at least one duplicated GLOBALEVENTID; every other column is event-level.
# NOTE: `tmp` is reassigned further down the notebook, so this cell must run right after the duplicate check.
varying_per_column = tmp.sum()  # computed once instead of once per column inside the comprehension
reporting_attrs = varying_per_column[varying_per_column>0].index.tolist()
event_attrs = [c for c in df.columns if c not in reporting_attrs]
assert df.shape[1] == len(event_attrs) + len(reporting_attrs), "every column must be classified exactly once"

In [None]:
# Sort dataframe, then keep one row per (event, document) pair
# Reason: after sorting by SentenceID within each pair, keeping the first row retains the
# first mention of the same event per document
pair_keys = ["GLOBALEVENTID", "MentionIdentifier"]
df = df.sort_values(pair_keys + ["SentenceID"], ascending=True)
df = df.drop_duplicates(subset=pair_keys, keep="first")
df.shape

In [None]:
# Isolate event details
# Event ID paired with the identifier of each report mentioning it
event_report_map = df.loc[:, ["GLOBALEVENTID", "MentionIdentifier"]]
# One row per distinct combination of event-level attributes
events = df.loc[:, event_attrs].drop_duplicates()
event_report_map.shape, events.shape, docs.shape

In [None]:
# How many unique events are mentioned in a single report?
vc = df["MentionIdentifier"].value_counts()  # number of rows per report identifier
vc.describe()
_ = sns.kdeplot(vc)  # assigned to `_` so the returned Axes object is not echoed as cell output

# Preprocess Event Attributes

In [None]:
# Preview
# Reindex by event ID. Guarded so that re-running the cell does not raise a KeyError
# once GLOBALEVENTID has already been moved from the columns into the index.
if events.index.name != "GLOBALEVENTID":
    events = events.set_index("GLOBALEVENTID", drop=True)
events.head()

## Conflict

In [None]:
# Check for nulls
conflict_codes = ["QuadClass", "CAMEOEventRoot", "CAMEOEventBase", "CAMEOEvent"]
null_counts = events[conflict_codes].isna().sum()  # nulls per conflict-code column
assert (null_counts == 0).all()  # Ensure there are no nulls

In [None]:
# Fill nulls in GoldsteinScore
# According to GDELT Documentation, GoldsteinScore is mapped from CAMEOEvent
# NOTE(review): this imputes on `df` only; `events` was derived from `df` in an earlier cell and is
# not updated here — confirm whether `events` should receive the same imputation.
df["GoldsteinScore"].isna().sum()
# Impute with the mean GoldsteinScore observed for the same CAMEOEvent
df["GoldsteinScore"] = df.groupby("CAMEOEvent")["GoldsteinScore"].transform(lambda x: x.fillna(x.mean()))
# If a CAMEOEvent has no GoldsteinScore at all, back-fill from the next valid row.
# (The previous `fillna("bfill")` wrote the literal string "bfill" into the column instead.)
df["GoldsteinScore"] = df["GoldsteinScore"].bfill()
df["GoldsteinScore"].isna().sum()

In [None]:
# Plot each feature against GoldsteinScore
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20,8), sharex=True)

# One panel per conflict code, filling the 2x2 grid row by row
for i, group in enumerate(conflict_codes):
    row, col = divmod(i, 2)
    ax = axes[row, col]
    ordered = events.sort_values(group, ascending=True)
    _ = sns.kdeplot(data=ordered, x="GoldsteinScore", hue=group, fill=True, legend=False, ax=ax)
    _ = ax.set_title(group)

In [None]:
# Use rank correlation to identify the conflict column that has the weakest association with GoldsteinScore
# Consistent with the visualization above, CAMEOEvent has the weakest and QuadClass the strongest association.
# Note that the differences between the 4 are actually minimal
from scipy.stats import spearmanr
alpha = 0.01
tmp = {}
for conflict_code in conflict_codes:
    # Start at 64% and halve before every attempt (so 32% is the first fraction tried);
    # keep halving while the p-value is NaN and the fraction has not dropped below 1%
    frac, p = 2**6/100, np.nan
    while (frac>=0.01) and pd.isna(p):
        frac /= 2
        events_ = events.sample(frac=frac, random_state=seed)
        rho, p = spearmanr(events_[conflict_code], events_["GoldsteinScore"])
    tmp[conflict_code] = dict(frac=frac, rho=rho, p=p, stat_sig=p<alpha)
pd.DataFrame.from_dict(tmp, orient="index").sort_values("p", ascending=True)

In [None]:
# Create list of selected event-based attributes
# Both lists are extended in later cells and finally used to subset `events` before export
selected_event_attrs = ["GoldsteinScore", "CAMEOEvent"]
blocking_rule_attrs = ["QuadClass"]  # presumably the columns intended for blocking rules — confirm with the consumer

## Event date 

In [None]:
# Preview date-relevant attributes
event_dates = ["SQLDATE", "FractionDate", "EventTimeDate"]  # extended with the parsed datetime column in a later cell
events[event_dates].head()

In [None]:
# EventTimeDate is the most granular
# When does a story get old?
# Chakraborty et al., 2021: 141.77 hours for NYTimes for highest lifetime impact
chakraborty_estimate = 141.77  # hours
np.round(chakraborty_estimate/24, 2)  # Equivalent to how many days?

In [None]:
# Inspect by datetime component
# Raw string literal: "\d" inside a plain string is an invalid escape sequence, which Python
# reports as a warning and documents as a future error. The pattern itself is unchanged.
datetime_pattern = r"(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})"
# One capture group per component of the YYYYMMDDHHMMSS value
tmp = events["EventTimeDate"].astype(str).str.extract(datetime_pattern).astype(int)
tmp.columns = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
tmp.head()  # Preview
tmp.describe()

<div class="alert alert-block alert-info">
    <ul>
        <li>Based on the Chakraborty et al. (2021) study, a news story has the optimal balance between recency and relevance within 5.91 days (141.77 hours).</li>
        <li>Among the datetime-relevant features from GDELT, EventTimeDate is the most granular and can be compared against the Chakraborty et al. estimate.</li>
        <li>Although EventTimeDate is the most granular, its seconds component has no variance.</li>
        <li>Splink deduper supports this comparison between datetime objects based on their <a href="https://moj-analytical-services.github.io/splink/topic_guides/comparisons/customising_comparisons.html#date-comparison">documentation</a>.</li>
    </ul>
</div>

In [None]:
# Convert to datetime object
events["EventDateTime"] = pd.to_datetime(events["EventTimeDate"], format="%Y%m%d%H%M%S")
# Register the new column once: the guard keeps both lists free of duplicate entries
# (and the preview free of duplicate columns) when the cell is re-run
for attr_list in (event_dates, event_attrs):
    if "EventDateTime" not in attr_list:
        attr_list.append("EventDateTime")
events[event_dates].head()  # Preview

In [None]:
# Update list
# Guarded appends: re-running the cell must not add the same name twice, because duplicate
# names in these lists become duplicate columns when `events` is later subset with them
if "EventDateTime" not in selected_event_attrs:
    selected_event_attrs.append("EventDateTime")
if "SQLDATE" not in blocking_rule_attrs:
    blocking_rule_attrs.append("SQLDATE")

## Actors

In [None]:
# Preview every event attribute whose name begins with "Actor"
actor_attrs = [name for name in event_attrs if name.startswith("Actor")]
events[actor_attrs].head()

In [None]:
# Describe
# Select the actor columns first and only then cast to str (previously the whole frame was cast
# before selecting); the "nan" strings produced by the cast are turned back into real nulls
events_ = events[actor_attrs].astype(str).replace("nan", np.nan)
tmp = events_.describe().T
tmp["maj_class_pct"] = tmp["freq"].div(tmp["count"])  # frequency of the most common value over the non-null count
tmp["sparsity"] = events_.isna().mean()  # share of null values per attribute

# View actors 1 and 2 separately
# The 6th character of the column name ("Actor1..." / "Actor2...") is the actor number
tmp[tmp.index.str[5]=="1"].sort_values("sparsity")
tmp[tmp.index.str[5]=="2"].sort_values("sparsity")

In [None]:
# Test for independence
from src.lib.utils.helper_functions import test_for_independence
def pairwise_comparison(lst):
    """Return every unordered pair ``(a, b)`` of unequal items of ``lst``.

    Pairs follow list order: each item is paired with the items that come after it.
    """
    return [(item, item_) for i, item in enumerate(lst) for item_ in lst[i + 1:] if item != item_]


def get_pairwise_independence(df, features, min_frac=0.01):
    """Run ``test_for_independence`` on every unordered pair of ``features``.

    Nulls are replaced by the sentinel ``"__NULL__"`` so that missingness is treated as its
    own category. Each pair is first tested on the full data (``frac=1``); while the returned
    ``"p"`` entry is NaN, the sample fraction is halved and the test repeated, as long as the
    fraction stays at or above ``min_frac``.

    The previous loop condition (``frac<=0.01`` with ``frac`` starting at 1) was never true,
    so the retry on smaller samples never ran.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all ``features`` as columns.
    features : list
        Column names to compare pairwise.
    min_frac : float, default 0.01
        Smallest sample fraction that will be attempted.

    Returns
    -------
    dict
        Maps ``(x, y, frac)`` to the result of ``test_for_independence`` for that pair, where
        ``frac`` is the sample fraction of the last attempt. The result is expected to be a
        mapping with a ``"p"`` entry; any other content depends on the helper.
    """
    results = {}
    for x, y in pairwise_comparison(features):
        series_x, series_y = df[x].fillna("__NULL__"), df[y].fillna("__NULL__")
        frac = 1
        while True:
            # Same frac and random_state for both columns, as in the original sampling calls
            x_y_results = test_for_independence(
                series_x.sample(frac=frac, random_state=seed),
                series_y.sample(frac=frac, random_state=seed),
            )
            # Stop on a usable p-value, or when halving again would go below min_frac
            if not pd.isna(x_y_results["p"]) or frac / 2 < min_frac:
                break
            frac = frac / 2
        results[(x, y, frac)] = x_y_results  # reuse the last result instead of recomputing it
    return results

# Actor1 attributes that are at most 50% null, tested pairwise for independence
is_actor1 = tmp.index.str[5]=="1"
actor1_attrs = tmp[is_actor1 & (tmp["sparsity"]<=0.5)].index.tolist()
actor1_independence = get_pairwise_independence(events, actor1_attrs)
pd.DataFrame.from_dict(actor1_independence, orient="index").dropna().reset_index()

# Same selection and test for the Actor2 attributes
is_actor2 = tmp.index.str[5]=="2"
actor2_attrs = tmp[is_actor2 & (tmp["sparsity"]<=0.5)].index.tolist()
actor2_independence = get_pairwise_independence(events, actor2_attrs)
pd.DataFrame.from_dict(actor2_independence, orient="index").dropna().reset_index()

In [None]:
# Update list
selected_event_actor_attrs = ["Actor1Code", "Actor1Name", "Actor1Geo_FullName", "Actor2Code", "Actor2Name", "Actor2Geo_FullName"]
# Only add names that are not already present, so re-running the cell does not create duplicate
# entries (which would become duplicate columns when `events` is subset with these lists)
selected_event_attrs.extend(attr for attr in selected_event_actor_attrs if attr not in selected_event_attrs)
blocking_rule_attrs.extend(attr for attr in ["Actor1CountryCode", "Actor2CountryCode"] if attr not in blocking_rule_attrs)

In [None]:
# Check for most common combinations
# Nulls are replaced by the "__NULL__" sentinel first so that missing values form part of a combination;
# the output lists the 20 most frequent combinations with their share of rows
events[selected_event_actor_attrs].fillna("__NULL__").value_counts(normalize=True).head(20).reset_index()

In [None]:
# Store in separate df
events_bu = events.copy()  # Back up original dataframe
# Keep the blocking-rule columns first, followed by the selected event attributes
export_columns = blocking_rule_attrs + selected_event_attrs
events = events.loc[:, export_columns]
events.head(10)

In [None]:
# Export the cleaned events table as a gzip-compressed parquet file
cleaned_events_path = PROJECT_DIRECTORY.joinpath("data/cleaned/cleaned_events.parquet")
events.to_parquet(cleaned_events_path, compression="gzip")