In [1]:
# General use
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Mappers
from src.lib.utils.constants import country_code_map, event_code_map, geo_type_code_map, role_code_map, gov_arm_code_map, country_role_code_map, country_gov_arm_code_map

# Get path and env
from src.lib.utils.path_finder import PROJECT_DIRECTORY
from src.lib.utils.config import config
from src.lib.utils.env_checker import running_environment
env = running_environment()

# Notebook behavior
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
seed = config["SEED"]
filler = "__NULL__"
pd.set_option('display.max_rows', 100)

In [2]:
# Load data
fname = "cleaned_events.parquet"
if env=="sagemaker":
    fpath = f"s3://news-s3/data/cleaned/{fname}"
else:
    fpath = PROJECT_DIRECTORY.joinpath(f"data/cleaned/{fname}")
df = pd.read_parquet(fpath).fillna(np.nan)
df.shape
df.head()

(883292, 13)

Unnamed: 0_level_0,QuadClass,SQLDATE,Actor1CountryCode,Actor2CountryCode,GoldsteinScore,CAMEOEvent,EventDateTime,Actor1Code,Actor1Name,Actor1Geo_FullName,Actor2Code,Actor2Name,Actor2Geo_FullName
GLOBALEVENTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1078326026,1,20230101,,,2.8,43,2023-01-01,,,,COP,POLICE OFFICER,
1078326027,4,20230101,,,-5.0,173,2023-01-01,,,,COP,DETECTIVE,
1078326030,4,20230101,,,-10.0,190,2023-01-01,,,,COP,DETECTIVE,
1078326080,2,20230101,,RUS,10.0,874,2023-01-01,,,,RUS,MOSCOW,"Moscow, Moskva, Russia"
1078326081,2,20230101,,RUS,10.0,874,2023-01-01,,,,RUS,MOSCOW,"Monroe County, Pennsylvania, United States"


In [3]:
# Check columns
tmp = df.apply([pd.Series.nunique, lambda ser: ser.isna().mean()]).T
tmp.columns = ["nunique", "sparsity"]
tmp.sort_values("sparsity", ascending=True)

Unnamed: 0,nunique,sparsity
QuadClass,4.0,0.0
SQLDATE,441.0,0.0
GoldsteinScore,42.0,0.0
CAMEOEvent,228.0,0.0
EventDateTime,36085.0,0.0
Actor1Code,2987.0,0.077113
Actor1Name,3544.0,0.077113
Actor1Geo_FullName,9939.0,0.101719
Actor2Code,2780.0,0.259698
Actor2Name,3334.0,0.259698


In [4]:
# Check how much can be matched
country_keys = list(country_code_map.keys())
role_keys = list(role_code_map.keys())
country_role_keys = list(country_role_code_map.keys())
gov_arm_keys = list(gov_arm_code_map.keys())
country_gov_arm_keys = list(country_gov_arm_code_map.keys())
known_keys =  country_keys + role_keys + country_role_keys + gov_arm_keys + country_gov_arm_keys

fxn = lambda ser: {"Sparsity": ser.isna().mean(), 
                   "Country Keys":ser.dropna().isin(country_keys).mean(), 
                   "Role Keys":ser.dropna().isin(role_keys).mean(), 
                   "Country Role Keys":ser.dropna().isin(country_role_keys).mean(), 
                   "Gov Arm Keys":ser.dropna().isin(gov_arm_keys).mean(), 
                   "Country Gov Arm Keys":ser.dropna().isin(country_gov_arm_keys).mean(), 
                   "Known Keys":ser.dropna().isin(known_keys).mean()}
pd.DataFrame.from_dict({code:fxn(df[code])for code in ["Actor1Code", "Actor2Code"]})

Unnamed: 0,Actor1Code,Actor2Code
Sparsity,0.077113,0.259698
Country Keys,0.550827,0.549435
Role Keys,0.27419,0.267856
Country Role Keys,0.106052,0.114304
Gov Arm Keys,0.007836,0.00731
Country Gov Arm Keys,0.005899,0.004417
Known Keys,0.944805,0.943322


In [5]:
# Map

# Case 1: Can be mapped from known codes
# map_ = {**country_code_map, **role_code_map, **country_role_code_map, **gov_arm_code_map, **country_gov_arm_code_map, **{np.nan: "NULL"}}
map_ = {**country_code_map, **role_code_map, **country_role_code_map, **gov_arm_code_map, **country_gov_arm_code_map}
df["Actor1Role"] = df["Actor1Code"].map(map_).str.upper()
df["Actor2Role"] = df["Actor2Code"].map(map_).str.upper()

# Case 2: Code available but not mapped
df["Actor1Role"] = df.apply(lambda row: "UNKNOWN" if not pd.isna(row["Actor1Code"]) and pd.isna(row["Actor1Role"]) else row["Actor1Role"], axis=1)
df["Actor2Role"] = df.apply(lambda row: "UNKNOWN" if not pd.isna(row["Actor2Code"]) and pd.isna(row["Actor2Role"]) else row["Actor2Role"], axis=1)

# Case 3: No code available
df["Actor1Role"] = df["Actor1Role"].fillna("NULL")
df["Actor2Role"] = df["Actor2Role"].fillna("NULL")

In [6]:
# Map interaction between actors
map_ = {
    **{k:"COUNTRY" for k in country_keys}, 
    **{k:"ROLE" for k in role_keys}, 
    **{k:"COUNTRY ROLE" for k in country_role_keys}, 
    **{k:"GOV ARM" for k in gov_arm_keys}, 
    **{k:"COUNTRY GOV ARM" for k in country_gov_arm_keys}, 
    **{"UNKNOWN": "UNKNOWN", "NULL": "NULL"}
}
df["Actor1Group"] = df["Actor1Code"].map(map_)
df["Actor2Group"] = df["Actor2Code"].map(map_)
df["Actor1Group"] = df.apply(lambda row: row["Actor1Role"] if row["Actor1Role"] in ["UNKNOWN", "NULL"] else row["Actor1Group"], axis=1)
df["Actor2Group"] = df.apply(lambda row: row["Actor2Role"] if row["Actor2Role"] in ["UNKNOWN", "NULL"] else row["Actor1Group"], axis=1)

df["ActorRoleInteraction"] = df["Actor1Group"] + "-" + df["Actor2Group"]
df["ActorEquivalentRoles"] = df["Actor1Group"] == df["Actor2Group"]

df["ActorRoleInteraction"].value_counts(normalize=True).head()
df["ActorEquivalentRoles"].value_counts(normalize=True).head()

ActorRoleInteraction
COUNTRY-COUNTRY              0.352747
ROLE-ROLE                    0.151077
COUNTRY-NULL                 0.132287
ROLE-NULL                    0.094211
COUNTRY ROLE-COUNTRY ROLE    0.076466
Name: proportion, dtype: float64

ActorEquivalentRoles
True     0.701982
False    0.298018
Name: proportion, dtype: float64

In [7]:
# Check if code lengths are in multiples of 3
tmp = df[[c for c in df.columns if c[-4:]=="Code"]].melt(value_name="code", var_name="code_type").dropna()
tmp["code_len"] = tmp["code"].str.len()
tmp.pivot_table(index="code_type", columns="code_len", values="code", aggfunc="count", fill_value=0)

code_len,3,6,9,12,15
code_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Actor1Code,682852,109788,19598,2793,148
Actor1CountryCode,555669,0,0,0,0
Actor2Code,543256,93206,15155,2156,130
Actor2CountryCode,449358,0,0,0,0


In [8]:
# Check if there are unexpected values among country codes
country_codes_w_in_data = tmp.query("code_type.isin(['Actor1CountryCode', 'Actor2CountryCode'])")["code"]
country_codes_w_in_data[country_codes_w_in_data.map(country_code_map).isna()].drop_duplicates().values.tolist()

[]

In [9]:
# Recode by splitting into 3-chars
insert_wspace_every_3_chars = lambda x: " ".join([x[i:i+3] for i in range(0, len(str(x)), 3)])
df["Actor1Code"] = df["Actor1Code"].apply(lambda x: insert_wspace_every_3_chars(str(x)) if (len(str(x))<3) or pd.isna(x)==False else x)
df["Actor2Code"] = df["Actor2Code"].apply(lambda x: insert_wspace_every_3_chars(str(x)) if (len(str(x))<3) or pd.isna(x)==False else x)

<div class="alert alert-block alert-info">
    <ul>
        <li> No issues found in <b>QuadClass, SQLDATE, CAMEOEvent, EventDateTime</b>.</li>
        <li><b>Actor1CountryCode</b> and <b>Actor2CountryCode</b> represents only one country at a time using string of charater length 3. No unexpected values found. <i>Exact string-matching rule is appropriate.</i></li>
        <li><b>Actor1CCode</b> and <b>Actor2Code</b> has character lengths in multiples of 3 between 3 to 15. This translates to 1-5 coded groups. <i>Jaccard or token set ratio are appropriate string comparison metrics.</i></li>
    </ul>
</div>


In [10]:
# Use up to 3
df["Actor1Geo_FullName"].dropna().str.count(",").value_counts(normalize=True).sort_index().cumsum()

Actor1Geo_FullName
0    0.207913
1    0.243015
2    0.973200
3    0.999995
4    1.000000
Name: proportion, dtype: float64

In [11]:
# Split geo into sections
df["Actor1Geo_Country"] = df["Actor1Geo_FullName"].str.split(",").str[-1]
df["Actor1Geo_Region"] = df["Actor1Geo_FullName"].str.split(",").str[-2]
df["Actor1Geo_City"] = df["Actor1Geo_FullName"].str.split(",").str[-3]
df["Actor2Geo_Country"] = df["Actor2Geo_FullName"].str.split(",").str[-1]
df["Actor2Geo_Region"] = df["Actor2Geo_FullName"].str.split(",").str[-2]
df["Actor2Geo_City"] = df["Actor2Geo_FullName"].str.split(",").str[-3]

df.iloc[:10,-6:]  # Preview

Unnamed: 0_level_0,Actor1Geo_Country,Actor1Geo_Region,Actor1Geo_City,Actor2Geo_Country,Actor2Geo_Region,Actor2Geo_City
GLOBALEVENTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1078326026,,,,,,
1078326027,,,,,,
1078326030,,,,,,
1078326080,,,,Russia,Moskva,Moscow
1078326081,,,,United States,Pennsylvania,Monroe County
1078326209,China,Beijing,Beijing,,,
1078326246,,,,,,
1078326357,Russia,Moskva,Moscow,Russia,Moskva,Moscow
1078326359,Russia,Moskva,Moscow,Russia,Moskva,Moscow
1078326399,United States,Pennsylvania,Monroe County,,,


In [12]:
# Limit to 1 year
lim = df["SQLDATE"].drop_duplicates().sort_values(ascending=False)[365]
df = df[df["SQLDATE"]>=lim]
df = df.drop(columns=["SQLDATE"], axis=1)
df.shape

  lim = df["SQLDATE"].drop_duplicates().sort_values(ascending=False)[365]


(756655, 24)

In [13]:
# Rearrage cols
df = df[[
    "EventDateTime",
    "QuadClass",
    "CAMEOEvent",
    "GoldsteinScore",
    "Actor1Name",
    "Actor1Code",
    "Actor1Role",
    "Actor1Group",
    "Actor2Name",
    "Actor2Code",
    "Actor2Role",
    "Actor2Group",
    "ActorRoleInteraction",
    "ActorEquivalentRoles",
    "Actor1CountryCode",
    "Actor1Geo_FullName",
    "Actor1Geo_Country",
    "Actor1Geo_Region",
    "Actor1Geo_City",
    "Actor2CountryCode",
    "Actor2Geo_FullName",
    "Actor2Geo_Country",
    "Actor2Geo_Region",
    "Actor2Geo_City"
]]

In [14]:
# Describe
df.describe()

df.describe(include="O")

Unnamed: 0,EventDateTime,GoldsteinScore
count,756655,756655.0
mean,2023-09-11 09:21:44.880163328,0.545647
min,2023-03-17 00:00:00,-10.0
25%,2023-06-20 04:00:00,-2.0
50%,2023-09-15 01:00:00,1.9
75%,2023-11-27 05:15:00,3.4
max,2024-03-16 11:30:00,10.0
std,,4.6352


Unnamed: 0,QuadClass,CAMEOEvent,Actor1Name,Actor1Code,Actor1Role,Actor1Group,Actor2Name,Actor2Code,Actor2Role,Actor2Group,...,Actor1CountryCode,Actor1Geo_FullName,Actor1Geo_Country,Actor1Geo_Region,Actor1Geo_City,Actor2CountryCode,Actor2Geo_FullName,Actor2Geo_Country,Actor2Geo_Region,Actor2Geo_City
count,756655,756655,699026,699026,756655,756655,561732,561732,756655.0,756655,...,479253,681511,681511,539644,515136,388140,551933,551933,434612,416573
unique,4,227,3404,2810,1721,7,3190,2604,1627.0,7,...,216,9065,418,1750,8417,214,8202,414,1684,7601
top,1,10,PHILIPPINE,PHL,PHILIPPINES,COUNTRY,PHILIPPINE,PHL,,COUNTRY,...,PHL,"Manila, Manila, Philippines",Philippines,Manila,Manila,PHL,"Manila, Manila, Philippines",Philippines,Manila,Manila
freq,473878,65987,107013,123312,123312,386584,79886,87107,194923.0,268965,...,144014,55608,235183,55765,55608,104685,43015,178945,43136,43015


In [15]:
from src.lib.utils.helper_functions import test_for_independence
cat_cols = df.select_dtypes(include="O").columns
pairwise_cols = [(c, c_) for i, c in enumerate(cat_cols) for c_ in cat_cols[i:] if c!=c_]
results = {}
for a, b in pairwise_cols:
    subset = df[[a, b]].dropna()
    if subset.size > 1:
        try:
            results[(a, b)] = test_for_independence(subset[a], subset[b])
        except Exception as e:
            if "No data; `observed` has size 0." in e.args[0]:
                print(f"Crosstab between {a}, {b} has 0s.")
results = pd.DataFrame.from_dict(results, orient="index")
results["S.Sig"] = results["P-value"] < 0.01
results



Crosstab between QuadClass, CAMEOEvent has 0s.
Crosstab between QuadClass, Actor1Name has 0s.
Crosstab between QuadClass, Actor1Code has 0s.
Crosstab between QuadClass, Actor1Role has 0s.
Crosstab between QuadClass, Actor1Group has 0s.
Crosstab between QuadClass, Actor2Name has 0s.
Crosstab between QuadClass, Actor2Code has 0s.
Crosstab between QuadClass, Actor2Role has 0s.
Crosstab between QuadClass, Actor2Group has 0s.
Crosstab between QuadClass, ActorRoleInteraction has 0s.
Crosstab between QuadClass, Actor1CountryCode has 0s.
Crosstab between QuadClass, Actor1Geo_FullName has 0s.
Crosstab between QuadClass, Actor1Geo_Country has 0s.
Crosstab between QuadClass, Actor1Geo_Region has 0s.
Crosstab between QuadClass, Actor1Geo_City has 0s.
Crosstab between QuadClass, Actor2CountryCode has 0s.
Crosstab between QuadClass, Actor2Geo_FullName has 0s.
Crosstab between QuadClass, Actor2Geo_Country has 0s.
Crosstab between QuadClass, Actor2Geo_Region has 0s.
Crosstab between QuadClass, Actor2G

Unnamed: 0,Unnamed: 1,Test,Statistic,P-value,S.Sig
Actor1Name,Actor1Role,CHI SQUARED,44950.272543,0.0,True
Actor1Name,Actor2Name,CHI SQUARED,21388.641825,0.0,True
Actor1Name,Actor2Role,CHI SQUARED,32493.030139,0.0,True
Actor1Code,Actor1Role,CHI SQUARED,0.0,1.0,False
Actor1Code,Actor2Code,CHI SQUARED,13160.09316,0.0,True
Actor1Code,Actor2Role,CHI SQUARED,0.0,1.0,False
Actor1Code,Actor1CountryCode,CHI SQUARED,4519.141007,0.0,True
Actor1Code,Actor2CountryCode,CHI SQUARED,7204.103893,0.0,True
Actor1Role,Actor1Group,CHI SQUARED,0.0,1.0,False
Actor1Role,Actor2Name,CHI SQUARED,30612.440179,0.0,True


In [16]:
# Export
df.to_parquet(PROJECT_DIRECTORY.joinpath("data/explored/explored_docs.parquet"), index=True, compression="gzip")