In [None]:

#Import the libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Mount Drive
from google.colab import drive
drive.mount('/content/drive')  # you only need to run this once per session


#Load dataset

MASTER_WIDE_PATH = "/content/drive/MyDrive/GDELT/clean_wide/GDELT_TRANS_US_FULL_master_wide.tsv"

df = pd.read_csv(MASTER_WIDE_PATH, sep="\t")
print("Shape of master_wide:", df.shape)
print("First few columns:", df.columns.tolist()[:15])
df.head()


Mounted at /content/drive
Shape of master_wide: (166280, 21)
First few columns: ['date', 'year', 'month', 'year_month', 'source_collection_id', 'website', 'themes_raw', 'locations_raw', 'persons_raw', 'organizations_raw', 'tone_mean', 'tone_positive', 'tone_negative', 'tone_polarity', 'tone_activity']


Unnamed: 0,date,year,month,year_month,source_collection_id,website,themes_raw,locations_raw,persons_raw,organizations_raw,...,tone_positive,tone_negative,tone_polarity,tone_activity,tone_selfgroup,urls_raw,country,state,lat,lon
0,2021-01-18,2021,1,2021-01,1,digitalmedianet.com,GEN_HOLIDAY;TAX_FNCACT;TAX_FNCACT_DESIGNER;TAX...,1#United States#US#US#39.828175#-98.5795#US,michael keller;david rudd;bryan hartley;burton...,music group production;lighting inc,...,2.941176,0.534759,3.475936,21.925134,1.069519,http://DigitalMediaNet.com/trans-siberian-orch...,US,US,39.828175,-98.5795
1,2021-01-25,2021,1,2021-01,1,abcnewsradioonline.com,LEADER;TAX_FNCACT;TAX_FNCACT_PRESIDENT;USPEC_P...,"3#Washington, Washington, United States#US#USD...",donald trump;lloyd austin;joe biden;ivan chola...,white house;senate armed services committee,...,2.380952,3.896104,6.277056,26.839827,1.082251,http://abcnewsradioonline.com/politics-news/bi...,US,USDC,38.8951,-77.0364
2,2021-01-07,2021,1,2021-01,1,baguiomidlandcourier.com.ph,WB_698_TRADE;EPU_ECONOMY_HISTORIC;TAX_FNCACT;T...,1#Malaysia#MY#MY#2.5#112.5#MY;1#Australia#AS#A...,ramon lopez;ceferino rodolfo,pacific partnership;regional comprehensive eco...,...,4.779412,0.367647,5.147059,20.955882,0.735294,http://baguiomidlandcourier.com.ph/city.asp?mo...,MY,MY,2.5,112.5
3,2021-01-18,2021,1,2021-01,1,claycord.com,LEADER;TAX_FNCACT;TAX_FNCACT_LAWMAKER;USPEC_PO...,1#China#CH#CH#35#105#CH;1#Germany#GM#GM#51.5#1...,angela merkel;d-san francisco;rick chavez zbur...,google;university of california;do club;califo...,...,1.552665,4.448175,6.000839,25.094419,2.559799,http://claycord.com/2021/01/15/lawmaker-propos...,CH,CH,35.0,105.0
4,2021-01-24,2021,1,2021-01,1,en.protothema.gr,EDUCATION;,"2#Montana, United States#US#USMT#46.9048#-110....",brad little;joe biden,american civil liberties union;alliance defend...,...,3.581267,5.23416,8.815427,22.038567,0.550964,http://en.protothema.gr/joe-bidens-gender-disc...,US,USMT,46.9048,-110.326


In [None]:

#Create the target variable "tone_label" from "tone_mean"
# - "negative"  if tone_mean < -1
# - "positive"  if tone_mean > 1
# - "neutral"   otherwise (between -1 and 1, or missing)


df["tone_mean_num"] = pd.to_numeric(df["tone_mean"], errors="coerce")

def label_tone(x):
    """
    This function implements our rule for mapping a numeric tone_mean
    into a categorical label.

    - If x is less than -1, we call the article "negative".
    - If x is greater than 1, we call it "positive".
    - Otherwise (between -1 and 1, or missing), we call it "neutral".
    """
    if pd.isna(x):
        return "neutral"  # you could also use "unknown" if you prefer
    if x < -1:
        return "negative"
    if x > 1:
        return "positive"
    return "neutral"

# Apply the function row-by-row to create the new column
df["tone_label"] = df["tone_mean_num"].apply(label_tone)

# Check the distribution of our new labels
print("Tone label distribution:")
print(df["tone_label"].value_counts(normalize=True))


Tone label distribution:
tone_label
negative    0.648316
neutral     0.241677
positive    0.110007
Name: proportion, dtype: float64


In [None]:
#Build the feature matrix X and target vector y

feature_cols = ["tone_positive", "tone_negative", "tone_polarity",
                "tone_activity", "tone_selfgroup"]

#Convert these feature columns to numeric
for col in feature_cols:
    df[col + "_num"] = pd.to_numeric(df[col], errors="coerce")

numeric_cols = [c + "_num" for c in feature_cols]

#feature matrix X are these numeric columns
X = df[numeric_cols]

#Replace any remaining NaNs (missing values) with 0
X = X.fillna(0)

#target vector y is the tone_label column we just created
y = df["tone_label"]

print("Feature columns:", numeric_cols)
print("X shape:", X.shape)
print("y shape:", y.shape)


Feature columns: ['tone_positive_num', 'tone_negative_num', 'tone_polarity_num', 'tone_activity_num', 'tone_selfgroup_num']
X shape: (166280, 5)
y shape: (166280,)


In [None]:

#Apply the "Golden Rule" data splits


from sklearn.model_selection import train_test_split

# First split: Train (70%) vs Temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.30,          # 30% goes into temp (val+test)
    random_state=42,         # ensures results are reproducible
    stratify=y               # keep class balance across splits
)

# Second split: Temp (30%) -> Validation (15%) and Test (15%)
# We split the temp set in half, so we get 15% / 15%.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,          # half of temp -> test, half -> val
    random_state=42,
    stratify=y_temp
)

print("Train shape:", X_train.shape, "Labels:", y_train.value_counts(normalize=True))
print("Val shape:  ", X_val.shape,   "Labels:", y_val.value_counts(normalize=True))
print("Test shape: ", X_test.shape,  "Labels:", y_test.value_counts(normalize=True))


Train shape: (116396, 5) Labels: tone_label
negative    0.648313
neutral     0.241675
positive    0.110012
Name: proportion, dtype: float64
Val shape:   (24942, 5) Labels: tone_label
negative    0.648304
neutral     0.241681
positive    0.110015
Name: proportion, dtype: float64
Test shape:  (24942, 5) Labels: tone_label
negative    0.648344
neutral     0.241681
positive    0.109975
Name: proportion, dtype: float64


In [None]:

#Naïve Bayes + Theme Frequencies

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix


print("Dataset shape:", df.shape)


#Extract theme strings

#Replace missing theme values
df["themes_raw"] = df["themes_raw"].fillna("")

#Use CountVectorizer to convert themes into a matrix
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(";"))
X_themes = vectorizer.fit_transform(df["themes_raw"])

print("Theme feature matrix shape:", X_themes.shape)

#Target is tone_label
y = df["tone_label"]


Dataset shape: (166280, 28)




Theme feature matrix shape: (166280, 6580)


In [None]:
from sklearn.model_selection import train_test_split

#Split
X_train_t, X_temp_t, y_train_t, y_temp_t = train_test_split(
    X_themes, y, test_size=0.30, random_state=42, stratify=y
)

X_val_t, X_test_t, y_val_t, y_test_t = train_test_split(
    X_temp_t, y_temp_t, test_size=0.50, random_state=42, stratify=y_temp_t
)

print("Train shape:", X_train_t.shape)
print("Val shape:", X_val_t.shape)
print("Test shape:", X_test_t.shape)


Train shape: (116396, 6580)
Val shape: (24942, 6580)
Test shape: (24942, 6580)


In [None]:

#Train Multinomial Naïve Bayes (good for frequency data)


nb_model = MultinomialNB()
nb_model.fit(X_train_t, y_train_t)

#Validation performance
y_val_pred_nb = nb_model.predict(X_val_t)
print("=== Naïve Bayes VALIDATION ===")
print(classification_report(y_val_t, y_val_pred_nb))

#Test performance
y_test_pred_nb = nb_model.predict(X_test_t)
print("\n=== Naïve Bayes TEST ===")
print(classification_report(y_test_t, y_test_pred_nb))

#Confusion matrix
cm_nb = confusion_matrix(y_test_t, y_test_pred_nb,
                         labels=["negative","neutral","positive"])
print("\nConfusion matrix (TEST):")
print(cm_nb)


=== Naïve Bayes VALIDATION ===
              precision    recall  f1-score   support

    negative       0.75      0.83      0.79     16170
     neutral       0.43      0.32      0.36      6028
    positive       0.44      0.43      0.43      2744

    accuracy                           0.66     24942
   macro avg       0.54      0.52      0.53     24942
weighted avg       0.64      0.66      0.65     24942


=== Naïve Bayes TEST ===
              precision    recall  f1-score   support

    negative       0.75      0.83      0.79     16171
     neutral       0.43      0.31      0.36      6028
    positive       0.46      0.44      0.45      2743

    accuracy                           0.66     24942
   macro avg       0.55      0.53      0.53     24942
weighted avg       0.64      0.66      0.65     24942


Confusion matrix (TEST):
[[13477  1956   738]
 [ 3462  1877   689]
 [ 1047   501  1195]]


In [None]:
#TOP THEMES PREDICTING NEGATIVE / POSITIVE / NEUTRAL TONE


feature_names = vectorizer.get_feature_names_out()
log_probs = nb_model.feature_log_prob_

# Convert log-probabilities to a DataFrame
nb_df = pd.DataFrame(log_probs.T, columns=nb_model.classes_, index=feature_names)

print("\nTop 20 themes predicting NEGATIVE tone:")
print(nb_df.sort_values(by="negative", ascending=False).head(20))

print("\nTop 20 themes predicting POSITIVE tone:")
print(nb_df.sort_values(by="positive", ascending=False).head(20))

print("\nTop 20 themes predicting NEUTRAL tone:")
print(nb_df.sort_values(by="neutral", ascending=False).head(20))



Top 20 themes predicting NEGATIVE tone:
                                negative   neutral  positive
                               -3.669836 -3.465980 -3.339765
tax_fncact                     -3.796000 -3.679376 -3.465321
lgbt                           -4.016701 -3.929130 -3.785293
epu_policy                     -4.108567 -4.091073 -4.211793
medical                        -4.285284 -4.270882 -4.433311
uspec_politics_general1        -4.290458 -4.263516 -4.347745
education                      -4.301534 -3.979510 -4.150786
general_health                 -4.315485 -4.314650 -4.469699
leader                         -4.315539 -4.339250 -4.494427
legislation                    -4.378798 -4.463462 -4.724066
wb_615_gender                  -4.384394 -4.243580 -4.416667
soc_pointsofinterest           -4.466558 -4.384975 -4.448312
ban                            -4.495235 -4.705667 -5.400696
ungp_forests_rivers_oceans     -4.519827 -4.414718 -4.276534
crisislex_c03_wellbeing_health -4.545279 -4.

In [None]:
### CLUSTER ###

In [None]:
from sklearn.cluster import KMeans

In [None]:
# df has columns: website, tone_label, etc.

# 1. Compute tone proportions per outlet
tone_by_outlet = (
    df
    .groupby(['website', 'tone_label'])
    .size()
    .unstack(fill_value=0)
)

# convert to proportions
X = tone_by_outlet.div(tone_by_outlet.sum(axis=1), axis=0)

# 2. k-means on these proportions
k = 3
km = KMeans(n_clusters=k, random_state=42)
clusters = km.fit_predict(X)

outlet_clusters = tone_by_outlet.copy()
outlet_clusters['cluster'] = clusters
outlet_clusters.head()

tone_label,negative,neutral,positive,cluster
website,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1005freshradio.ca,2,0,0,0
1007thetiger.com,0,1,0,1
1009theeagle.com,1,0,0,0
1010wcsi.com,29,6,1,0
1011now.com,25,10,5,0


In [None]:
# X is the proportion table you clustered on
cluster_centers = pd.DataFrame(
    km.cluster_centers_,
    columns=X.columns
)
cluster_centers.index.name = "cluster"
cluster_centers

tone_label,negative,neutral,positive
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.863815,0.091917,0.044268
1,0.035098,0.921651,0.043251
2,0.029077,0.026713,0.94421


In [None]:
outlet_clusters['cluster'].value_counts()

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
0,9653
1,3033
2,2057


In [None]:
def name_cluster(row):
    # row has columns: negative, neutral, positive
    if row['negative'] > 0.7:
        return "Mostly negative outlets"
    elif row['positive'] > 0.2:
        return "More positive/neutral outlets"
    else:
        return "Mixed outlets"

cluster_centers['cluster_label'] = cluster_centers.apply(name_cluster, axis=1)
cluster_centers


tone_label,negative,neutral,positive,cluster_label
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.863815,0.091917,0.044268,Mostly negative outlets
1,0.035098,0.921651,0.043251,Mixed outlets
2,0.029077,0.026713,0.94421,More positive/neutral outlets


In [None]:
# outlet_clusters currently has counts + 'cluster'
outlet_clusters = outlet_clusters.join(
    cluster_centers['cluster_label'],
    on='cluster'
)

outlet_clusters.head()

Unnamed: 0_level_0,negative,neutral,positive,cluster,cluster_label
website,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1005freshradio.ca,2,0,0,0,Mostly negative outlets
1007thetiger.com,0,1,0,1,Mixed outlets
1009theeagle.com,1,0,0,0,Mostly negative outlets
1010wcsi.com,29,6,1,0,Mostly negative outlets
1011now.com,25,10,5,0,Mostly negative outlets


In [None]:
df_with_clusters = df.merge(
    outlet_clusters[['cluster', 'cluster_label']],
    left_on='website',
    right_index=True,
    how='left'
)

df_with_clusters.head()

Unnamed: 0,date,year,month,year_month,source_collection_id,website,themes_raw,locations_raw,persons_raw,organizations_raw,...,lon,tone_mean_num,tone_label,tone_positive_num,tone_negative_num,tone_polarity_num,tone_activity_num,tone_selfgroup_num,cluster,cluster_label
0,2021-01-18,2021,1,2021-01,1,digitalmedianet.com,GEN_HOLIDAY;TAX_FNCACT;TAX_FNCACT_DESIGNER;TAX...,1#United States#US#US#39.828175#-98.5795#US,michael keller;david rudd;bryan hartley;burton...,music group production;lighting inc,...,-98.5795,2.406417,positive,2.941176,0.534759,3.475936,21.925134,1.069519,2,More positive/neutral outlets
1,2021-01-25,2021,1,2021-01,1,abcnewsradioonline.com,LEADER;TAX_FNCACT;TAX_FNCACT_PRESIDENT;USPEC_P...,"3#Washington, Washington, United States#US#USD...",donald trump;lloyd austin;joe biden;ivan chola...,white house;senate armed services committee,...,-77.0364,-1.515152,negative,2.380952,3.896104,6.277056,26.839827,1.082251,0,Mostly negative outlets
2,2021-01-07,2021,1,2021-01,1,baguiomidlandcourier.com.ph,WB_698_TRADE;EPU_ECONOMY_HISTORIC;TAX_FNCACT;T...,1#Malaysia#MY#MY#2.5#112.5#MY;1#Australia#AS#A...,ramon lopez;ceferino rodolfo,pacific partnership;regional comprehensive eco...,...,112.5,4.411765,positive,4.779412,0.367647,5.147059,20.955882,0.735294,2,More positive/neutral outlets
3,2021-01-18,2021,1,2021-01,1,claycord.com,LEADER;TAX_FNCACT;TAX_FNCACT_LAWMAKER;USPEC_PO...,1#China#CH#CH#35#105#CH;1#Germany#GM#GM#51.5#1...,angela merkel;d-san francisco;rick chavez zbur...,google;university of california;do club;califo...,...,105.0,-2.89551,negative,1.552665,4.448175,6.000839,25.094419,2.559799,0,Mostly negative outlets
4,2021-01-24,2021,1,2021-01,1,en.protothema.gr,EDUCATION;,"2#Montana, United States#US#USMT#46.9048#-110....",brad little;joe biden,american civil liberties union;alliance defend...,...,-110.326,-1.652893,negative,3.581267,5.23416,8.815427,22.038567,0.550964,0,Mostly negative outlets


In [None]:
tone_by_cluster = (
    df_with_clusters
      .groupby(['cluster_label', 'tone_label'])
      .size()
      .groupby(level=0)
      .apply(lambda s: s / s.sum())   # convert to proportions
      .unstack()
)

tone_by_cluster

Unnamed: 0_level_0,tone_label,negative,neutral,positive
cluster_label,cluster_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mixed outlets,Mixed outlets,0.162485,0.706768,0.130747
More positive/neutral outlets,More positive/neutral outlets,0.113791,0.11544,0.770769
Mostly negative outlets,Mostly negative outlets,0.688399,0.223393,0.088208


In [None]:
outlet_clusters_reset = outlet_clusters.reset_index()  # bring 'website' out of index

outlet_clusters_reset.to_csv(
    "outlet_clusters_wide_for_tableau.csv",
    index=False
)

In [None]:
km.inertia_

927.5713730904738

In [None]:
#Article-level data with cluster labels
df_with_clusters.to_csv("gdelt_trans_articles_with_clusters.csv", index=False)


In [None]:

output_path = "/content/drive/MyDrive/GDELT/clean_wide/gdelt_trans_articles_with_clusters.csv"

df_with_clusters.to_csv(output_path, index=False)

print("Saved to:", output_path)


Saved to: /content/drive/MyDrive/GDELT/clean_wide/gdelt_trans_articles_with_clusters.csv


In [None]:
import os

print("Current working directory:", os.getcwd())

Current working directory: /content


In [None]:
metrics_df.to_csv("/content/drive/MyDrive/GDELT/model_metrics_summary.csv", index=False)
