# 02 NLP Analysis
Build TF-IDF features to contrast the vocabulary of Global North and Global South AI strategy texts.

In [1]:

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Input and output locations, all resolved relative to the parent of the
# notebook's working directory.
DATA_DIR = Path('..', 'data')
RESULTS_DIR = Path('..', 'results')

# Inputs: text files in `processed`, auxiliary tables in `external`.
PROCESSED_DIR = DATA_DIR.joinpath('processed')
EXTERNAL_DIR = DATA_DIR.joinpath('external')  # e.g., region mapping CSV

# Outputs: make sure the tables folder exists before any cell writes into it.
TABLES_DIR = RESULTS_DIR.joinpath('tables')
TABLES_DIR.mkdir(parents=True, exist_ok=True)


In [2]:

# Expect a CSV mapping filename -> region (north/south)
mapping_path = EXTERNAL_DIR / 'region_mapping.csv'
region_df = pd.read_csv(mapping_path)

# Fail here with a readable message instead of a bare KeyError in a later cell
# when the mapping file lacks one of the columns the analysis reads.
missing_columns = {'filename', 'region'} - set(region_df.columns)
if missing_columns:
    raise ValueError(
        f"{mapping_path} is missing required column(s): {sorted(missing_columns)}"
    )

region_df.head()


Unnamed: 0,filename,region
0,US_National_AI_Strategy_clean.txt,north
1,Canada_AI_Strategy_clean.txt,north
2,UK_AI_Strategy_clean.txt,north
3,Germany_AI_Strategy_clean.txt,north
4,France_AI_Strategy_clean.txt,north


In [3]:

# Load every document listed in the region mapping. `texts` and `labels` stay
# aligned by position: entry i of each describes the same mapping row.
filenames = region_df['filename'].tolist()
labels = region_df['region'].tolist()
txt_paths = [PROCESSED_DIR / name for name in filenames]

# Check all paths up front so one run reports every missing file, rather than
# stopping at the first one.
missing = [str(path) for path in txt_paths if not path.is_file()]
if missing:
    raise FileNotFoundError(
        f"{len(missing)} file(s) listed in the region mapping were not found: {missing}"
    )

texts = [path.read_text(encoding='utf-8') for path in txt_paths]

# Fit a single vocabulary (capped at 5000 features) over all documents so the
# per-region scores computed later share the same columns.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()


In [4]:

# Quick per-region mean scores for inspection
TOP_K = 20  # number of highest-scoring terms kept per region


def region_mask(doc_labels, region):
    """Return a boolean array marking the documents labelled ``region``.

    Labels are compared after ``str()`` conversion, whitespace stripping and
    lower-casing. A non-string label (e.g. NaN from a blank CSV cell) therefore
    matches no region instead of raising, and a padded label such as
    " North " is still counted in its group.
    """
    return np.array(
        [str(lab).strip().lower() == region for lab in doc_labels], dtype=bool
    )


def mean_tfidf(matrix, mask):
    """Return the column-wise mean of ``matrix`` over the rows selected by ``mask``.

    When no row is selected the result is an empty float array, so the return
    value is always an ndarray.
    """
    if not mask.any():
        return np.zeros(0)
    return np.asarray(matrix[mask].mean(axis=0)).ravel()


def top_terms(terms, scores, k=TOP_K):
    """Return the ``k`` highest-scoring ``(term, score)`` pairs, best first.

    ``sorted`` is stable, so terms with equal scores keep their order in ``terms``.
    """
    return sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)[:k]


north_mask = region_mask(labels, 'north')
south_mask = region_mask(labels, 'south')
north_mean = mean_tfidf(tfidf, north_mask)
south_mean = mean_tfidf(tfidf, south_mask)

# With an empty mean array, zip() yields nothing and the top list is empty.
north_top = top_terms(feature_names, north_mean)
south_top = top_terms(feature_names, south_mean)
north_top, south_top


([('safety', 0.2511028407070178),
  ('ai', 0.24079388086297474),
  ('ethics', 0.23702518386605476),
  ('research', 0.2185701690801001),
  ('governance', 0.1528960567674389),
  ('innovation', 0.15159620118381043),
  ('regulation', 0.1454963778869034),
  ('transparency', 0.13044756079382497),
  ('trust', 0.09670013705875179),
  ('standards', 0.09324650634282476),
  ('infrastructure', 0.07559901568365605),
  ('data', 0.06883467018185216),
  ('publicsector', 0.05869614640419468),
  ('export', 0.05346343789326499),
  ('semiconductor', 0.05346343789326499),
  ('equality', 0.05245686212261631),
  ('europe', 0.05176780486914238),
  ('sovereignty', 0.05176780486914238),
  ('sustainability', 0.051594811125061396),
  ('industry4', 0.050452709488899454)],
 [('access', 0.3090875251929278),
  ('development', 0.3090875251929278),
  ('inclusion', 0.29275466505905673),
  ('infrastructure', 0.2046632847248172),
  ('data', 0.19894190485763458),
  ('innovation', 0.19686785399569737),
  ('governance', 0.16

In [5]:

import pandas as pd

def top_to_df(pairs, region_label):
    """Turn ``(term, score)`` pairs into a table tagged with a region.

    Args:
        pairs: iterable of two-item ``(term, tfidf)`` records.
        region_label: value written to every row of the ``region`` column.

    Returns:
        A new DataFrame with columns ``term``, ``tfidf`` and ``region``.
    """
    frame = pd.DataFrame(pairs, columns=['term', 'tfidf'])
    frame['region'] = region_label
    return frame

# Build one table per region. A region with no top terms gets an empty frame
# with the same three columns, so the CSV files always share a header.
region_frames = {}
for region_label, pairs in (('north', north_top), ('south', south_top)):
    if pairs:
        region_frames[region_label] = top_to_df(pairs, region_label)
    else:
        region_frames[region_label] = pd.DataFrame(columns=['term', 'tfidf', 'region'])

north_df = region_frames['north']
south_df = region_frames['south']
combined = pd.concat([north_df, south_df], ignore_index=True)

# Write the per-region tables and the stacked table, without the index column.
for frame, csv_name in (
    (north_df, 'north_top_terms.csv'),
    (south_df, 'south_top_terms.csv'),
    (combined, 'top_terms_combined.csv'),
):
    frame.to_csv(TABLES_DIR / csv_name, index=False)

combined.head()


Unnamed: 0,term,tfidf,region
0,safety,0.251103,north
1,ai,0.240794,north
2,ethics,0.237025,north
3,research,0.21857,north
4,governance,0.152896,north
