In [None]:
from pathlib import Path\n
import pandas as pd\n
from sklearn.feature_extraction.text import TfidfVectorizer\n
\n
# Data folders, expressed relative to the directory the notebook runs from.
DATA_DIR = Path('..', 'data')
PROCESSED_DIR = DATA_DIR.joinpath('processed')
EXTERNAL_DIR = DATA_DIR.joinpath('external')  # auxiliary inputs, e.g. the region mapping CSV

In [None]:
# Expect a CSV mapping filename -> region (north/south)
mapping_path = EXTERNAL_DIR / 'region_mapping.csv'
region_df = pd.read_csv(mapping_path)

# Later cells read row['filename'] and row['region']; fail here with a clear
# message instead of a KeyError in the middle of the document-loading loop.
missing_cols = {'filename', 'region'} - set(region_df.columns)
assert not missing_cols, f"{mapping_path} is missing expected columns: {sorted(missing_cols)}"

region_df.head()

In [None]:
# Load every document listed in the mapping; texts[i] and labels[i] refer to
# the same row of region_df.
texts = [
    (PROCESSED_DIR / fname).read_text(encoding='utf-8')
    for fname in region_df['filename']
]
labels = list(region_df['region'])

# Fit a TF-IDF model on the full corpus, with the vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Quick per-region mean TF-IDF scores for inspection.
import numpy as np


def _region_mask(all_labels, region):
    """Return a list of booleans flagging the labels that match `region`.

    Labels are compared case-insensitively after stripping surrounding
    whitespace. Non-string labels (e.g. NaN from a blank CSV cell) are
    stringified first, so they simply fail to match instead of raising
    AttributeError on `.lower()`.
    """
    return [str(lab).strip().lower() == region for lab in all_labels]


def _mean_scores(matrix, mask):
    """Column-wise mean over the rows of `matrix` selected by `mask`.

    Returns a flat numpy array, or an empty list when no row is selected
    (averaging zero rows is undefined).
    """
    if not any(mask):
        return []
    selected = matrix[np.asarray(mask, dtype=bool)]
    return np.asarray(selected.mean(axis=0)).ravel()


def _top_terms(names, scores, k=20):
    """Return up to `k` (term, score) pairs, highest score first."""
    return sorted(zip(names, scores), key=lambda pair: pair[1], reverse=True)[:k]


north_mask = _region_mask(labels, 'north')
south_mask = _region_mask(labels, 'south')
north_mean = _mean_scores(tfidf, north_mask)
south_mean = _mean_scores(tfidf, south_mask)

# An empty mean yields an empty top list, because zip stops at the shorter input.
north_top = _top_terms(feature_names, north_mean)
south_top = _top_terms(feature_names, south_mean)
north_top, south_top

In [None]:
import pandas as pd

def top_to_df(pairs, region_label):
    """Build a table of (term, tfidf) pairs tagged with their region.

    Args:
        pairs: iterable of two-item records, term first and score second.
        region_label: value written to every row of the 'region' column.

    Returns:
        A new DataFrame with columns 'term', 'tfidf' and 'region'.
    """
    frame = pd.DataFrame(pairs, columns=['term', 'tfidf'])
    frame['region'] = region_label
    return frame

# Tabulate each region's top terms; a region without any terms gets an empty
# frame that still carries the expected columns.
north_df = top_to_df(north_top, 'north') if north_top else pd.DataFrame(columns=['term', 'tfidf', 'region'])
south_df = top_to_df(south_top, 'south') if south_top else pd.DataFrame(columns=['term', 'tfidf', 'region'])
combined = pd.concat([north_df, south_df], ignore_index=True)

# TABLES_DIR is not assigned in any cell visible above, so on a fresh kernel the
# writes below would raise NameError. Keep an existing value if there is one,
# otherwise fall back to a default.
# NOTE(review): the default location is a guess -- confirm the intended output
# folder, and ideally define this constant next to DATA_DIR in the config cell.
if 'TABLES_DIR' not in globals():
    TABLES_DIR = DATA_DIR.parent / 'reports' / 'tables'
# to_csv does not create missing parent folders, so make sure the target exists.
TABLES_DIR.mkdir(parents=True, exist_ok=True)

north_df.to_csv(TABLES_DIR / 'north_top_terms.csv', index=False)
south_df.to_csv(TABLES_DIR / 'south_top_terms.csv', index=False)
combined.to_csv(TABLES_DIR / 'top_terms_combined.csv', index=False)

combined.head()