In [9]:
import pandas as pd
import pickle

In [40]:
# Read the two pickled tables that map location ids to English / Arabic names
# and report how many entries each one holds.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# these files must come from a trusted source.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

english_locations = _read_pickle('data/id_english_location_name.pkl')
arabic_locations = _read_pickle('data/id_arabic_location_name.pkl')

print(f"English locations: {len(english_locations)} entries")
print(f"Arabic locations: {len(arabic_locations)} entries")



English locations: 357 entries
Arabic locations: 357 entries


dict_keys(['iq', 'jo', 'lb', 'ps', 'sy', 'iq_an', 'iq_ar', 'iq_bb', 'iq_bg', 'iq_ba', 'iq_da', 'iq_dq', 'iq_qa', 'iq_di', 'iq_ka', 'iq_ts', 'iq_ma', 'iq_mu', 'iq_na', 'iq_ni', 'iq_sd', 'iq_sl', 'iq_wa', 'jo_aj', 'jo_am', 'jo_aq', 'jo_ba', 'jo_ir', 'jo_ja', 'jo_ka', 'jo_mn', 'jo_md', 'jo_ma', 'jo_at', 'jo_az', 'lb_ak', 'lb_bh', 'lb_ba', 'lb_bq', 'lb_00_0', 'lb_jl', 'lb_na', 'lb_nl', 'lb_ja', 'ps_gz', 'ps_wb', 'sy_hl', 'sy_di', 'sy_dr', 'sy_dy', 'sy_hm', 'sy_ha', 'sy_hi', 'sy_id', 'sy_la', 'sy_qu', 'sy_ra', 'sy_rd', 'sy_su', 'sy_ta', 'iq_ba_1', 'iq_bg_1', 'iq_sd_1', 'iq_bg_2', 'iq_qa_1', 'iq_ka_1', 'iq_ni_1', 'iq_ma_4', 'iq_da_1', 'iq_ma_5', 'iq_an_3', 'iq_ni_4', 'iq_00_0', 'iq_wa_1', 'iq_ni_2', 'iq_di_3', 'iq_wa_5', 'iq_00_1', 'iq_sd_5', 'iq_ni_3', 'iq_sd_6', 'iq_di_4', 'iq_ba_5', 'iq_00_2', 'iq_sl_1', 'iq_dq_1', 'iq_ar_1', 'iq_ts_2', 'iq_da_2', 'iq_ts_3', 'iq_sl_2', 'iq_qa_3', 'iq_sl_3', 'iq_sd_2', 'iq_ar_2', 'iq_an_4', 'iq_ba_6', 'iq_an_5', 'iq_wa_2', 'iq_sl_4', 'iq_qa_4', 'iq_bb_3', 

In [49]:
# Create separate English and Arabic DataFrames with hierarchical structure for inspection
# perhaps it is still easier to match with dicts later for analysis but I am better with understanding dataframes
def parse_location_hierarchy(location_id):
    """Break an underscore-delimited location id into its hierarchy fields.

    The number of underscore-separated segments decides the level:
    one segment is a country, two a province, three a district. Ids with
    more than three segments are reported as ``'unknown'`` with every code
    set to ``None``.

    Args:
        location_id: Location id string such as ``'iq'``, ``'iq_ba'`` or
            ``'iq_ba_1'``.

    Returns:
        dict with the keys ``level``, ``country_code``, ``province_code`` and
        ``district_number``; codes that do not exist at the id's level are
        ``None``.
    """
    level_names = ('country', 'province', 'district')
    segments = location_id.split('_')
    depth = len(segments)  # str.split with a separator always yields >= 1 item

    # Deeper than district: nothing is trusted, not even the first segment.
    if depth > len(level_names):
        return {
            'level': 'unknown',
            'country_code': None,
            'province_code': None,
            'district_number': None,
        }

    # Pad the missing trailing levels with None so unpacking always works.
    country, province, district = segments + [None] * (len(level_names) - depth)
    return {
        'level': level_names[depth - 1],
        'country_code': country,
        'province_code': province,
        'district_number': district,
    }

# Build the English DataFrame: one row per location id, carrying the parsed
# hierarchy fields plus the list of English names for that location.
def _english_row(loc_id, raw_names):
    """Turn one (location id, names) entry into a flat record dict."""
    parsed = parse_location_hierarchy(loc_id)

    # Normalise the names to a plain list: a bare string becomes a one-element
    # list and anything exposing .tolist() (e.g. a numpy array) is converted.
    if isinstance(raw_names, str):
        name_list = [raw_names]
    elif hasattr(raw_names, 'tolist'):
        name_list = raw_names.tolist()
    else:
        name_list = raw_names

    return {
        'location_id': loc_id,
        'level': parsed['level'],
        'country_code': parsed['country_code'],
        'province_code': parsed['province_code'],
        'district_number': parsed['district_number'],
        'names': name_list,
        # The first listed name is treated as the primary one.
        'primary_name': name_list[0] if name_list else None,
        'name_count': len(name_list),
    }

english_data = [
    _english_row(loc_id, raw_names)
    for loc_id, raw_names in english_locations.items()
]
english_df = pd.DataFrame(english_data)

# Show the hierarchy
print(f"\nLevel distribution:")
print(english_df['level'].value_counts())

print(f"\nCountry codes: {english_df['country_code'].unique()}")
english_df.head(10)


Level distribution:
level
district    298
province     54
country       5
Name: count, dtype: int64

Country codes: ['iq' 'jo' 'lb' 'ps' 'sy']


Unnamed: 0,location_id,level,country_code,province_code,district_number,names,primary_name,name_count
0,iq,country,iq,,,[iraq],iraq,1
1,jo,country,jo,,,[jordan],jordan,1
2,lb,country,lb,,,[lebanon],lebanon,1
3,ps,country,ps,,,"[occupied palestinian territory, palestine, pa...",occupied palestinian territory,4
4,sy,country,sy,,,"[syria, syrian arab republic]",syria,2
5,iq_an,province,iq,an,,[anbar],anbar,1
6,iq_ar,province,iq,ar,,"[arbil, erbil]",arbil,2
7,iq_bb,province,iq,bb,,"[babil, babylon]",babil,2
8,iq_bg,province,iq,bg,,[baghdad],baghdad,1
9,iq_ba,province,iq,ba,,"[basra, basrah]",basra,2


In [50]:
# Create Arabic DataFrame with same structure as the English one:
# one row per location id with the parsed hierarchy fields and its Arabic names.
arabic_data = []
for location_id, names in arabic_locations.items():
    hierarchy = parse_location_hierarchy(location_id)

    # Normalise the names to a plain list: a bare string becomes a one-element
    # list and anything exposing .tolist() (e.g. a numpy array) is converted.
    if isinstance(names, str):
        names = [names]
    elif hasattr(names, 'tolist'):
        names = names.tolist()

    arabic_data.append({
        'location_id': location_id,
        'level': hierarchy['level'],
        'country_code': hierarchy['country_code'],
        'province_code': hierarchy['province_code'],
        'district_number': hierarchy['district_number'],
        'names': names,
        # The first listed name is treated as the primary one.
        'primary_name': names[0] if names else None,
        'name_count': len(names)
    })

arabic_df = pd.DataFrame(arabic_data)

print(f"Arabic DataFrame: {arabic_df.shape}")
# Report the Arabic frame's own columns, so this check describes arabic_df and
# the cell does not depend on english_df having been built first.
print(f"Columns: {list(arabic_df.columns)}")

Arabic DataFrame: (357, 8)
Columns: ['location_id', 'level', 'country_code', 'province_code', 'district_number', 'names', 'primary_name', 'name_count']


# Conclusion 
As the location IDs are hierarchical, I'd assume that a match at district level lets us automatically assign the province and country as well. So perhaps we should "tag" the news articles with country code, province code and district number, based on occurrences of the names and primary names in the article.

Multiple matches/tags per article are of course possible, so we need to be careful about how we handle them. We could work with binary tags, but that would append 300+ columns, which seems very inefficient and would lead to sparse data.

In [None]:
# ======================================================================
# LOCATION MATCHING AND PRIMARY LOCATION SCRIPTS FOR NEWS ANALYSIS
# (To be used later when processing news articles)
# ======================================================================

def create_location_lookup(df):
    """Create a lookup dictionary for matching location names in text.

    Args:
        df: DataFrame with the columns ``location_id``, ``level``,
            ``country_code``, ``province_code``, ``district_number`` and
            ``names`` (the name variants of the location; a bare string is
            treated as a single name).

    Returns:
        dict mapping each lower-cased, stripped name to a list of location
        info dicts (``location_id``, ``level``, ``country_code``,
        ``province_code``, ``district_number``). One name can map to several
        locations; all names of one row share the same info dict object.
    """
    lookup = {}

    for _, row in df.iterrows():
        location_info = {
            'location_id': row['location_id'],
            'level': row['level'],
            'country_code': row['country_code'],
            'province_code': row['province_code'],
            'district_number': row['district_number']
        }

        raw_names = row['names']
        if isinstance(raw_names, str):
            # Iterating a bare string would register every single character
            # as a "name", so wrap it instead.
            raw_names = [raw_names]
        else:
            try:
                raw_names = list(raw_names)
            except TypeError:
                # Missing value (None / NaN): this row contributes no names.
                raw_names = []

        # dict.fromkeys de-duplicates while keeping first-seen order, so two
        # spellings that clean to the same key register the location once.
        cleaned_names = dict.fromkeys(
            name.lower().strip() for name in raw_names if isinstance(name, str)
        )
        for name_clean in cleaned_names:
            # An empty key is a substring of every text and would make this
            # location match every article.
            if name_clean:
                lookup.setdefault(name_clean, []).append(location_info)

    return lookup

def determine_primary_location(title, body, location_matches):
    """
    Determine primary location from multiple location mentions using scoring algorithm

    Each match is scored as the sum of:
      * specificity: district=3, province=2, country=1 (other levels 0)
      * +5 if the matched name occurs in the title
      * +0.5 per occurrence of the matched name in title + body
      * +3 if a context phrase occurs ("in <name>", "<name> faces",
        "<name> experiences")
      * +2 if an emphasis phrase occurs ("<name> most", "<name> severely",
        "<name> heavily")
    All text checks are case-insensitive substring checks. A match without a
    usable ``matched_name`` only receives the specificity score.

    Args:
        title: Article title text (non-string values are treated as empty)
        body: Article body text (non-string values are treated as empty)
        location_matches: List of matched location dictionaries; each needs a
            ``level`` key and may carry ``matched_name``

    Returns:
        dict: Primary location info + secondary locations list, with keys
        ``primary``, ``secondary`` and ``method`` ('no_matches',
        'single_match' or 'scored'); 'scored' results also carry
        ``primary_score``.

    Side effects:
        When two or more matches are scored, each match dict gets a
        ``primary_score`` key. The order of ``location_matches`` itself is
        left untouched.
    """

    if not location_matches:
        return {'primary': None, 'secondary': [], 'method': 'no_matches'}

    if len(location_matches) == 1:
        return {'primary': location_matches[0], 'secondary': [], 'method': 'single_match'}

    # Missing text (None, NaN, ...) counts as empty instead of raising.
    title_text = title if isinstance(title, str) else ''
    body_text = body if isinstance(body, str) else ''

    # Loop-invariant pieces, computed once for all matches.
    title_lower = title_text.lower()
    full_text = (title_text + ' ' + body_text).lower()
    level_scores = {'district': 3, 'province': 2, 'country': 1}

    # Score each location
    for location in location_matches:
        # Specificity bonus (district=3, province=2, country=1)
        score = float(level_scores.get(location['level'], 0))

        matched_name = location.get('matched_name')
        location_name = matched_name.lower() if isinstance(matched_name, str) else ''

        # An empty name is a substring of every text (and str.count('') is
        # len(text) + 1), so the text-based bonuses only apply to real names.
        if location_name:
            # Title mention bonus
            if location_name in title_lower:
                score += 5

            # Frequency bonus (count mentions in title + body)
            score += full_text.count(location_name) * 0.5

            # Context bonus (simple check for action words)
            action_patterns = (
                f'in {location_name}',
                f'{location_name} faces',
                f'{location_name} experiences',
            )
            if any(pattern in full_text for pattern in action_patterns):
                score += 3

            # Emphasis bonus (affected, hit, impacted)
            emphasis_patterns = (
                f'{location_name} most',
                f'{location_name} severely',
                f'{location_name} heavily',
            )
            if any(pattern in full_text for pattern in emphasis_patterns):
                score += 2

        location['primary_score'] = score

    # Rank by score (stable: ties keep their input order) without reordering
    # the caller's list.
    ranked = sorted(location_matches, key=lambda match: match['primary_score'], reverse=True)
    primary = ranked[0]

    return {
        'primary': primary,
        'secondary': ranked[1:],
        'method': 'scored',
        'primary_score': primary['primary_score']
    }

def tag_article_with_geography(title, body, english_lookup, arabic_lookup):
    """
    Complete function to tag an article with geographic information

    Every lookup name that occurs as a case-insensitive substring of
    title + body yields a match; matches are de-duplicated by location_id
    (the first one found wins, English lookup before Arabic) and handed to
    determine_primary_location() to pick the primary location.

    Args:
        title: Article title (non-string values such as None/NaN count as empty)
        body: Article body text (non-string values such as None/NaN count as empty)
        english_lookup: English location name lookup dictionary
            (name -> list of location info dicts)
        arabic_lookup: Arabic location name lookup dictionary
            (name -> list of location info dicts)

    Returns:
        dict: Geographic tags for the article, with keys primary_country,
        primary_province, primary_district, primary_level,
        primary_location_id, secondary_locations, total_locations_found,
        geographic_scope, cross_border, tagging_confidence and tagging_method.
    """

    # Articles with a missing title or body must not crash the tagger.
    title = title if isinstance(title, str) else ''
    body = body if isinstance(body, str) else ''
    text_combined = (title + ' ' + body).lower()

    # Find all location matches, keeping only the first hit per location_id.
    unique_matches = []
    seen_ids = set()
    for language, lookup in (('english', english_lookup), ('arabic', arabic_lookup)):
        for name, location_infos in (lookup or {}).items():
            # An empty key is contained in every text and a non-string key
            # cannot be searched for, so neither is a usable name.
            if not isinstance(name, str) or not name:
                continue
            if name not in text_combined:
                continue
            for location_info in location_infos:
                if location_info['location_id'] in seen_ids:
                    continue
                seen_ids.add(location_info['location_id'])
                # Copy so the shared lookup entries are never modified.
                match = dict(location_info)
                match['matched_name'] = name
                match['language'] = language
                unique_matches.append(match)

    # Determine primary location
    result = determine_primary_location(title, body, unique_matches)

    # Format final result
    if result['primary']:
        primary = result['primary']
        geographic_tags = {
            'primary_country': primary['country_code'],
            'primary_province': primary['province_code'],
            'primary_district': primary['district_number'],
            'primary_level': primary['level'],
            'primary_location_id': primary['location_id'],
            'secondary_locations': [loc['location_id'] for loc in result['secondary']],
            'total_locations_found': len(unique_matches),
            'geographic_scope': 'multi_location' if len(unique_matches) > 1 else 'single_location',
            # More than one distinct country among the matches.
            'cross_border': len({match['country_code'] for match in unique_matches}) > 1,
            # Only present in the result when the matches were scored; 0 otherwise.
            'tagging_confidence': result.get('primary_score', 0),
            'tagging_method': result['method']
        }
    else:
        geographic_tags = {
            'primary_country': None,
            'primary_province': None,
            'primary_district': None,
            'primary_level': None,
            'primary_location_id': None,
            'secondary_locations': [],
            'total_locations_found': 0,
            'geographic_scope': 'no_location',
            'cross_border': False,
            'tagging_confidence': 0,
            'tagging_method': 'no_matches'
        }

    return geographic_tags

# Summarise what this cell defined; one print call with a newline separator
# emits the same five lines (including the blank line before the last one).
print(
    "Geographic tagging functions defined:",
    "- create_location_lookup(): Create name->location lookup dictionary",
    "- determine_primary_location(): Score and select primary location from matches",
    "- tag_article_with_geography(): Complete article tagging pipeline",
    "\nThese will be used when processing news articles.",
    sep="\n",
)
