In [1]:
import sys
from pathlib import Path

# Make the project's src directory importable.
# The path is relative to the notebook's working directory, so the kernel must
# be started from the folder that contains this notebook.
src_dir = Path("../src")
src_path = str(src_dir.absolute())
# Guard the append so re-running this cell does not pile up duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)

from merge_llm_results import LLMResultsMerger, LLMResultsAnalyzer


#### Initialize the merger and analyzer with data paths

In [3]:
# Locations of the raw articles, the LLM outputs, and the merged ("enhanced") results.
# NOTE: these are machine-specific absolute paths.
raw_data_dir = "/ephemeral/home/xiong/data/Fund/Factiva_News/2025/"
llm_results_dir = "/ephemeral/home/xiong/data/Fund/Factiva_News/results/"
output_dir = "/ephemeral/home/xiong/data/Fund/Factiva_News/enhanced/"

# Build the merger from one keyword mapping, then point the analyzer at the
# same output directory the merger writes to.
merger_paths = {
    "raw_data_dir": raw_data_dir,
    "llm_results_dir": llm_results_dir,
    "output_dir": output_dir,
}
merger = LLMResultsMerger(**merger_paths)
analyzer = LLMResultsAnalyzer(output_dir=output_dir)

# Echo the directories as stored on each object.
print(
    f"Custom raw data directory: {merger.raw_data_dir}",
    f"Custom LLM results directory: {merger.llm_results_dir}",
    f"Custom output directory: {merger.output_dir}",
    f"Custom analyzer output directory: {analyzer.output_dir}",
    sep="\n",
)

Custom raw data directory: /ephemeral/home/xiong/data/Fund/Factiva_News/2025
Custom LLM results directory: /ephemeral/home/xiong/data/Fund/Factiva_News/results
Custom output directory: /ephemeral/home/xiong/data/Fund/Factiva_News/enhanced
Custom analyzer output directory: /ephemeral/home/xiong/data/Fund/Factiva_News/enhanced


#### Find mergeable pairs

In [4]:
# Discover raw-data / LLM-result file pairs that can be merged.
# This cell only discovers the pairs; the merge itself is run in a later cell.
# The progress message used to be fused into the comment line above and was
# therefore never printed.
print("Discovering file pairs...")
file_pairs = merger.discover_file_pairs()
print(f"Found {len(file_pairs)} file pairs")

2025-07-18 19:19:35,592 - INFO - ✅ Found pair: 2025_articles.json ↔ 2025_articles_countries_llm.json
2025-07-18 19:19:35,593 - INFO - ✅ Found pair: 2025_articles_1.json ↔ 2025_articles_1_countries_llm.json
2025-07-18 19:19:35,594 - INFO - ✅ Found pair: 2025_articles_2.json ↔ 2025_articles_2_countries_llm.json


Found 3 file pairs


#### Merge Data

In [6]:
# Merge every file pair and keep the statistics object that is returned.
# `stats` is indexed by key in the summary cell below (e.g. stats['total_articles']).
# The recorded log shows the file pairs being discovered again during this call.
print("\nProcessing file pairs...")
stats = merger.process_all_file_pairs()

2025-07-18 19:21:07,794 - INFO - ✅ Found pair: 2025_articles.json ↔ 2025_articles_countries_llm.json
2025-07-18 19:21:07,795 - INFO - ✅ Found pair: 2025_articles_1.json ↔ 2025_articles_1_countries_llm.json
2025-07-18 19:21:07,796 - INFO - ✅ Found pair: 2025_articles_2.json ↔ 2025_articles_2_countries_llm.json
2025-07-18 19:21:07,797 - INFO - Processing 3 file pairs
2025-07-18 19:21:07,797 - INFO - [1/3] Processing: 2025_articles.json
2025-07-18 19:21:07,798 - INFO - Loading raw data from 2025_articles.json



Processing file pairs...


2025-07-18 19:21:11,668 - INFO - Loading LLM results from 2025_articles_countries_llm.json
Merging articles: 100%|██████████| 88861/88861 [00:00<00:00, 236387.65it/s]
2025-07-18 19:21:12,152 - INFO - Merge complete: 88861/88861 (100.0%) successful merges
2025-07-18 19:21:18,202 - INFO - ✅ Saved enhanced articles to /ephemeral/home/xiong/data/Fund/Factiva_News/enhanced/enhanced_2025_articles.json
2025-07-18 19:21:18,204 - INFO - [2/3] Processing: 2025_articles_1.json
2025-07-18 19:21:18,204 - INFO - Loading raw data from 2025_articles_1.json
2025-07-18 19:21:22,124 - INFO - Loading LLM results from 2025_articles_1_countries_llm.json
Merging articles: 100%|██████████| 89240/89240 [00:00<00:00, 168676.78it/s]
2025-07-18 19:21:22,755 - INFO - Merge complete: 89240/89240 (100.0%) successful merges
2025-07-18 19:21:29,247 - INFO - ✅ Saved enhanced articles to /ephemeral/home/xiong/data/Fund/Factiva_News/enhanced/enhanced_2025_articles_1.json
2025-07-18 19:21:29,248 - INFO - [3/3] Processing:

In [10]:
# Summarise the merge using the statistics returned by
# merger.process_all_file_pairs(); counts are thousands-separated and rates
# are shown with one decimal place.
summary_lines = [
    "\nMerge Summary:",
    f"Total Articles: {stats['total_articles']:,}",
    f"Articles with Countries: {stats['articles_with_countries']:,}",
    f"Coverage Rate: {stats['coverage_rate']:.1f}%",
    f"Articles with Multiple Countries: {stats['articles_with_multiple_countries']:,}",
    f"Multi-Country Rate: {stats['multi_country_rate']:.1f}%",
]
print("\n".join(summary_lines))


Merge Summary:
Total Articles: 267,569
Articles with Countries: 257,980
Coverage Rate: 96.4%
Articles with Multiple Countries: 113,544
Multi-Country Rate: 42.4%


### Basic Analysis of merged data 

In [None]:
# Load the enhanced articles through the analyzer before running the demos.
# `all_articles` is reused by the random-sample and find-by-ID cells below.
# NOTE(review): presumably this reads every enhanced file under the analyzer's
# output_dir — confirm against LLMResultsAnalyzer.
print("\nDemonstrating sampling functionality:")
all_articles = analyzer.load_enhanced_articles()


Demonstrating sampling functionality:


In [26]:
# 1. RANDOM SAMPLE OF ARTICLES
print("\n🔍 1. Random sample of articles")
print("="*60)
# A fixed seed is passed so the same 10 articles are requested on every run.
random_sample = analyzer.sample_articles(articles=all_articles, sample_size=10, method='random', seed=42)

# Print key fields for the random sample
print("\nSample article details:")
for article in random_sample:
    # dict.get's default only applies when the key is missing; a key that is
    # present with a null value still yields None, which cannot be sliced.
    snippet = article.get('snippet', 'N/A')
    if snippet is None:
        snippet = 'N/A'

    print("\n---")
    print(f"Main Country: {article.get('llm_main_country', 'N/A')}")
    print(f"Other Countries: {article.get('llm_other_countries', [])}")
    print(f"Title: {article.get('title', 'N/A')}")
    print(f"Snippet: {snippet[:200]}...")


🔍 1. Random sample of articles

Sample article details:

---
Main Country: Philippines
Other Countries: []
Title: Philippine midterms: voter anger over cuts, Duterte's arrest threaten Marcos' Senate slate
Snippet: Addressing Filipinos’ concerns about bread-and-butter issues will be key to winning their votes in the midterms, analysts say

Senate candidates backed by Philippine President Ferdinand Marcos Jnr are...

---
Main Country: China
Other Countries: ['United States']
Title: China unfazed by Trump's 50% tariff threat, vows to fight to the end
Snippet: US President Donald Trump on Monday threatened China with a whopping 50 per cent additional tariff, on top of the 34 per cent levy on Chinese imports announced a few days back. In response, China said...

---
Main Country: Yemen
Other Countries: ['United States']
Title: Trump designates Yemen's Houthis as a 'foreign terrorist organization'
Snippet: WASHINGTON (Reuters) - U.S. President Donald Trump on Wednesday re-designated Yemen's

In [18]:
# 2. SEARCH WITH PARTIAL MATCHING
print("\n🔍 2. Partial matching for 'China' (finds China, Chinese, etc.)")
print("="*60)

# Case-insensitive search in 'partial' match mode for the term "china".
china_articles = analyzer.search_articles_by_country("china", case_sensitive=False, match_mode='partial')
print(f"Found {len(china_articles):,} articles with partial match for 'china'")

# Show the first three hits together with the country labels that matched.
print("\nExamples showing different country variations:")
for i, article in enumerate(china_articles[:3], start=1):
    main_country = article.get('llm_main_country', '')
    other_countries = article.get('llm_other_countries', [])
    # The main country goes first, but only when it is non-empty.
    if main_country:
        all_countries = [main_country] + other_countries
    else:
        all_countries = other_countries

    print(f"\n📰 Article {i}:")
    china_matches = [c for c in all_countries if 'china' in c.lower()]
    print(f"   Countries found: {china_matches}")
    print(f"   All countries: {all_countries}")
    headline = article.get('title', 'No title')[:80]
    print(f"   Title: {headline}...")



🔍 2. Partial matching for 'China' (finds China, Chinese, etc.)
Found 49,063 articles with partial match for 'china'

Examples showing different country variations:

📰 Article 1:
   Countries found: ['China']
   All countries: ['China', 'United States']
   Title: Fábricas chinas pausan producción y buscan mercados en Latinoamérica y Europa...

📰 Article 2:
   Countries found: ['China']
   All countries: ['China', 'United States']
   Title: China's trade countermeasures are justified self-defense against US bullying, ex...

📰 Article 3:
   Countries found: ['China']
   All countries: ['China', 'Estados Unidos']
   Title: China condena "acoso económico" de EEUU y dice que la globalización "es el único...


In [19]:
# 3. SEARCH FOR MULTIPLE COUNTRIES AT ONCE
def _countries_of(article):
    """Return the article's country labels: main country first, then the others.

    The main country is included only when it is non-empty. Fields that are
    missing or stored as null are treated as empty, so the result is always a
    new list.
    """
    main_country = article.get('llm_main_country') or ''
    other_countries = article.get('llm_other_countries') or []
    return ([main_country] if main_country else []) + list(other_countries)


print("\n🌐 3. Find articles mentioning multiple countries")
print("="*50)

multiple_countries = ["Japan", "Germany", "France"]
multi_articles = analyzer.search_articles_by_country("", multiple_countries=multiple_countries)
print(f"Found {len(multi_articles):,} articles mentioning Japan, Germany, or France")

# Show distribution. An article is counted once per target country it lists,
# so the per-country counts can add up to more than the number of articles.
country_counts = {}
for article in multi_articles:
    all_countries = _countries_of(article)

    for target in multiple_countries:
        if target in all_countries:
            country_counts[target] = country_counts.get(target, 0) + 1

print("\nCountry distribution:")
for country, count in country_counts.items():
    print(f"   {country}: {count:,} articles")

# Show one example for each
print("\nExample articles:")
shown_countries = set()
for article in multi_articles[:10]:  # Check first 10 to find examples
    all_countries = _countries_of(article)

    for target in multiple_countries:
        if target in all_countries and target not in shown_countries:
            shown_countries.add(target)
            print(f"\n📰 {target} example:")
            print(f"   Title: {(article.get('title') or 'No title')[:80]}...")
            print(f"   All countries: {all_countries}")
            # Each article serves as the example for at most one country.
            break

    # Stop early once every target country has an example.
    if len(shown_countries) == len(multiple_countries):
        break



🌐 3. Find articles mentioning multiple countries
Found 30,947 articles mentioning Japan, Germany, or France

Country distribution:
   France: 17,644 articles
   Japan: 8,315 articles
   Germany: 9,427 articles

Example articles:

📰 France example:
   Title: Bruselas asegura que «no hay indicios de un boicot o un ciberataque»...
   All countries: ['Spain', 'Portugal', 'France']

📰 Japan example:
   Title: Las bolsas asiáticas reaccionan de forma dispar a los aranceles a China del 104%...
   All countries: ['China', 'United States', 'Japan']

📰 Germany example:
   Title: Lagarde alerta del riesgo de que Europa se quede atrás en la revolución de la IA...
   All countries: ['Germany', 'United States']


In [24]:
# 4. ADVANCED MULTI-CRITERIA SEARCH
print("\n🎯 4. Advanced search with multiple criteria")
print("="*50)

# Search for articles about specific countries from specific sources.
# The date-range filter is disabled for now (see the commented-out argument).
# NOTE(review): the recorded run returned 0 matches. The dataset's
# `source_name` values may not equal "Reuters"/"Bloomberg" exactly (another
# cell shows "Sur Online") — confirm how source_names are matched.
criteria_articles = analyzer.search_articles_by_multiple_criteria(
    countries=["United Kingdom", "UK", "Britain"],
    source_names=["Reuters", "Bloomberg"],  # Only from these sources
    #date_range=("2025-01-01", "2025-06-28"),    # dates don't work for now
    case_sensitive=False
)

print(f"Found {len(criteria_articles):,} articles about UK from Reuters or Bloomberg")

if criteria_articles:
    print("\nFirst 3 examples:")
    for i, article in enumerate(criteria_articles[:3], 1):
        # Build the country list the same way as the other search cells: the
        # main country is included only when non-empty, so no None is shown.
        main_country = article.get('llm_main_country', '')
        other_countries = article.get('llm_other_countries', [])
        all_countries = [main_country] + other_countries if main_country else other_countries

        print(f"\n📰 Article {i}:")
        print(f"   Source: {article.get('source_name', 'N/A')}")
        print(f"   Countries: {all_countries}")
        print(f"   Title: {article.get('title', 'No title')[:80]}...")
        if article.get('publication_date'):
            print(f"   Date: {article.get('publication_date')}")
else:
    print("No articles found matching the criteria")



🎯 4. Advanced search with multiple criteria
Found 0 articles about UK from Reuters or Bloomberg
No articles found matching the criteria


In [25]:
# 5. FIND SPECIFIC ARTICLE BY ID
print("\n🔍 5. Find specific article by ID")
print("="*40)

# Take the ID (the 'an' field) of the first loaded article as the lookup key.
# Nothing further is printed when no articles are loaded.
if all_articles:
    sample_article = all_articles[0]
    sample_id = sample_article.get('an')

    print(f"Looking for article ID: {sample_id}")

    found_article = analyzer.find_article_by_id(sample_id)
    if not found_article:
        print("❌ Article not found")
    else:
        # Emit the confirmation and the key fields in one call, one per line.
        print(
            "\n✅ Article found!",
            f"   Title: {found_article.get('title', 'No title')}",
            f"   Main Country: {found_article.get('llm_main_country', 'N/A')}",
            f"   Other Countries: {found_article.get('llm_other_countries', [])}",
            f"   Source: {found_article.get('source_name', 'N/A')}",
            sep="\n",
        )



🔍 5. Find specific article by ID
Looking for article ID: SURON00020250428el4s000fj

✅ Article found!
   Title: Bruselas asegura que «no hay indicios de un boicot o un ciberataque»
   Main Country: Spain
   Other Countries: ['Portugal', 'France']
   Source: Sur Online
