# Company Name Matching - Development Notebook

Use this notebook to test and develop matching algorithms before deploying to the Streamlit app.

## Setup

In [None]:
import pandas as pd
import requests
from fuzzywuzzy import fuzz
from name_cleaner import NameCleaner
import pdfplumber
import time

## Test OpenCorporates API

In [None]:
def search_opencorporates(company_name, per_page=5, timeout=10):
    """Search the OpenCorporates company index by name.

    Sends a GET request to the v0.4 ``companies/search`` endpoint.

    Args:
        company_name: Name to search for; sent as the ``q`` query parameter.
        per_page: Number of results requested per page (default 5).
        timeout: Seconds passed to ``requests.get`` as its timeout (default 10).

    Returns:
        The decoded JSON response body on HTTP 200, or ``None`` when the
        request fails, the status is not 200, or the body is not valid JSON.
    """
    url = "https://api.opencorporates.com/v0.4/companies/search"
    params = {
        'q': company_name,
        'per_page': per_page
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Timeouts and connection errors are reported as "no result" so a
        # single failed lookup does not abort a whole batch of searches.
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError:
        # HTTP 200 but the body could not be decoded as JSON.
        return None

# Smoke-test the search with a well-known company and print each hit
test_name = "Microsoft Corporation"
results = search_opencorporates(test_name)

if results:
    companies = results.get('results', {}).get('companies', [])
    print(f"Found {len(companies)} matches for '{test_name}':\n")

    # (label, response key) pairs, printed in this order for every hit
    report_fields = (
        ("Name", 'name'),
        ("Jurisdiction", 'jurisdiction_code'),
        ("Number", 'company_number'),
        ("Status", 'current_status'),
    )

    for item in companies:
        company = item['company']
        for label, field in report_fields:
            print(f"{label}: {company.get(field)}")
        print("-" * 60)

## Test Name Cleaning

In [None]:
# Run a handful of international company names through both cleaning steps
cleaner = NameCleaner()

test_names = [
    "Amazon.com, Inc.",
    "Volkswagen AG (Deutschland)",
    "Société Générale S.A.",
    "HSBC Holdings plc",
    "Samsung Electronics Co., Ltd."
]

# One row per input: the raw name next to its normalized and core forms
results = [
    {
        'original': name,
        'normalized': cleaner.normalize_name(name),
        'core': cleaner.extract_core_name(name),
    }
    for name in test_names
]

df = pd.DataFrame(results)
df

## Test String Similarity

In [None]:
def compare_names(name1, name2):
    """Score two names against each other with several fuzzywuzzy scorers.

    Both names are lower-cased before scoring. The returned dict holds the
    original inputs under ``name1``/``name2`` followed by one entry per
    scorer: ``ratio``, ``partial_ratio``, ``token_sort`` and ``token_set``.
    """
    left, right = name1.lower(), name2.lower()

    # Output key -> scorer, listed in the order the keys should appear
    scorers = {
        'ratio': fuzz.ratio,
        'partial_ratio': fuzz.partial_ratio,
        'token_sort': fuzz.token_sort_ratio,
        'token_set': fuzz.token_set_ratio,
    }

    report = {'name1': name1, 'name2': name2}
    for key, scorer in scorers.items():
        report[key] = scorer(left, right)
    return report

# Pairs of spellings that refer to the same company
comparisons = [
    ("Microsoft Corporation", "Microsoft Corp"),
    ("Amazon.com, Inc.", "Amazon Inc"),
    ("Google LLC", "Google Limited Liability Company"),
    ("Volkswagen AG", "Volkswagen Aktiengesellschaft")
]

# Score every pair and show the scores side by side
results = [compare_names(*pair) for pair in comparisons]
pd.DataFrame(results)

## Test PDF Extraction

In [None]:
# Example: Load and extract from a test PDF
# Replace 'test.pdf' with your actual test file

def extract_names_from_pdf(pdf_path):
    """Extract every non-blank text line from a PDF as a candidate name.

    Args:
        pdf_path: Path to the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A DataFrame with columns ``name`` (the stripped line) and ``page``
        (page number, counted from 1). Both columns are present even when
        no text lines are found.
    """
    rows = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if not text:
                # Nothing was extracted from this page; move on
                continue
            for line in text.split('\n'):
                stripped = line.strip()
                if stripped:
                    rows.append({'name': stripped, 'page': page_num})

    # Explicit columns keep the schema stable when `rows` is empty;
    # pd.DataFrame([]) on its own would have no columns at all.
    return pd.DataFrame(rows, columns=['name', 'page'])

# Example usage:
# df_names = extract_names_from_pdf('test.pdf')
# df_names.head(20)

## Batch Matching Test

In [None]:
# Look up each test company and score the top hit against the query
test_companies = [
    "Microsoft Corporation",
    "Apple Inc",
    "Amazon.com",
    "Alphabet Inc",
    "Meta Platforms"
]

results = []

for company in test_companies:
    print(f"Searching for: {company}")

    api_result = search_opencorporates(company)

    # `or` fallbacks cover both missing keys and keys present with a null value
    hits = ((api_result or {}).get('results') or {}).get('companies') or []

    if hits:
        # Only the first hit returned by the API is considered
        match = hits[0]['company']
        matched_name = match.get('name')

        results.append({
            'original': company,
            'matched': matched_name,
            'jurisdiction': match.get('jurisdiction_code'),
            'number': match.get('company_number'),
            # A null name is scored as an empty string instead of
            # crashing on None.lower()
            'similarity': fuzz.ratio(
                company.lower(),
                (matched_name or '').lower()
            )
        })
    else:
        results.append({
            'original': company,
            'matched': None,
            'jurisdiction': None,
            'number': None,
            'similarity': 0
        })

    # Rate limiting
    time.sleep(0.5)

pd.DataFrame(results)

## Export Test Results

In [None]:
# Write the most recent `results` list to a CSV file without the index column
csv_path = 'test_results.csv'
df_results = pd.DataFrame(results)
df_results.to_csv(csv_path, index=False)
print("Results exported to test_results.csv")

## References

- [OpenCorporates API Documentation](https://api.opencorporates.com/documentation/API-Reference)
- [FuzzyWuzzy Documentation](https://github.com/seatgeek/fuzzywuzzy)
- [pdfplumber Documentation](https://github.com/jsvine/pdfplumber)
- [pandas Documentation](https://pandas.pydata.org/docs/)