In [1]:
"""
CONGRESSIONAL MEMBERS DATASET BUILDER - HISTORICAL + CURRENT
Per√≠odo: 2020-2026
Fuente:  GitHub
"""

import pandas as pd
import requests
import yaml
from datetime import datetime
import numpy as np

# ============================================================
# CONFIGURATION
# ============================================================

# Bounds of the analysis window (ISO date strings). Both ends are compared
# inclusively when deciding whether a term overlaps the period.
ANALYSIS_START = '2020-01-01'
ANALYSIS_END = '2026-12-31'
# Congress numbers that are kept when committee rows are expanded per congress.
RELEVANT_CONGRESSES = [116, 117, 118, 119]

# Browser-like User-Agent sent with every HTTP download.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}



# ============================================================
# HELPER: DOWNLOAD A LEGISLATORS YAML FILE
# ============================================================

def download_yaml(file_type):
    """
    Download and parse ``legislators-<file_type>.yaml``.

    Several candidate URLs are tried in order; the first one that responds
    successfully and parses to a list of records is returned.

    Args:
        file_type: 'current' or 'historical'.

    Returns:
        The parsed YAML payload (a list of legislator records), or ``None``
        when every URL fails. Callers test the result for truthiness.
    """
    print(f"\n‚Üí Descargando legislators-{file_type}.yaml...")

    # Try multiple URLs
    possible_urls = [
        f'https://raw.githubusercontent.com/unitedstates/congress-legislators/master/legislators-{file_type}.yaml',
        f'https://raw.githubusercontent.com/unitedstates/congress-legislators/main/legislators-{file_type}.yaml',
        f'https://theunitedstates.io/congress-legislators/legislators-{file_type}.yaml'
    ]

    for url in possible_urls:
        try:
            response = requests.get(url, headers=HEADERS, timeout=60)
            response.raise_for_status()
            data = yaml.safe_load(response.text)
        except (requests.RequestException, yaml.YAMLError) as e:
            print(f"   ‚ùå Fall√≥ {url[:50]}... ({e})")
            continue

        # An empty body parses to None and an HTML landing page can parse to a
        # plain string; neither is a usable list of legislators, so move on to
        # the next URL instead of handing the caller something it cannot use.
        if not isinstance(data, list):
            print(f"   ‚ùå Fall√≥ {url[:50]}... (contenido inesperado: {type(data).__name__})")
            continue

        print(f"   ‚úÖ Descargado: {len(data)} miembros desde {url[:50]}...")
        return data

    # Every candidate URL failed.
    return None
    


# ============================================================
# STEP 1: DOWNLOAD BOTH YAML FILES
# ============================================================

# Historical members first, then the currently serving ones.
historical_yaml = download_yaml('historical')
current_yaml = download_yaml('current')

# At least one of the two files is needed to build anything.
if not historical_yaml and not current_yaml:
    print("\n‚ùå ERROR CR√çTICO: No se pudieron descargar los YAMLs")
    print("   Descarga manualmente desde:")
    print("   https://github.com/unitedstates/congress-legislators")
    # `exit` is a helper injected by the `site` module for interactive use and
    # is not guaranteed to exist; raising SystemExit works everywhere.
    raise SystemExit(1)

# Combine both files, tagging every record with the file it came from.
all_legislators = []

for source_name, payload in (('historical', historical_yaml), ('current', current_yaml)):
    # A failed download yields None; treat it as "no records".
    for leg in payload or []:
        leg['source'] = source_name
        all_legislators.append(leg)


# ============================================================
# STEP 2: EXTRACT AND FILTER BY THE 2020-2026 PERIOD
# ============================================================


def _congress_number(moment):
    """Return the number of the Congress in session at *moment*.

    Modern Congresses convene on January 3 of odd years, so January 1-2 of an
    odd year still belong to the previous Congress.
    """
    year = moment.year
    if year % 2 == 1 and (moment.month, moment.day) < (1, 3):
        year -= 1
    return (year - 1789) // 2 + 1


# Parse the analysis window once instead of on every term comparison.
window_start = pd.Timestamp(ANALYSIS_START)
window_end = pd.Timestamp(ANALYSIS_END)

# Reference date for "still serving". A fixed cutoff such as 2025-01-01 would
# flag everyone whose term ended on 2025-01-03 as currently in office.
today = pd.Timestamp.now().normalize()

members_data = []
committees_data = []
skipped_records = 0

for legislator in all_legislators:
    try:
        # IDs
        ids = legislator.get('id', {})
        bioguide_id = ids.get('bioguide')

        if not bioguide_id:
            continue

        # Personal info
        name = legislator.get('name', {})
        bio = legislator.get('bio', {})

        # Terms
        terms = legislator.get('terms', [])

        if not terms:
            continue

        # Keep only the terms that overlap the analysis window, together with
        # their parsed (start, end) timestamps so they are not parsed again.
        relevant_terms = []
        relevant_spans = []
        for term in terms:
            term_start = pd.to_datetime(term.get('start'), errors='coerce')
            term_end = pd.to_datetime(term.get('end'), errors='coerce')

            if pd.isna(term_start) or pd.isna(term_end):
                continue

            if term_start <= window_end and term_end >= window_start:
                relevant_terms.append(term)
                relevant_spans.append((term_start, term_end))

        # No term inside the window: this legislator is out of scope.
        if not relevant_terms:
            continue

        # Use the last relevant term (terms are taken in file order).
        latest_term = relevant_terms[-1]
        latest_end = relevant_spans[-1][1]

        # Still serving only if that term has not ended yet.
        is_currently_serving = latest_end >= today

        # Flat record for the members table
        member_record = {
            # IDs
            'bioguide_id': bioguide_id,
            'govtrack_id': ids.get('govtrack'),
            'opensecrets_id': ids.get('opensecrets'),
            'fec_ids': str(ids.get('fec', [])),
            'wikipedia_id': ids.get('wikipedia'),
            'wikidata_id': ids.get('wikidata'),

            # Name
            'first_name': name.get('first'),
            'last_name': name.get('last'),
            'middle_name': name.get('middle'),
            'full_name': f"{name.get('first', '')} {name.get('last', '')}".strip(),
            'official_full': name.get('official_full'),
            'nickname': name.get('nickname'),

            # Bio
            'birthday': bio.get('birthday'),
            'gender': bio.get('gender'),
            'religion': bio.get('religion'),

            # Political info from the latest term inside the window
            'party': latest_term.get('party'),
            'state': latest_term.get('state'),
            'type': latest_term.get('type'),
            'district': latest_term.get('district'),
            'senate_class': latest_term.get('class'),
            'state_rank': latest_term.get('state_rank'),

            # Dates
            'first_relevant_term_start': relevant_terms[0].get('start'),
            'latest_term_start': latest_term.get('start'),
            'latest_term_end': latest_term.get('end'),
            'num_relevant_terms': len(relevant_terms),

            # Status
            'is_current': is_currently_serving,
            'source': legislator.get('source'),

            # Contact (from the latest term)
            'phone': latest_term.get('phone'),
            'fax': latest_term.get('fax'),
            'address': latest_term.get('address'),
            'office': latest_term.get('office'),
            'website': latest_term.get('url'),
            'contact_form': latest_term.get('contact_form'),
            'rss_url': latest_term.get('rss_url'),

            # Social media
            'twitter': ids.get('twitter'),
            'facebook': ids.get('facebook'),
            'youtube': ids.get('youtube'),
            'instagram': ids.get('instagram'),
        }

        members_data.append(member_record)

        # Committees from ALL relevant terms, one row per congress covered
        for term, (term_start, term_end) in zip(relevant_terms, relevant_spans):
            committees = term.get('committees', [])
            if not committees:
                continue

            start_congress = _congress_number(term_start)
            # A term ends on the day the next Congress convenes (e.g.
            # 2021-01-03), so take the last day actually served; otherwise the
            # term would be credited with one congress too many.
            end_congress = max(
                start_congress,
                _congress_number(term_end - pd.Timedelta(days=1)),
            )
            covered_congresses = [
                congress
                for congress in range(start_congress, end_congress + 1)
                if congress in RELEVANT_CONGRESSES
            ]

            for committee in committees:
                for congress in covered_congresses:
                    committees_data.append({
                        'bioguide_id': bioguide_id,
                        'congress': congress,
                        'committee': committee,
                        'term_start': term.get('start'),
                        'term_end': term.get('end'),
                        'party': term.get('party'),
                        'state': term.get('state')
                    })

    except Exception as e:
        # Best effort: one malformed record must not abort the whole build,
        # but it must not vanish without a trace either.
        skipped_records += 1
        print(f"   Registro omitido por error: {e!r}")
        continue

if skipped_records:
    print(f"   Registros omitidos por error: {skipped_records}")

# Build DataFrames
members_df = pd.DataFrame(members_data)
committees_df = pd.DataFrame(committees_data)



# ============================================================
# STEP 3: FEATURE ENGINEERING
# ============================================================

# Age: calendar-year difference against two reference years.
birth_dates = pd.to_datetime(members_df['birthday'], errors='coerce')
members_df['birthday'] = birth_dates
birth_year = birth_dates.dt.year
for reference_year in (2024, 2020):
    members_df[f'age_{reference_year}'] = reference_year - birth_year

# Parse the term date columns.
for date_col in ('first_relevant_term_start', 'latest_term_start', 'latest_term_end'):
    members_df[date_col] = pd.to_datetime(members_df[date_col])

# Seniority: span from the first relevant term start to the latest term end,
# expressed in 365-day years.
served = members_df['latest_term_end'] - members_df['first_relevant_term_start']
members_df['seniority_years'] = (served.dt.days / 365).round(2)

# Readable chamber label and one-letter party code; values missing from the
# lookup tables map to NaN.
chamber_labels = {
    'sen': 'Senate',
    'rep': 'House'
}
party_codes = {
    'Republican': 'R',
    'Democrat': 'D',
    'Independent': 'I'
}
members_df['chamber'] = members_df['type'].map(chamber_labels)
members_df['party_clean'] = members_df['party'].map(party_codes)

# Chamber flags
members_df['is_senate'] = members_df['type'].eq('sen')
members_df['is_house'] = members_df['type'].eq('rep')

# Retirement info: year the latest term ends, and whether a member who is no
# longer current left in 2020 or later.
members_df['retirement_year'] = members_df['latest_term_end'].dt.year
no_longer_serving = ~members_df['is_current']
members_df['retired_during_period'] = (
    no_longer_serving & members_df['retirement_year'].ge(2020)
)


# ============================================================
# STEP 4: AGGREGATE COMMITTEES
# ============================================================

if committees_df.empty:
    # No committee rows were collected: add the columns with neutral values.
    members_df['committees_list'] = None
    members_df['num_committees'] = 0
else:
    per_member = committees_df.groupby('bioguide_id')['committee']

    # Sorted, de-duplicated committee names joined into one string per member
    committees_by_member = (
        per_member.apply(lambda names: '; '.join(sorted(set(names))))
        .rename('committees_list')
        .reset_index()
    )

    # Number of distinct committees per member
    committees_count = per_member.nunique().rename('num_committees').reset_index()

    # Left-merge both summaries so members without committees are kept
    for summary in (committees_by_member, committees_count):
        members_df = members_df.merge(summary, on='bioguide_id', how='left')

    print(f"‚úÖ Comit√©s agregados a {members_df['committees_list'].notna().sum()} miembros")

# ============================================================
# STEP 5: ORGANIZE AND EXPORT
# ============================================================

# Output columns, grouped by topic; the groups are flattened in this order.
# NOTE(review): 'wikidata_id' is collected per member but is not listed here,
# so it never reaches the CSV - confirm whether that is intentional.
column_groups = {
    'ids': [
        'bioguide_id', 'govtrack_id', 'opensecrets_id', 'fec_ids', 'wikipedia_id',
    ],
    'personal': [
        'full_name', 'first_name', 'last_name', 'middle_name', 'nickname',
        'official_full', 'birthday', 'age_2024', 'age_2020', 'gender', 'religion',
    ],
    'political': [
        'party', 'party_clean', 'state', 'chamber', 'type', 'district',
        'senate_class', 'state_rank',
    ],
    'terms_and_status': [
        'first_relevant_term_start', 'latest_term_start', 'latest_term_end',
        'num_relevant_terms', 'seniority_years',
        'is_current', 'retirement_year', 'retired_during_period',
    ],
    'committees': ['committees_list', 'num_committees'],
    'flags': ['is_senate', 'is_house'],
    'source': ['source'],
    'contact': [
        'phone', 'fax', 'address', 'office', 'website', 'contact_form', 'rss_url',
    ],
    'social_media': ['twitter', 'facebook', 'youtube', 'instagram'],
}
column_order = [col for group in column_groups.values() for col in group]

# Keep only the columns that are actually present, in the order above.
existing_cols = [col for col in column_order if col in members_df.columns]
members_final = members_df.loc[:, existing_cols].copy()

# Main export
output_file = 'members_2020_2026.csv'
members_final.to_csv(output_file, index=False)
print(f"\n‚úÖ Dataset principal guardado: {output_file}")
print(f"   Filas: {len(members_final)}")
print(f"   Columnas: {len(members_final.columns)}")

# Detailed committee export, written only when there is something to write
if not committees_df.empty:
    committees_output = 'committees_by_congress_2020_2026.csv'
    committees_df.to_csv(committees_output, index=False)
    print(f"\n‚úÖ Comit√©s detallados guardados: {committees_output}")
    print(f"   Filas: {len(committees_df)}")




‚Üí Descargando legislators-historical.yaml...
   ‚úÖ Descargado: 12225 miembros desde https://raw.githubusercontent.com/unitedstates/con...

‚Üí Descargando legislators-current.yaml...
   ‚úÖ Descargado: 537 miembros desde https://raw.githubusercontent.com/unitedstates/con...

‚úÖ Dataset principal guardado: members_2020_2026.csv
   Filas: 794
   Columnas: 48


In [2]:
"""
INTEGRAR COMIT√âS 
"""
import pandas as pd
import requests
import yaml

# Browser-like User-Agent sent with every HTTP download.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}



# 1. Load the existing dataset

members = pd.read_csv('members_2020_2026.csv')


# 2. Download committee-membership

url_membership = 'https://raw.githubusercontent.com/unitedstates/congress-legislators/master/committee-membership-current.yaml'

try:
    response = requests.get(url_membership, headers=HEADERS, timeout=60)
    response.raise_for_status()
    membership_data = yaml.safe_load(response.text)
except (requests.RequestException, yaml.YAMLError) as e:
    # Nothing below can run without this file: say why and stop with a
    # non-zero status instead of exiting silently as if all went well.
    print(f"ERROR: no se pudo descargar committee-membership-current.yaml: {e}")
    raise SystemExit(1) from e

# The rest of the script iterates this payload with .items(), so it must be a
# mapping (committee id -> list of members).
if not isinstance(membership_data, dict):
    print(
        "ERROR: committee-membership-current.yaml tiene un formato inesperado "
        f"({type(membership_data).__name__})"
    )
    raise SystemExit(1)

print(f"‚úÖ {len(membership_data)} comit√©s descargados")

# 3. Download committees-current to map committee IDs to names

url_committees = 'https://raw.githubusercontent.com/unitedstates/congress-legislators/master/committees-current.yaml'

# Committee (or subcommittee) id -> human-readable name. Stays empty when the
# download fails, in which case raw ids are used further down.
committee_names = {}

try:
    response = requests.get(url_committees, headers=HEADERS, timeout=60)
    response.raise_for_status()
    committees_data = yaml.safe_load(response.text)

    # Build the id -> name mapping
    for committee in committees_data or []:
        if not isinstance(committee, dict):
            continue

        thomas_id = committee.get('thomas_id', '')
        if not thomas_id:
            continue

        name = committee.get('name', thomas_id)
        committee_names[thomas_id] = name

        # The membership file keys subcommittees by parent id + subcommittee
        # id (e.g. 'HSAG' + '15'). Map those too, otherwise they would show up
        # as raw codes. The parent name is prepended because subcommittee
        # names alone are not unique across committees.
        for sub in committee.get('subcommittees') or []:
            if not isinstance(sub, dict) or not sub.get('thomas_id'):
                continue
            sub_key = f"{thomas_id}{sub['thomas_id']}"
            sub_name = sub.get('name', sub_key)
            committee_names[sub_key] = f"{name} - {sub_name}"

except Exception as e:
    # Deliberate best effort: names are cosmetic, so fall back to raw ids.
    print(f"‚ö†Ô∏è  No se pudo descargar nombres de comit√©s: {e}")
    print("   Usando IDs en lugar de nombres")

# 4. Extract memberships

# bioguide id -> list of committee names (may contain repeats at this stage)
bioguide_to_committees = {}
total_memberships = 0

for committee_id, members_list in membership_data.items():
    # Prefer the readable name; fall back to the raw id when it is unknown
    committee_name = committee_names.get(committee_id, committee_id)

    if not isinstance(members_list, list):
        continue

    for member in members_list:
        # Only entries that are dicts carrying a 'bioguide' key are counted
        if not (isinstance(member, dict) and 'bioguide' in member):
            continue

        bioguide = member['bioguide']
        bioguide_to_committees.setdefault(bioguide, []).append(committee_name)
        total_memberships += 1



# 5. Build the per-member committees DataFrame

committee_rows = []
for bioguide, committees in bioguide_to_committees.items():
    # De-duplicate once; the same unique set feeds both the list and the count
    unique_committees = sorted(set(committees))
    committee_rows.append({
        'bioguide_id': bioguide,
        'committees_list': '; '.join(unique_committees),
        'num_committees': len(unique_committees),
    })

committees_df = pd.DataFrame(committee_rows)


# 6. Merge with the main dataset

# Drop committee columns left over from the previous export, if present, so
# the merge below does not produce suffixed duplicates.
stale_cols = [
    col for col in ('committees_list', 'num_committees') if col in members.columns
]
if stale_cols:
    members = members.drop(columns=stale_cols)

# Left merge keeps every member, including those without committee data
members = pd.merge(members, committees_df, how='left', on='bioguide_id')

# Members absent from the committee data get a count of 0
members['num_committees'] = members['num_committees'].fillna(0).astype(int)

# 7. Save

output_file = 'members_2020_2026_WITH_COMMITTEES.csv'
members.to_csv(output_file, index=False)



# 8. Verification: compare the ids in the dataset with the ids that have
# committee data.
bioguides_in_dataset = set(members['bioguide_id'].values)
bioguides_in_committees = set(bioguide_to_committees)

matched = bioguides_in_dataset.intersection(bioguides_in_committees)

print(f"   Miembros en dataset: {len(bioguides_in_dataset)}")
print(f"   Miembros en comit√©s: {len(bioguides_in_committees)}")
print(f"   Match exitoso: {len(matched)}")
print(f"   En dataset pero sin comit√©s: {len(bioguides_in_dataset - bioguides_in_committees)}")

print(f"\nüìÅ Archivo guardado: {output_file}")
print(f"\nüéØ ¬°DATASET COMPLETO CON COMIT√âS!")

print("\n" + "=" * 70)

‚úÖ 230 comit√©s descargados
   Miembros en dataset: 794
   Miembros en comit√©s: 530
   Match exitoso: 530
   En dataset pero sin comit√©s: 264

üìÅ Archivo guardado: members_2020_2026_WITH_COMMITTEES.csv

üéØ ¬°DATASET COMPLETO CON COMIT√âS!

