### 议员选区数据确定

In [None]:
# set paths
from paths import DATA_DIR

In [17]:
# 导入所需库
import pandas as pd
import random
import json
import psycopg2
import os
from itertools import product
from datetime import datetime
from tqdm import tqdm

In [18]:
# Open the PostgreSQL connection. Every setting is read from the environment;
# os.getenv returns None for a variable that is not set, and that None is
# passed straight through to psycopg2.
connection = psycopg2.connect(
    **{
        option: os.getenv(env_var)
        for option, env_var in (
            ("user", "PG_USER"),
            ("password", "PG_PASSWORD"),
            ("host", "PG_HOST"),
            ("port", "PG_PORT"),
            ("database", "PG_DB"),
        )
    }
)

# Cursor through which the query is executed.
cursor = connection.cursor()

In [19]:
# Fetch one row per member from the `members` table, shaping the data in SQL:
#   - age:   2024 - birth_year, or NULL when birth_year is NULL
#   - honor: the 'honor_name' key of name_info (cast to jsonb) when that key
#            exists, otherwise NULL
#   - terms: returned as stored; it is unpacked by later cells
#
# The query is a plain string rather than an f-string: nothing is
# interpolated, and an f-string would treat any future `{`/`}` in the SQL as
# a placeholder.
query = """
    SELECT 
        bioguide_id, 
        full_name, 
        chamber, 
        CASE 
            WHEN birth_year IS NOT NULL THEN 2024 - birth_year 
            ELSE NULL 
        END AS age, 
        party, 
        state_code, 
        district, 
        CASE 
            WHEN name_info::jsonb ? 'honor_name' THEN name_info::jsonb->>'honor_name' 
            ELSE NULL 
        END AS honor,
        terms
    FROM 
        members
"""

# Run the query and pull every result row into memory.
cursor.execute(query)
members_data = cursor.fetchall()

In [20]:
# Release the database handles: the cursor first, then the connection that
# created it.
for _db_handle in (cursor, connection):
    _db_handle.close()

In [21]:
# Wrap the fetched rows in a DataFrame. The labels are listed in the same
# order as the SELECT list so each tuple position gets the matching name.
columns = [
    "bioguide_id",
    "full_name",
    "chamber",
    "age",
    "party",
    "state_code",
    "district",
    "honor",
    "terms",
]
df_members = pd.DataFrame(data=members_data, columns=columns)


In [22]:
import json
import pandas as pd

# Exploratory check: look at the first member's `terms` value to see whether
# it arrives as a JSON string or as an already-parsed Python object.
first_member_terms, first_member_name, first_member_id = (
    df_members[field].iloc[0]
    for field in ('terms', 'full_name', 'bioguide_id')
)

print("=== 议员信息 ===")
print(f"姓名: {first_member_name}")
print(f"ID: {first_member_id}")

print("\n=== Terms原始数据类型 ===")
print(f"数据类型: {type(first_member_terms)}")

if not isinstance(first_member_terms, str):
    # Already parsed: tabulate a list, print anything else verbatim.
    print("Terms数据已经是解析后的格式:")
    if isinstance(first_member_terms, list):
        terms_df = pd.DataFrame(first_member_terms)
        print(terms_df.to_string(index=False))
    else:
        print(first_member_terms)
else:
    # A string is expected to hold JSON; decode it before tabulating.
    try:
        terms_data = json.loads(first_member_terms)
    except json.JSONDecodeError as parse_error:
        print(f"JSON解析失败: {parse_error}")
        print(f"原始数据: {first_member_terms}")
    else:
        print(f"\n=== {first_member_name} 的任期信息 ===")

        # A DataFrame makes the individual terms easy to read.
        terms_df = pd.DataFrame(terms_data)
        print(terms_df.to_string(index=False))

        # Summary figures; the optional columns are reported only if present.
        print("\n=== 任期统计 ===")
        print(f"总任期数: {len(terms_df)}")
        if 'congress' in terms_df:
            print(f"国会届次范围: {terms_df['congress'].min()} - {terms_df['congress'].max()}")
        if 'chamber' in terms_df:
            print(f"议院: {terms_df['chamber'].unique()}")

=== 议员信息 ===
姓名: Rep. Clarke, James McC. [D-NC-11]
ID: C000462

=== Terms原始数据类型 ===
数据类型: <class 'list'>
Terms数据已经是解析后的格式:
                 chamber  endYear  congress  district  startYear stateCode      stateName     memberType
House of Representatives     1985        98        11       1983        NC North Carolina Representative
House of Representatives     1989       100        11       1987        NC North Carolina Representative
House of Representatives     1991       101        11       1989        NC North Carolina Representative


In [23]:
import json
import pandas as pd

def expand_member_terms(df_members):
    """Flatten each member's ``terms`` into one row per term.

    Args:
        df_members: DataFrame with a ``bioguide_id`` column and a ``terms``
            column. A ``terms`` value is either a list of term dicts or a
            JSON string that decodes to such a list.

    Returns:
        A DataFrame with the columns ``bioguide_id``, ``congress``,
        ``chamber``, ``stateCode`` and ``district``, holding one row for
        every term that has a ``congress`` value. The columns are present
        even when no row qualifies, so callers can index them
        unconditionally.

    Data that cannot be interpreted is skipped rather than raising: a
    ``terms`` value that is not a list (directly or after JSON decoding),
    a string that is not valid JSON, a term entry that is not a dict, and a
    term without a ``congress`` value.
    """
    output_columns = ['bioguide_id', 'congress', 'chamber', 'stateCode', 'district']
    expanded_rows = []

    # Only two columns are needed, so walk them in lockstep instead of
    # materialising a Series per row with iterrows().
    for bioguide_id, terms_data in zip(df_members['bioguide_id'], df_members['terms']):
        if isinstance(terms_data, str):
            try:
                terms_data = json.loads(terms_data)
            except json.JSONDecodeError:
                continue  # not valid JSON: nothing to expand for this member

        # Covers missing values and JSON that decoded to a non-list
        # (e.g. "null" or a single object).
        if not isinstance(terms_data, list):
            continue

        for term in terms_data:
            if not isinstance(term, dict):
                continue  # malformed entry; .get() below needs a dict

            congress = term.get('congress')
            if congress is None:
                continue  # a row without its congress number is not usable

            expanded_rows.append({
                'bioguide_id': bioguide_id,
                'congress': congress,
                'chamber': term.get('chamber'),
                'stateCode': term.get('stateCode'),
                'district': term.get('district'),
            })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(expanded_rows, columns=output_columns)

# Flatten every member's terms into one row per (member, term).
expanded_terms_df = expand_member_terms(df_members)

# Report how the expansion went: input and output sizes, followed by the
# distinct congress numbers and chambers found in the result.
print("=== 展开结果统计 ===")

n_members_in = len(df_members)
print(f"原始议员数量: {n_members_in}")

n_term_rows = len(expanded_terms_df)
print(f"展开后记录数量: {n_term_rows}")

congresses_seen = sorted(expanded_terms_df['congress'].unique())
print(f"包含的国会届次: {congresses_seen}")

chambers_seen = expanded_terms_df['chamber'].unique()
print(f"包含的议院: {chambers_seen}")

=== 展开结果统计 ===
原始议员数量: 2228
展开后记录数量: 15251
包含的国会届次: [73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119]
包含的议院: ['House of Representatives' 'Senate']


In [24]:

# Preview the flattened table.
print("\n=== 前10行展开结果 ===")
print(expanded_terms_df.head(10))

# Row counts per congress, in ascending congress order.
print("\n=== 按congress统计 ===")
print(expanded_terms_df['congress'].value_counts().sort_index())

# Row counts per chamber.
print("\n=== 按chamber统计 ===")
print(expanded_terms_df['chamber'].value_counts())

# Persist the flattened table. The confirmation reports the path that was
# actually written instead of a hard-coded location, so it stays correct
# whatever DATA_DIR points to.
output_csv_path = DATA_DIR / "议员选区数据.csv"
expanded_terms_df.to_csv(output_csv_path, index=False)
print(f"\n展开的议员任期数据已保存到: {output_csv_path}")

# Show the resulting schema.
print("\n=== 新DataFrame列信息 ===")
print(f"列名: {list(expanded_terms_df.columns)}")
print("数据类型:")
print(expanded_terms_df.dtypes)


=== 前10行展开结果 ===
  bioguide_id  congress                   chamber stateCode  district
0     C000462        98  House of Representatives        NC      11.0
1     C000462       100  House of Representatives        NC      11.0
2     C000462       101  House of Representatives        NC      11.0
3     G000132        95  House of Representatives        MO       3.0
4     G000132        96  House of Representatives        MO       3.0
5     G000132        97  House of Representatives        MO       3.0
6     G000132        98  House of Representatives        MO       3.0
7     G000132        99  House of Representatives        MO       3.0
8     G000132       100  House of Representatives        MO       3.0
9     G000132       101  House of Representatives        MO       3.0

=== 按congress统计 ===
congress
73       1
74       2
75       3
76       4
77       7
78       9
79      12
80      11
81      21
82      20
83      28
84      34
85      42
86      60
87      80
88     110
89    