# 兑现模式下的特征工程

In [None]:
# set paths
from paths import DATA_DIR

In [7]:
# 导入所需库
import pandas as pd
import random
import json
import psycopg2
import os
from itertools import product
from datetime import datetime
from tqdm import tqdm

## 1. 数据获取
### 1.1 获取兑现样本数据

In [8]:
# Open the PostgreSQL connection; every connect option is read from the
# matching PG_* environment variable.
connection = psycopg2.connect(
    **{
        option: os.getenv(env_name)
        for option, env_name in (
            ("user", "PG_USER"),
            ("password", "PG_PASSWORD"),
            ("host", "PG_HOST"),
            ("port", "PG_PORT"),
            ("database", "PG_DB"),
        )
    }
)

# Cursor used by get_cash_data() to run its queries.
cursor = connection.cursor()

In [9]:
# Read the issue data from the "Sheet1" sheet of the workbook
label_issue = pd.read_excel(DATA_DIR / "议题数据.xlsx", sheet_name="Sheet1")

# Keyword groups behind the search conditions. Each entry is
# (joiner, keywords) and expands to one parenthesised clause of
# "text LIKE '%keyword%'" tests combined with the joiner.
# Order matters: get_cash_data() pairs the clause at position j with row j
# of label_issue.
_ISSUE_KEYWORDS = [
    ("OR", ("agriculture", "food")),
    ("OR", ("armed forces",)),
    ("OR", ("national security",)),
    ("OR", ("art", "culture", "religion")),
    ("OR", ("civil rights", "liberties", "minority issues")),
    ("OR", ("commerce",)),
    ("AND", ("congressional", "operations")),
    ("OR", ("crime", "law enforcement")),
    ("OR", ("economics", "public finance")),
    ("OR", ("education",)),
    ("OR", ("emergency management",)),
    ("OR", ("energy",)),
    ("OR", ("environmental protection",)),
    ("OR", ("finance", "financial sector")),
    ("OR", ("international trade",)),
    ("OR", ("international finance",)),
    ("OR", ("government operations", "politics")),
    ("OR", ("health",)),
    ("OR", ("housing", "community development")),
    ("OR", ("immigration",)),
    ("OR", ("alliance", "collective security")),
    ("OR", ("human rights",)),
    ("OR", ("foreign affairs",)),
    ("OR", ("labor", "employment")),
    ("OR", ("law",)),
    ("OR", ("public lands", "natural resources")),
    ("OR", ("science", "technology", "communications")),
    ("OR", ("social science", "history")),
    ("OR", ("social welfare",)),
    ("OR", ("taxation",)),
    ("OR", ("transportation", "public works")),
    ("AND", ("water resources", "development")),
]

# One SQL condition string per keyword group.
search_query = [
    "(" + f" {joiner} ".join(f"text LIKE '%{keyword}%'" for keyword in keywords) + ")"
    for joiner, keywords in _ISSUE_KEYWORDS
]

# Base SELECT; a search condition is appended directly after the WHERE.
select_query = "SELECT congress, number, type FROM texts WHERE "
# Common trailing filter. NOTE: the bill types listed here may need checking.
condition = " AND congress >= 115 AND type IN ('HR', 'S')"


In [10]:
def _rollback_failed_transaction():
    """Roll back the module-level connection after a failed statement.

    Unless autocommit is enabled, PostgreSQL rejects every further command on
    a connection whose transaction has failed until it is rolled back. The
    rollback is best-effort: if it fails too, the error is printed and not
    raised, so the caller's skip-and-continue behaviour is kept.
    """
    try:
        connection.rollback()
    except Exception as rollback_error:
        print(rollback_error)


def get_cash_data():
    """Collect the member/issue samples for every search condition.

    For each entry of ``search_query`` the function:

    1. selects the matching ``(congress, number, type)`` rows from ``texts``
       (``select_query`` + condition + ``condition``) and de-duplicates them;
    2. looks those bills up in ``bills`` and expands the ``sponsors`` and
       ``cosponsors`` JSON arrays into ``bioguideId`` values;
    3. labels every non-empty sponsor / cosponsor id with the issue found in
       the first column of ``label_issue`` at the same position as the
       search condition.

    A failing query is reported with ``print`` and skipped; the transaction is
    rolled back so that the remaining conditions can still be processed.

    Uses the module-level ``cursor``, ``connection``, ``search_query``,
    ``select_query``, ``condition`` and ``label_issue``.

    Returns:
        list: one ``[congress, introduced_date, member_id, issue]`` list per
        sponsor or cosponsor id found.
    """
    cash_data_list = []

    for j, query_part in enumerate(search_query):
        query = select_query + query_part + condition
        try:
            cursor.execute(query)
            rows = cursor.fetchall()
        except Exception as e:
            print(f"执行查询时出错: {query}")
            print(e)
            # Without this, every later query would fail on the aborted
            # transaction.
            _rollback_failed_transaction()
            continue

        # Drop duplicate (congress, number, type) rows.
        origin_data = list(set(rows))

        if not origin_data:
            continue

        # One "(congress AND type AND number)" test per bill, OR-ed together.
        # The values are interpolated as-is; they come from the texts query
        # above, not from user input.
        values_clause = " OR ".join(
            f"(congress = {congress} AND type = '{bill_type}' AND number = {number})"
            for congress, number, bill_type in origin_data
        )
        batch_query = f"""
        SELECT 
            congress, 
            introduced_date, 
            jsonb_array_elements(sponsors)->>'bioguideId' AS sponsor_id,
            jsonb_array_elements(cosponsors)->>'bioguideId' AS cosponsor_id
        FROM bills 
        WHERE {values_clause}
        """

        try:
            cursor.execute(batch_query)
            data = cursor.fetchall()
            if not data:
                continue

            # The issue label sits in the first column, on the row with the
            # same index as the search condition.
            issue = label_issue.iloc[j, 0]

            for congress, introduced_date, sponsor_id, cosponsor_id in data:
                # A row may carry a sponsor id, a cosponsor id, or both;
                # empty ones are skipped.
                if sponsor_id:
                    cash_data_list.append([congress, introduced_date, sponsor_id, issue])
                if cosponsor_id:
                    cash_data_list.append([congress, introduced_date, cosponsor_id, issue])
        except Exception as e:
            print(f"执行批量查询时出错: {batch_query}")
            print(e)
            # Same reason as above: clear the aborted transaction before the
            # next search condition.
            _rollback_failed_transaction()
            continue

    return cash_data_list

# Fetch the sample rows (runs all queries against the open connection)
cash_data_list = get_cash_data()

In [11]:
# Close the database connection: the cursor first, then the connection itself
cursor.close()
connection.close()

In [12]:
# Turn the collected rows into a DataFrame. The member id column is named
# "mid" in the output file.
cash_data = pd.DataFrame(
    cash_data_list,
    columns=["congress", "introduced_date", "mid", "issue"],
)

# Derive year and month by splitting the date text on "-" (first and second
# piece). The vectorised .str accessor yields a missing value when a piece is
# absent (e.g. a null date), where plain list indexing would raise IndexError.
date_parts = cash_data["introduced_date"].astype(str).str.split("-")
cash_data["year"] = date_parts.str[0]
cash_data["month"] = date_parts.str[1]

# Save to CSV without the index column.
cash_data.to_csv(DATA_DIR / "115-118_congress_data.csv", index=False)