In [13]:
import numpy as np
import pandas as pd

# 总人口基数
total_population = 1_000_000  # 模拟 100 万男性样本

age_weights = np.concatenate([
    np.full(17, 0.2),   # 0-16 岁，约 20%
    np.full(44, 0.5),   # 17-60 岁，约 50%
    np.full(60, 0.3)    # 61-120 岁，约 30%
])
age_weights = age_weights / age_weights.sum() 

# 模拟候选人数据
np.random.seed(42)
df = pd.DataFrame({
    "age": np.random.choice(range(121), size=total_population, p=age_weights), 
    "height": np.random.randint(150, 201, size=total_population),  # 身高 150-200 cm
    "education": np.random.choice(
        ["初中及以下", "高中", "大专", "本科", "研究生", "博士"],
        size=total_population,
        p=[0.4, 0.3, 0.2, 0.07, 0.025, 0.005]
    ),
    "family_asset": np.random.choice(
        ["<10万", "10-50万", "50-100万", "100-500万", ">500万"],
        size=total_population,
        p=[0.6, 0.25, 0.1, 0.04, 0.01]
    ),
    "personal_income": np.random.choice(
        ["<10万", "10-50万", "50-100万", "100-500万", ">500万"],
        size=total_population,
        p=[0.6, 0.25, 0.1, 0.04, 0.01]
    ),
    "face_score": np.random.randint(1, 5, size=total_population),  # 1-4 分
    "humor_score": np.random.randint(1, 5, size=total_population),
    "sex_attract_score": np.random.randint(1, 5, size=total_population),
    "body_score": np.random.randint(1, 5, size=total_population),
    "health_status": np.random.choice(["健康", "健康状况不佳"], size=total_population, p=[0.9, 0.1]),
    "religion": np.random.choice(["无信仰", "有宗教信仰"], size=total_population, p=[0.95, 0.05]),
    "marital_status": np.random.choice(
        ["未婚", "离异无孩子", "离异有孩子", "已婚"],
        size=total_population,
        p=[0.8, 0.1, 0.08, 0.02]
    ),
    "property_status": np.random.choice(
        ["无房产", "有房有贷款", "有房无贷款"],
        size=total_population,
        p=[0.5, 0.3, 0.2]
    ),
    "hometown": np.random.choice(
        ["农村", "县城", "一线城市", "二线城市"],
        size=total_population,
        p=[0.5, 0.3, 0.1, 0.1]
    ),
    "current_location": np.random.choice(
        ["农村", "县城", "一线城市", "二线城市"],
        size=total_population,
        p=[0.4, 0.3, 0.15, 0.15]
    ),
    "smoking_habit": np.random.choice(
        ["不吸烟", "偶尔吸烟", "经常吸烟"],
        size=total_population,
        p=[0.7, 0.2, 0.1]
    ),
    "drinking_habit": np.random.choice(
        ["禁酒", "偶尔喝", "经常喝"],
        size=total_population,
        p=[0.5, 0.4, 0.1]
    ),
    "vision": np.random.choice(
        ["不近视", "近视低于400度", "近视高于400度"],
        size=total_population,
        p=[0.3, 0.5, 0.2]
    )
})

In [14]:
data = df.copy()

# 数值化映射
columns_to_encode = [
    "education", "family_asset", "personal_income", "health_status",
    "religion", "marital_status", "property_status",
    "hometown", "current_location", "smoking_habit", "drinking_habit", "vision"
]

mappings = {}
for col in columns_to_encode:
    unique_values = data[col].unique()
    mappings[col] = {value: idx for idx, value in enumerate(unique_values)}

# 替换原数据为数字编码
for col, mapping in mappings.items():
    data[col] = data[col].map(mapping)

# 保存数值化后的数据
data.to_csv("data/data_numeric.csv", index=False)
print("Numeric data saved to 'data_numeric.csv'.")

# 保存映射表
import json
with open("data/mappings.json", "w", encoding="utf-8") as f:
    json.dump(mappings, f, ensure_ascii=False, indent=4)
print("Mappings saved to 'mappings.json'.")

# 保存为压缩格式
data.to_parquet("data/data_numeric.parquet", index=False, compression="snappy")
print("Data saved to 'data_numeric.parquet'.")

# 保存为紧凑的 JSON 格式
data.to_json("data/data_numeric.json", orient="records", force_ascii=False)
print("Data saved to 'data_numeric.json'.")


Numeric data saved to 'data_numeric.csv'.
Mappings saved to 'mappings.json'.
Data saved to 'data_numeric.parquet'.
Data saved to 'data_numeric.json'.


In [15]:
data.sample(10)

Unnamed: 0,age,height,education,family_asset,personal_income,face_score,humor_score,sex_attract_score,body_score,health_status,religion,marital_status,property_status,hometown,current_location,smoking_habit,drinking_habit,vision
994118,91,200,3,1,1,1,1,2,1,1,0,0,2,0,2,1,0,2
135515,28,176,2,1,2,1,4,4,3,1,0,1,0,2,2,0,0,2
586425,22,183,2,1,2,4,2,1,4,1,0,0,0,2,2,1,0,1
760223,54,171,1,0,1,3,1,2,4,1,0,0,2,2,3,1,0,1
46483,60,172,3,2,1,4,1,4,2,1,0,0,2,1,1,1,1,2
686687,103,159,3,1,1,4,2,3,3,1,0,0,2,0,2,1,1,2
275259,39,198,3,1,2,2,2,4,3,1,0,2,0,3,0,1,1,2
582943,32,155,3,1,1,2,2,4,1,1,0,0,2,0,0,0,0,2
178578,40,200,1,1,1,2,2,3,1,1,0,0,2,1,2,1,0,2
565753,45,182,1,1,2,2,4,1,3,0,0,0,0,0,3,2,2,0


In [16]:
def filter_candidates(data, criteria):
    filtered = data[
        (data["age"] >= criteria["age"][0]) & (data["age"] <= criteria["age"][1]) &
        (data["height"] >= criteria["height"][0]) & (data["height"] <= criteria["height"][1]) &
        (data["education"] >= criteria["min_education"]) &
        (data["family_asset"] >= criteria["min_family_asset"]) &
        (data["personal_income"] >= criteria["min_personal_income"]) &
        (data["face_score"] >= criteria["min_face_score"]) &
        (data["humor_score"] >= criteria["min_humor_score"]) &
        (data["sex_attract_score"] >= criteria["min_sex_attract_score"]) &
        (data["body_score"] >= criteria["min_body_score"]) &
        (data["health_status"] == criteria["health_status"]) &
        (data["religion"] == criteria["religion"]) &
        (data["marital_status"].isin(criteria["marital_status"])) &
        (data["property_status"].isin(criteria["property_status"])) &
        (data["hometown"].isin(criteria["hometown"])) &
        (data["current_location"].isin(criteria["current_location"])) &
        (data["smoking_habit"].isin(criteria["smoking_habit"])) &
        (data["drinking_habit"].isin(criteria["drinking_habit"])) &
        (data["vision"].isin(criteria["vision"]))
    ]
    return filtered


In [17]:
criteria = {
    "age": (25, 35),  # 年龄区间
    "height": (170, 185),  # 身高区间
    "min_education": "本科",  # 最低受教育程度
    "min_family_asset": "50-100万",  # 最低家庭净资产
    "min_personal_income": "50-100万",  # 最低个人年收入
    "min_face_score": 2,  # 最低颜值评分
    "min_humor_score": 2,  # 最低幽默评分
    "min_sex_attract_score": 2,  # 最低性吸引力评分
    "min_body_score": 2,  # 最低身材评分
    "health_status": "健康",  # 健康状况
    "religion": "无信仰",  # 宗教信仰
    "marital_status": ["未婚", "离异无孩子"],  # 婚姻状况
    "property_status": ["有房有贷款", "有房无贷款"],  # 房产拥有情况
    "hometown": ["一线城市", "二线城市"],  # 家庭所在地
    "current_location": ["一线城市", "二线城市"],  # 个人目前所在地
    "smoking_habit": ["不吸烟", "偶尔吸烟"],  # 吸烟习惯
    "drinking_habit": ["禁酒", "偶尔喝"],  # 饮酒习惯
    "vision": ["不近视", "近视低于400度"]  # 视力状况
}

In [18]:
eligible_candidates = filter_candidates(df, criteria)

print(f"总候选人数：{len(data)}")
print(f"符合条件人数：{len(eligible_candidates)}")
print(f"符合条件比例：{len(eligible_candidates) / len(data):.4%}")

总候选人数：1000000
符合条件人数：40
符合条件比例：0.0040%
