In [1]:
import random
import pandas as pd
from faker import Faker
import numpy as np
from datetime import date

In [2]:
# Shared Faker generator for the zh_CN locale; the helper functions in this
# notebook call it for names, phone numbers and addresses.
fake = Faker('zh_CN')

In [3]:
def generate_start_year_month():
    """Draw a random (year, month) pair marking the start of a stay.

    The year is uniform over 2011-2025 inclusive. The month is picked by
    first selecting a band from a single uniform draw and then choosing
    uniformly inside that band:

    * draw < 0.18          -> months 1-3
    * 0.18 <= draw < 0.5   -> months 4-6
    * 0.5 <= draw < 0.85   -> months 8-10
    * otherwise            -> one of 7, 11, 12

    Returns:
        tuple[int, int]: ``(start_year, start_month)``.
    """
    year = random.randint(2011, 2025)
    draw = random.random()

    # (exclusive upper bound on the draw, inclusive month range)
    month_bands = ((0.18, (1, 3)), (0.5, (4, 6)), (0.85, (8, 10)))
    for upper_bound, (first_month, last_month) in month_bands:
        if draw < upper_bound:
            return year, random.randint(first_month, last_month)

    # The remaining probability mass covers the months outside the bands.
    return year, random.choice([7, 11, 12])

In [4]:
def generate_stay_duration():
    """Sample how long a person stays, as whole years plus whole months.

    Years come from a normal draw with mean 1 and standard deviation 2,
    months from a normal draw with mean 3 and standard deviation 3. Each
    draw is truncated toward zero with ``int`` and negative results are
    clamped to 0.

    Returns:
        tuple[int, int]: ``(stay_years, stay_months)``, both >= 0.
    """
    def _non_negative_int(mean, spread):
        # int() truncates toward zero; anything below zero becomes 0.
        sample = int(np.random.normal(mean, spread))
        return sample if sample > 0 else 0

    years = _non_negative_int(1, 2)
    months = _non_negative_int(3, 3)
    return years, months

In [5]:
def generate_education():
    """Pick an education level according to a fixed probability table.

    Returns:
        str: one of the level labels listed in ``weighted_levels``.
    """
    # (label, probability) pairs; the probabilities add up to 1.
    weighted_levels = (
        ('初中以下', 0.405),
        ('初中', 0.25),
        ('高中', 0.2),
        ('专科', 0.05),
        ('本科', 0.03),
        ('研究生', 0.015),
        ('博士', 0.001),
        ('其他', 0.049),
    )
    labels = [label for label, _ in weighted_levels]
    weights = [weight for _, weight in weighted_levels]
    picked = np.random.choice(labels, p=weights)
    # Convert the NumPy string scalar into a plain Python str.
    return str(picked)

In [6]:
def generate_gender():
    """Return '男' with probability 0.55, otherwise '女'."""
    if random.random() < 0.55:
        return '男'
    return '女'

def generate_name(gender):
    """Return a fake full name matching the given gender.

    Args:
        gender: ``"男"`` selects a male name; any other value selects a
            female name.

    Returns:
        The name produced by the shared ``fake`` generator.
    """
    if gender == "男":
        return fake.name_male()
    return fake.name_female()

def generate_age():
    """Sample an age from a normal draw (mean 35, std 15) limited to 18-100.

    The draw is truncated toward zero with ``int``; values below 18 are
    raised to 18 and values above 100 are lowered to 100.

    Returns:
        int: an age in the closed range [18, 100].
    """
    sampled = int(np.random.normal(35, 15))
    if sampled < 18:
        return 18
    if sampled > 100:
        return 100
    return sampled

In [7]:
# Province-level name -> first four digits used for the ID-card area code.
# Insertion order is kept as-is because generate_huji() samples from the keys.
area_codes = {
    "北京市": "1100",
    "天津市": "1200",
    "河北省": "1300",
    "山西省": "1400",
    "内蒙古自治区": "1500",
    "辽宁省": "2100",
    "吉林省": "2200",
    "黑龙江省": "2300",
    "上海市": "3100",
    "江苏省": "3200",
    "浙江省": "3300",
    "安徽省": "3400",
    "福建省": "3500",
    "江西省": "3600",
    "山东省": "3700",
    "河南省": "4100",
}


def generate_huji():
    """Pick one province name uniformly at random from ``area_codes``."""
    provinces = list(area_codes)
    return random.choice(provinces)


def hukou_location(hukou, max_attempts=1000):
    """Return a fake address that contains the given province name.

    Addresses are drawn from the shared ``fake`` generator until one
    contains ``hukou`` as a substring (rejection sampling).

    The number of draws is capped. Previously the only way out of the loop
    for a non-matching address was ``len(address) > 100``, so a province
    name the generator never emits made the call spin forever.

    Args:
        hukou: Province name that must appear in the returned address.
        max_attempts: Maximum number of addresses to draw. At least one
            address is always drawn, even if this is less than 1.

    Returns:
        The first drawn address containing ``hukou``. If none of the
        ``max_attempts`` draws matches, the last drawn address is returned
        as a best effort (it does NOT contain ``hukou``).
    """
    address = fake.address()
    # One draw has already happened; allow at most max_attempts - 1 retries.
    for _ in range(max_attempts - 1):
        if hukou in address:
            break
        address = fake.address()
    return address

In [8]:
def generate_id_card(age, gender, area):
    """Build an 18-character ID-card style number for a synthetic person.

    Layout: 6-digit area code + 8-digit birth date (YYYYMMDD) + 3-digit
    sequence code + 1 check character.

    Args:
        age: Age in years; the birth year is today's year minus ``age``.
        gender: ``'女'`` yields an even last sequence digit, anything else
            an odd one.
        area: Key into ``area_codes`` giving the first four area digits.

    Returns:
        str: the 17-character base followed by the value returned by
        ``calculate_check_code`` for that base.

    Raises:
        KeyError: if ``area`` is not a key of ``area_codes``.
    """
    birth_year = date.today().year - age

    # Random birth month and day; the day stops at 28 so every month is valid.
    month = random.randint(1, 12)
    day = random.randint(1, 28)

    # Gender digit: even for female, odd otherwise.
    if gender == '女':
        parity_digits = [0, 2, 4, 6, 8]
    else:
        parity_digits = [1, 3, 5, 7, 9]
    sex_digit = random.choice(parity_digits)

    # Four digits from the table plus two random digits for the area code.
    district = area_codes[area] + "{:02d}".format(random.randint(0, 99))

    # Birth date as YYYYMMDD.
    birth = "{:04d}{:02d}{:02d}".format(birth_year, month, day)

    # Sequence code: two random digits followed by the gender digit.
    seq_first = random.randint(0, 9)
    seq_second = random.randint(0, 9)
    sequence = "{}{}{}".format(seq_first, seq_second, sex_digit)

    # First 17 characters, then the check character computed from them.
    first17 = "{}{}{}".format(district, birth, sequence)
    return "{}{}".format(first17, calculate_check_code(first17))

In [9]:
def calculate_check_code(id_card_base):
    """Compute the check character for the first 17 digits of an ID number.

    Each of the first 17 digits is multiplied by a fixed positional weight;
    the weighted sum modulo 11 indexes into a lookup string that yields the
    check character ('0'-'9' or 'X').

    Args:
        id_card_base: String whose first 17 characters are decimal digits.
            Characters beyond the 17th are ignored.

    Returns:
        str: the single check character.

    Raises:
        IndexError: if ``id_card_base`` has fewer than 17 characters.
        ValueError: if one of the first 17 characters is not a digit.
    """
    weights = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
    remainder_to_code = '10X98765432'

    weighted_total = 0
    for position, weight in enumerate(weights):
        weighted_total += int(id_card_base[position]) * weight

    return remainder_to_code[weighted_total % 11]


def generate_job_type():
    """Pick a job category according to a fixed probability table.

    Returns:
        str: one of the keys of ``distribution``.
    """
    # Job category -> probability; the probabilities add up to 1.
    distribution = {
        '无业': 0.15,
        '其他': 0.1043,
        '服务业': 0.099,
        '经商': 0.034,
        '务农': 0.034,
        '务工': 0.4427,
        '教育': 0.03,
        '医疗': 0.02,
        '建筑': 0.015,
        '科技': 0.02,
        '文化艺术': 0.015,
        '金融': 0.02,
        '运输': 0.016,
    }
    picked = np.random.choice(list(distribution.keys()), p=list(distribution.values()))
    # Convert the NumPy string scalar into a plain Python str.
    return str(picked)

In [13]:
def generate_data(size, max_year=2024):
    """Generate synthetic monthly presence records for ``size`` users.

    For every user a start (year, month), a stay duration and a set of
    personal attributes are drawn once. The stay is then expanded into one
    row per recorded month, with all personal attributes repeated on each
    row.

    After each recorded month a random gap of 0-3 months is skipped, so the
    records are not necessarily consecutive. The original code tried to do
    this by reassigning the ``for`` loop variable, which has no effect on
    iteration; a ``while`` loop is used so the gap is actually applied.

    Users whose sampled stay is 0 months, or whose start lies after
    ``max_year``, contribute no rows.

    Args:
        size: Number of users to simulate.
        max_year: Last calendar year for which rows are emitted; expansion
            of a user's stay stops at the first month beyond it.

    Returns:
        pandas.DataFrame with columns ``year, month, user_id, phone, name,
        gender, age, education, huji, id_number, address, job_type,
        user_type``.
    """
    columns = ['year', 'month', 'user_id', 'phone', 'name', 'gender',
               'age', 'education', 'huji', 'id_number', 'address', 'job_type', 'user_type']
    # Job types treated as fixed employment when deriving user_type.
    fixed_job_types = ['科技', '务工', '金融', '医疗', '教育']

    data = []
    for _ in range(size):
        start_year, start_month = generate_start_year_month()
        stay_years, stay_months = generate_stay_duration()
        user_id = random.randint(1000000000, 9999999999)
        phone = fake.phone_number()
        gender = generate_gender()
        name = generate_name(gender)
        age = generate_age()
        education = generate_education()
        huji = generate_huji()
        id_number = generate_id_card(age, gender, huji)
        address = hukou_location(huji)
        job_type = generate_job_type()
        user_type = '固定工作' if job_type in fixed_job_types else '无固定工作'
        total_months = stay_years * 12 + stay_months

        month = 0  # offset in months from the start month
        while month < total_months:
            # Convert the offset into a calendar (year, month in 1..12).
            current_year = start_year + (start_month + month - 1) // 12
            current_month = (start_month + month - 1) % 12 + 1
            if current_year > max_year:
                break
            data.append([
                current_year, current_month, user_id, phone, name, gender,
                age, education, huji, id_number, address, job_type, user_type
            ])
            # Advance to the next month plus a random gap of 0-3 months.
            month += 1 + random.randint(0, 3)

    return pd.DataFrame(data, columns=columns)


In [14]:
from ydata_profiling import ProfileReport

In [15]:
if __name__ == "__main__":
    # Seed every random source the generators draw from (stdlib random,
    # NumPy's global RNG and Faker's shared RNG) so that a fresh
    # "Restart & Run All" produces the same dataset and report.
    # NOTE: generate_id_card() derives birth years from today's date, so ID
    # numbers still shift when the calendar year changes.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)
    Faker.seed(SEED)

    # Build the synthetic user table and write an HTML profiling report.
    total_users = 5000
    df = generate_data(total_users)
    profile = ProfileReport(df, title="users Report")
    profile.to_file("users_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]