In [2]:
import os
import re
import string
import random
import pyodbc
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from datetime import datetime, timedelta
from vn_fullname_generator import generator
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
load_dotenv()

True

In [4]:
def get_db_connection(DB_NAME:str):
    conn = pyodbc.connect(
        f"DRIVER={os.getenv('DB_DRIVER')};"
        f"SERVER={os.getenv('DB_SERVER')};"
        f"DATABASE={DB_NAME};"
        f"UID={os.getenv('DB_UID')};"
        f"PWD={os.getenv('DB_PWD')};"
        f"TrustServerCertificate={os.getenv('TRUST_SERVER_CERTIFICATE')};"
    )
    
    return conn


In [5]:
def normalize_name(name):
    vietnamese_chars = {
        'à': 'a', 'á': 'a', 'ả': 'a', 'ã': 'a', 'ạ': 'a',
        'ă': 'a', 'ằ': 'a', 'ắ': 'a', 'ẳ': 'a', 'ẵ': 'a', 'ặ': 'a',
        'â': 'a', 'ầ': 'a', 'ấ': 'a', 'ẩ': 'a', 'ẫ': 'a', 'ậ': 'a',
        'đ': 'd',
        'è': 'e', 'é': 'e', 'ẻ': 'e', 'ẽ': 'e', 'ẹ': 'e',
        'ê': 'e', 'ề': 'e', 'ế': 'e', 'ể': 'e', 'ễ': 'e', 'ệ': 'e',
        'ì': 'i', 'í': 'i', 'ỉ': 'i', 'ĩ': 'i', 'ị': 'i',
        'ò': 'o', 'ó': 'o', 'ỏ': 'o', 'õ': 'o', 'ọ': 'o',
        'ô': 'o', 'ồ': 'o', 'ố': 'o', 'ổ': 'o', 'ỗ': 'o', 'ộ': 'o',
        'ơ': 'o', 'ờ': 'o', 'ớ': 'o', 'ở': 'o', 'ỡ': 'o', 'ợ': 'o',
        'ù': 'u', 'ú': 'u', 'ủ': 'u', 'ũ': 'u', 'ụ': 'u',
        'ư': 'u', 'ừ': 'u', 'ứ': 'u', 'ử': 'u', 'ữ': 'u', 'ự': 'u',
        'ỳ': 'y', 'ý': 'y', 'ỷ': 'y', 'ỹ': 'y', 'ỵ': 'y'
    }

    name = name.lower()
    for vn_char, latin_char in vietnamese_chars.items():
        name = name.replace(vn_char, latin_char)

    name = re.sub(r'[^a-z ]','', name)

    return name

def generate_email(name):
    normalized = normalize_name(name)
    words = normalized.split()

    email_format = random.choice([
        f'{words[-1]}.{words[0]}',
        f'{words[0]}.{words[-1]}',
        f'{words[0]}{words[-1]}',
        f'{words[-1]}{words[0]}',
        f'{words[0][0]}{words[-1]}',
        f'{words[-1]}{words[0][0]}',
        f'{words[0]}{random.randint(1, 999)}',
        f'{words[-1]}{random.randint(1, 999)}',
        f'{words[0]}_{random.randint(1, 999)}',
        f'{words[-1]}_{random.randint(1, 999)}',
    ])

    if random.random() < 0.3:
        email_format += str(random.randint(1,999))
    
    domain = random.choice(['gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com', 'icloud.com', 'gdscptit.dev', 'ptit.edu.vn', 'stu.ptit.edu.vn'])

    return f'{email_format}@{domain}'

def generate_phone_number():
    prefixes = ['032', '033', '034', '035', '036', '037', '038', '039', '096', '097', '098', '086',
                '088', '091', '094', '081', '082', '083', '084', '085',
                '070', '079', '077', '076', '078', '090', '092', '089'
                ]
    
    prefix = random.choice(prefixes)

    remaining = ''.join(str(random.randint(0,9)) for _ in range(7))

    return prefix + remaining

def generate_random_string(length=8, use_special=False):
    chars = string.ascii_letters + string.digits
    if use_special:
        safe_punctiations = '!@#$%^^&*_-'
        chars += safe_punctiations
    return ''.join(random.choices(chars, k=length))

def generate_random_date(start, end):
    start_date = datetime.strptime(start, "%Y-%m-%d")
    end_date = datetime.strptime(end, "%Y-%m-%d")

    random_days = random.randint(0, (end_date - start_date).days)

    random_date = start_date + timedelta(days=random_days)

    return random_date.strftime("%Y-%m-%d")

def generate_random_user_status():
    status = ['active', 'inactive', 'banned']
    probability = [0.7, 0.2, 0.1]
    return random.choices(status, weights=probability, k=1)[0]

In [6]:
def fetch_addresses():
    address_db = os.getenv('DB_ADDRESS')
    conn = get_db_connection(address_db)
    df = pd.read_sql("SELECT * FROM address", conn)
    conn.close()
    return df

def generate_random_address(address_df):
    row = address_df.sample(1).iloc[0]
    ward = row['wards']
    district = row['districts']
    province = row['provinces']

    return f"{ward}, {district}, {province}"

In [7]:
email_used = set()


def generate_account(id):
    try:
        username = generate_random_string(length=random.randint(5, 10))
        password = generate_random_string(length=random.randint(8, 12), use_special=True)

        gender = random.randint(0, 1)
        name = generator.generate(gender)
        
        while True:
            email = generate_email(name)
            if email not in email_used:
                email_used.add(email)
                break

        status = generate_random_user_status()

        create_at = generate_random_date("2023-01-01", datetime.now().strftime("%Y-%m-%d"))
        update_at = generate_random_date(create_at, datetime.now().strftime("%Y-%m-%d"))


        return {
            'id': id,
            'username': username,
            'password': password,
            'name': name,
            'email': email,
            'status': status,
            'created_at': create_at,
            'updated_at': update_at
        }
    
        
    
    except Exception as e:
        print('Error generating user: ', e)
        return None


In [8]:
def generate_accounts(n, max_workers=5):
    data = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(generate_account, i) for i in range(n)]
        for f in tqdm (as_completed(futures), total=n, desc="Generating account rows", unit="record"):
            result = f.result()
            if result:
                data.append(result)
    
    return pd.DataFrame(data)

In [9]:
num_rows = 10**6
max_thread = 16

account_df = generate_accounts(n=num_rows, max_workers=max_thread)

print('Account DataFrame Completed')

print('Saving to account.csv')

account_df.to_csv('./data/account.csv', index=False, encoding='utf-8-sig')

print('CSV file saved successfully.')
print('Done.')

Generating account rows: 100%|██████████| 1000000/1000000 [10:11<00:00, 1634.40record/s]


Account DataFrame Completed
Saving to account.csv
CSV file saved successfully.
Done.


In [10]:
account_df = pd.read_csv('./data/account.csv')
account_df.head()

Unnamed: 0,id,username,password,name,email,status,created_at,updated_at
0,1354,j0NJM,HgQkH^NT0C,Phan Thục Quyên,quyen.phan@icloud.com,active,2024-05-10,2024-09-14
1,846,GPh6Zj3,9kcjDQzsksZm,Dương Việt Phương,phuong.duong@gdscptit.dev,active,2024-12-29,2025-01-26
2,1353,MpbIOL,%UDxEl@#,Phan Uyên Thơ,thophan294@outlook.com,active,2024-08-28,2025-03-18
3,845,ktwBL,iI9Rt5nHz,Võ Thường Xuân,xuan685@stu.ptit.edu.vn,banned,2025-01-04,2025-01-14
4,1352,d8SgoQFa,upFnD^A@Ja@R,Trương Hồng Khanh,khanhtruong@ptit.edu.vn,active,2023-12-03,2023-12-03


customer(id, account_id, phone_number, address)

manager(id, role_id, account_id)


In [11]:
sampled_ids = account_df.sample(n=10**6, replace=False).index.tolist()

manager_ids = sampled_ids[:200]
customer_ids = sampled_ids[200:]

In [12]:
role_df = pd.read_csv('./data/role.csv', encoding='utf-8')
role_df

Unnamed: 0,id,name,status
0,1,admin,active
1,2,product_manager,active
2,3,service_customer,active


In [13]:
def generate_customer(id, account_id, address_df):
    
    phone = generate_phone_number()
    address = generate_random_address(address_df)

    return {
        'id': id,
        'account_id': account_id,
        'phone_number': phone,
        'address': address,
    }

def generate_manager(id, account_id):
    return {
        'id': id,
        'role_id': role_df.sample(1).index[0],
        'account_id': account_id
    }

In [14]:
def generate_customers(address_df, max_workers=5):
    customers = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(generate_customer, i, account_id, address_df) for i, account_id in enumerate(customer_ids)]
        
        for f in tqdm(as_completed(futures), total=len(customer_ids), desc="Generate customer data", unit="record"):
            result = f.result()
            if result:
                customers.append(result)
                
    return pd.DataFrame(customers)

def generate_managers(max_workers=5):
    managers = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(generate_manager, i, account_id) for i, account_id in enumerate(manager_ids)]
        
        for f in tqdm(as_completed(futures), total=len(manager_ids), desc="Generate customer data", unit="record"):
            result = f.result()
            if result:
                managers.append(result)
                
    return pd.DataFrame(managers)


In [15]:
print('Fetching address data...')
address_df = fetch_addresses()

Fetching address data...


  df = pd.read_sql("SELECT * FROM address", conn)


In [16]:
address_df

Unnamed: 0,id,wards,districts,provinces
0,0,Phường Quang Trung,Thành phố Hà Giang,Tỉnh Hà Giang
1,1,Phường Trần Phú,Thành phố Hà Giang,Tỉnh Hà Giang
2,2,Phường Ngọc Hà,Thành phố Hà Giang,Tỉnh Hà Giang
3,3,Phường Nguyễn Trãi,Thành phố Hà Giang,Tỉnh Hà Giang
4,4,Phường Minh Khai,Thành phố Hà Giang,Tỉnh Hà Giang
...,...,...,...,...
10030,10030,Xã Thổ Châu,Thành phố Phú Quốc,Tỉnh Kiên Giang
10031,10031,Xã Hòn Tre,Huyện Kiên Hải,Tỉnh Kiên Giang
10032,10032,Xã Lại Sơn,Huyện Kiên Hải,Tỉnh Kiên Giang
10033,10033,Xã An Sơn,Huyện Kiên Hải,Tỉnh Kiên Giang


In [17]:
print('Generate customer data...')
customer_df = generate_customers(address_df=address_df, max_workers=max_thread)

Generate customer data...


Generate customer data: 100%|██████████| 999800/999800 [17:39<00:00, 943.69record/s]  


In [18]:
customer_df

Unnamed: 0,id,account_id,phone_number,address
0,18736,193672,0378034764,"Xã Hòa Bình, Huyện Trà Ôn, Tỉnh Vĩnh Long"
1,18735,402080,0916578252,"Xã Dân Lực, Huyện Triệu Sơn, Tỉnh Thanh Hóa"
2,18734,735632,0884272165,"Xã Tân Kỳ, Huyện Tứ Kỳ, Tỉnh Hải Dương"
3,18733,207896,0894017140,"Xã Vĩnh Hòa Hiệp, Huyện Châu Thành, Tỉnh Kiên ..."
4,18732,660820,0823531839,"Phường Bình Khánh, Thành phố Long Xuyên, Tỉnh ..."
...,...,...,...,...
999795,999796,862205,0949525910,"Xã Nga Thái, Huyện Nga Sơn, Tỉnh Thanh Hóa"
999796,999797,566884,0378096235,"Xã Hy Cương, Thành phố Việt Trì, Tỉnh Phú Thọ"
999797,999798,722650,0824754489,"Xã Linh Thông, Huyện Định Hóa, Tỉnh Thái Nguyên"
999798,999793,947834,0985604153,"Xã Định Thành A, Huyện Đông Hải, Tỉnh Bạc Liêu"


In [19]:
print('Generate manager data...')
manager_df = generate_managers(max_workers=max_thread)
print('Customer DataFrame Completed')

Generate manager data...


Generate customer data: 100%|██████████| 200/200 [00:00<00:00, 42302.61record/s]

Customer DataFrame Completed



