In [1]:
import numpy as np
import pandas as pd

from faker import Faker
from uuid import uuid4

fake = Faker("id_ID")

In [2]:
talents = pd.read_csv('../talents_cleaned.csv')

In [3]:
talents.head()

Unnamed: 0,talent_id,tags,talent_description
0,e52cd4d5-04cf-47df-bc6c-166ed47ca27f,Kpopers|Olahraga|Berjiwa petualang|Badminton|H...,"Saya sangat menyukai Kpop, girlband favorit sa..."
1,2f2ae1c2-8a7e-4d3b-9267-6518b48a2c8e,Seniman|Desain|Kreatif|Inovatif|Optimis,Saya adalah seorang seniman visual yang mencin...
2,7a9c6f0b-5bea-45e1-ae7f-f7211f6025a1,Kuliner|Traveling|Petualangan|Bersepeda|Kutu buku,Pecinta kuliner dan petualangan! Saya suka men...
3,9b17344b-df71-43f3-8dd7-b9c7c5cdcb44,Teknologi|Programmer|Inovatif|Analitis|Coding,Seorang pengembang perangkat lunak yang bersem...
4,4e2a575a-c8e8-4a1d-a2ad-42e2d1fc7e5d,Pengajar|Membaca|Ambisius|Visioner|Cerdas,Pengajar bahasa Inggris yang juga pencinta lit...


In [4]:
location_df = pd.read_csv("../data/location.csv", index_col="Unnamed: 0")

In [5]:
location_df.head()

Unnamed: 0,Provinsi,Kota/Kabupaten,Lokasi
0,Aceh,Banda Aceh,"Aceh, Banda Aceh"
1,Aceh,Langsa,"Aceh, Langsa"
2,Aceh,Lhokseumawe,"Aceh, Lhokseumawe"
3,Aceh,Sabang,"Aceh, Sabang"
4,Aceh,Subulussalam,"Aceh, Subulussalam"


In [6]:
list_lokasi = location_df["Lokasi"].values

## Impute missing columns

In [7]:
gender_choice = ['L', 'P']

In [8]:
name_df = pd.read_csv("../data/e_msmhs_1.csv", sep=";")

In [9]:
boys = name_df[name_df["GENDER"] == "L"]
girls = name_df[name_df["GENDER"] == "P"]

In [10]:
boys_name = []
for fullname in boys["FULLNAME"].str.split().to_numpy():
    for n in fullname:
        boys_name.append(n)

In [11]:
girls_name = []
for fullname in girls["FULLNAME"].str.split().to_numpy():
    for n in fullname:
        girls_name.append(n)

In [12]:
def generate_random_identity(minimum_age: int = 16, maximum_age: int = 45):
    girl = np.random.randint(2)

    if not girl:
        gender = gender_choice[girl]
        fullname = []
        for _ in range(np.random.randint(2, 5)):
            name = np.random.choice(boys_name)
            while name in fullname:
                name = np.random.choice(boys_name)
            fullname.append(name.capitalize())
        birth_date = fake.date_of_birth(
            minimum_age=minimum_age, maximum_age=maximum_age
        )
        age = 2023 - birth_date.year

        return (
            " ".join(fullname),
            gender,
            f"{birth_date.day}/{birth_date.month}/{birth_date.year}",
            age,
        )
    else:
        gender = gender_choice[girl]
        fullname = []
        for _ in range(np.random.randint(2, 5)):
            name = np.random.choice(girls_name)
            while name in fullname:
                name = np.random.choice(girls_name)
            fullname.append(name.capitalize())
        birth_date = fake.date_of_birth(
            minimum_age=minimum_age, maximum_age=maximum_age
        )
        age = 2023 - birth_date.year

        return (
            " ".join(fullname),
            gender,
            f"{birth_date.day}/{birth_date.month}/{birth_date.year}",
            age,
        )

In [13]:
columns = [
    "talent_name",
    "talent_gender",
    "talent_birth_date",
    "talent_age",
    "talent_location",
]

for col in columns:
    talents[col] = pd.Series([np.nan for _ in range(len(talents))])

In [14]:
talents.head()

Unnamed: 0,talent_id,tags,talent_description,talent_name,talent_gender,talent_birth_date,talent_age,talent_location
0,e52cd4d5-04cf-47df-bc6c-166ed47ca27f,Kpopers|Olahraga|Berjiwa petualang|Badminton|H...,"Saya sangat menyukai Kpop, girlband favorit sa...",,,,,
1,2f2ae1c2-8a7e-4d3b-9267-6518b48a2c8e,Seniman|Desain|Kreatif|Inovatif|Optimis,Saya adalah seorang seniman visual yang mencin...,,,,,
2,7a9c6f0b-5bea-45e1-ae7f-f7211f6025a1,Kuliner|Traveling|Petualangan|Bersepeda|Kutu buku,Pecinta kuliner dan petualangan! Saya suka men...,,,,,
3,9b17344b-df71-43f3-8dd7-b9c7c5cdcb44,Teknologi|Programmer|Inovatif|Analitis|Coding,Seorang pengembang perangkat lunak yang bersem...,,,,,
4,4e2a575a-c8e8-4a1d-a2ad-42e2d1fc7e5d,Pengajar|Membaca|Ambisius|Visioner|Cerdas,Pengajar bahasa Inggris yang juga pencinta lit...,,,,,


In [15]:
for i in range(len(talents)):
    id = uuid4()
    name, gender, birth_date, age = generate_random_identity()
    location = np.random.choice(list_lokasi)
    values = {
        "talent_name": name,
        "talent_gender": gender,
        "talent_birth_date": birth_date,
        "talent_age": age,
        "talent_location": location,
    }

    talents.loc[i, "talent_id"] = id

    for col in columns:
        if pd.isna(talents.loc[i, col]):
            talents.loc[i, col] = values[col]

In [16]:
talents["talent_age"] = talents["talent_age"].astype(np.int64)

In [17]:
talents = talents[
    [
        "talent_id",
        "talent_name",
        "talent_gender",
        "talent_birth_date",
        "talent_age",
        "talent_location",
        "tags",
        "talent_description",
    ]
]

In [18]:
talents.columns = [
    "talent_id",
    "talent_name",
    "talent_gender",
    "talent_birth_date",
    "talent_age",
    "talent_location",
    "talent_tags",
    "talent_description",
]

In [19]:
talents

Unnamed: 0,talent_id,talent_name,talent_gender,talent_birth_date,talent_age,talent_location,talent_tags,talent_description
0,fc0f6e4b-c397-40a4-a24c-d08b92aedc45,Pribadi Anwar Wicaksono,L,24/4/1992,31,"Banten, Cilegon",Kpopers|Olahraga|Berjiwa petualang|Badminton|H...,"Saya sangat menyukai Kpop, girlband favorit sa..."
1,674c14d8-ed13-471a-aabf-ceef4e5bc44b,Perdana Anshari,L,10/3/2005,18,"Kalimantan Utara, Tarakan",Seniman|Desain|Kreatif|Inovatif|Optimis,Saya adalah seorang seniman visual yang mencin...
2,f2427a83-eb72-47dd-aeaf-cf131b5cefa2,Dwi Nur Septasya Nahda,P,10/3/2002,21,"Sumatera Selatan, Pagaralam",Kuliner|Traveling|Petualangan|Bersepeda|Kutu buku,Pecinta kuliner dan petualangan! Saya suka men...
3,ac56a04d-aa98-4591-8e8c-2b188040c114,Asmara Tamara Liranda,P,31/3/1992,31,"Sumatera Barat, Lima Puluh Kota",Teknologi|Programmer|Inovatif|Analitis|Coding,Seorang pengembang perangkat lunak yang bersem...
4,d7a31753-7f7a-4d3a-80ec-bca26e578735,Adisyafitri Oky Septasya,P,12/7/1990,33,"Kalimantan Timur, Samarinda",Pengajar|Membaca|Ambisius|Visioner|Cerdas,Pengajar bahasa Inggris yang juga pencinta lit...
...,...,...,...,...,...,...,...,...
504,61601bd4-79b7-4399-b6b3-4bad407c2e06,Iqbal Syahnur Muhammad Septian,L,22/1/1999,24,"Maluku Utara, Tidore Kepulauan",Fashion|Model|Stylist|Perfeksionis|Kreatif,Seorang model yang juga memiliki keahlian seba...
505,7fe44c69-7944-4e56-a529-f6168a102f8c,Andri Alya'a Faustina Dewi,P,29/11/1989,34,"DKI Jakarta, Administrasi Jakarta Utara",Teknologi|Programmer|Coding|Cerdas|Kreatif,Seorang pengembang backend dengan fokus pada m...
506,c24fd25f-5784-4c9c-9703-b0076c83c110,Riansyah Arya Herdianto Priatna,L,30/11/1993,30,"Nusa Tenggara Barat, Mataram",Olahraga|Pengajar|Ramah|Ambisius|Petualangan,Seorang pemain golf berpengalaman yang juga me...
507,82be1c14-f2dd-43ed-87c4-bcb1e787be3d,Umar Ade Jeremy Sholeh,L,6/4/1991,32,"Sulawesi Selatan, Parepare",Seni|Ilustrator|Kreatif|Seniman|Ramah,Seorang ilustrator dan komikus yang menciptaka...


In [20]:
talents.to_csv("../data/talents.csv", index=False)

In [21]:
talents['talent_age'].value_counts()

36    25
33    24
21    23
30    23
22    22
38    22
28    21
45    20
40    19
42    19
39    18
26    18
20    18
27    17
32    17
44    17
18    17
37    17
31    16
41    16
24    15
16    14
25    14
29    12
43    12
17    11
23    11
35    11
34    10
19     9
46     1
Name: talent_age, dtype: int64