In [143]:
import pandas as pd
import numpy as np
import random

In [144]:
users = pd.DataFrame(
    columns=[
        "id",
        "createdAt",
        "birthDate",
        "city",
        "state",
        "country",
        "firstAccountType",
        "timeToOnboarding",
        "timeToFirstTransfer",
        "lastLoginDate",
    ]
)

accounts = pd.DataFrame(
    columns=[
        "id",
        "createdAt",
        "accountType",
        "balance",
        "status",
        "userId",
        "lastActivityDate",
        "lastTransferDate",
    ]
)

transfers = pd.DataFrame(
    columns=[
        "id",
        "amount",
        "accountType",
        "minute",
        "hour",
        "date",
        "createdAt",
        "scheduled",
        "status",
        "fromId",
        "toId",
    ]
)

# Utilizando um dataset para ter todas as cidades do brasil

In [145]:
cidades_brasil = pd.read_csv(
    "http://blog.mds.gov.br/redesuas/wp-content/uploads/2018/06/Lista_Munic%C3%ADpios_com_IBGE_Brasil_Versao_CSV.csv",
    sep=";",
    encoding="latin1",
)

cidades_brasil.drop(
    columns=["ConcatUF+Mun", "IBGE", "IBGE7", "Unnamed: 9"], inplace=True
)

populacao_total = cidades_brasil["População 2010"].sum()
cidades_brasil["Porcentagem da População"] = (
    cidades_brasil["População 2010"] / populacao_total
)
cidades_brasil.query("Município == 'Rancharia'")

Unnamed: 0,UF,Município,Região,População 2010,Porte,Capital,Porcentagem da População
3740,SP,Rancharia,Região Sudeste,28804.0,Pequeno II,,0.000151


In [146]:
def get_random_city():
    return (cidades_brasil.sample(1).iloc[0][["Município", "UF"]]).values

users[["city", "state"]] = [get_random_city() for _ in range(10000)]
users["country"] = "Brasil"

In [147]:
def gen_random_date():
    day = random.randint(1, 31)
    month = random.randint(1, 12)
    year = random.randint(1950, 2000)
    if month == 2 and day > 28:
        day = 28
    if month in [4, 6, 9, 11] and day > 30:
        day = 30
    if month == 2 and day == 29:
        if year % 4 != 0 or (year % 100 == 0 and year % 400 != 0):
            day = 28
    birth_date = pd.Timestamp(
        year=year, month=month, day=day, hour=0, minute=0, second=0
    )
    return birth_date 

In [148]:
users["birthDate"] = [gen_random_date() for _ in range(10000)]

In [149]:
def choose_account_type():
    weights = [0.6, 0.4]
    choice = random.choices(["Conta Corrente", "Conta Poupança"], weights=weights)
    return choice[0]

In [150]:
users["firstAccountType"] = [choose_account_type() for _ in range(10000)]

In [151]:
def gen_random_time_to_onboarding():
    return random.randint(60, 360)

In [152]:
users["timeToOnboarding"] = [gen_random_time_to_onboarding() for _ in range(10000)]

In [153]:
def gen_random_time_to_first_transfer():
    return random.randint(120, 1.21e6)

In [154]:
users["timeToFirstTransfer"] = [gen_random_time_to_first_transfer() for _ in range(10000)]

In [155]:
def gen_random_created_at():
    year = random.randint(2020, 2023)
    month = random.randint(1, 12)
    day = random.randint(1, 31)
    if month == 2 and day > 28:
        day = 28
    if month in [4, 6, 9, 11] and day > 30:
        day = 30
    if month == 2 and day == 29:
        if year % 4 != 0 or (year % 100 == 0 and year % 400 != 0):
            day = 28
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    created_at = pd.Timestamp(
        year=year,
        month=month,
        day=day,
        hour=hour,
        minute=minute,
        second=second,
    )
    return created_at

In [156]:
users["createdAt"] = [gen_random_created_at() for _ in range(10000)]

In [157]:
def gen_random_date_after(date):
    year = random.randint(date.year, 2023)
    month = random.randint(1, 12)
    day = random.randint(1, 31)
    if month == 2 and day > 28:
        day = 28
    if month in [4, 6, 9, 11] and day > 30:
        day = 30
    if month == 2 and day == 29:
        if year % 4 != 0 or (year % 100 == 0 and year % 400 != 0):
            day = 28
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    return pd.Timestamp(
        year=year,
        month=month,
        day=day,
        hour=hour,
        minute=minute,
        second=second,
    )

In [158]:
for i, user in users.iterrows():
    created_at = user["createdAt"]
    users.at[i, "lastLoginDate"] = gen_random_date_after(created_at)

In [159]:
users['id'] = range(1, 10001)

In [160]:
users.head()

Unnamed: 0,id,createdAt,birthDate,city,state,country,firstAccountType,timeToOnboarding,timeToFirstTransfer,lastLoginDate
0,1,2022-07-27 02:47:30,1970-06-16,Borda da Mata,MG,Brasil,Conta Corrente,272,129298,2022-09-20 08:13:39
1,2,2022-08-04 12:18:01,1999-09-06,Araguatins,TO,Brasil,Conta Corrente,146,519033,2022-11-08 09:43:30
2,3,2022-12-01 18:17:28,1971-03-29,Cariré,CE,Brasil,Conta Corrente,108,1054012,2022-05-29 23:50:42
3,4,2022-01-13 03:38:20,1959-08-05,Monção,MA,Brasil,Conta Poupança,63,379139,2023-09-06 17:07:54
4,5,2020-08-19 08:31:16,1998-02-08,Coronel Ezequiel,RN,Brasil,Conta Poupança,203,315439,2021-01-07 14:04:07


In [161]:
users.to_csv("users.csv", mode='w', index=False)

# Mocando contas

In [162]:
def gen_random_balance():
    return random.randint(100, 100000)

In [163]:
accounts['balance'] = [gen_random_balance() for _ in range(30000)]

In [164]:
accounts['accountType'] = [choose_account_type() for _ in range(30000)]

In [165]:
accounts['id'] = range(1, 30001)

In [166]:
accounts['userId'] = [random.randint(1, 10000) for _ in range(30000)]

In [167]:
def gen_random_status():
    weights = [0.96, 0.04]
    choice = random.choices(["Ativa", "Inativa"], weights=weights)
    return choice[0]

In [168]:
accounts['status'] = [gen_random_status() for _ in range(30000)]

In [169]:
accounts['createdAt'] = [gen_random_created_at() for _ in range(30000)]

In [170]:
for i, account in accounts.iterrows():
    created_at = account['createdAt']
    accounts.at[i, 'lastActivityDate'] = gen_random_date_after(created_at)
    last_transfer_date = gen_random_date_after(created_at)
    accounts.at[i, 'lastTransferDate'] = gen_random_date_after(last_transfer_date)

In [172]:
accounts.to_csv("accounts.csv", mode='w', index=False)