In [547]:
import sys
import os
import pandas as pd
import numpy as np
from bisect import bisect_left
from dataclasses import dataclass
import datetime
import geopy.distance
import json
from io import BytesIO

# Parent directory of the current working directory. It is appended to sys.path,
# presumably so that the `pyadps` package imported in a later cell resolves from there.
PYADPS_PATH = os.path.dirname(os.getcwd())
sys.path.append(PYADPS_PATH)

# Static input files, addressed relative to PYADPS_PATH.
WORLDCITIES_PATH = PYADPS_PATH + '/pyadps/static_files/worldcities/worldcities.csv'
FIRST_NAMES_PATH = PYADPS_PATH + '/pyadps/static_files/name_databases/all.txt'

# One seeded generator shared (as a global) by every sampling helper in this notebook,
# so the order in which cells draw from it determines the generated data.
SEED = 12345
rng = np.random.default_rng(SEED)

In [463]:
from pyadps.mail import Mail, CoordsData, FileAttachment
from pyadps.helpers import calculate_hashsum

In [3]:
# Load the cities table. Later cells read its `population`, `iso2`, `city_ascii`,
# `lat` and `lng` columns.
# NOTE(review): read_csv uses pandas' default NA parsing here, so a literal country code
# such as "NA" would presumably be read as NaN instead of a string — confirm against the file.
df = pd.read_csv(WORLDCITIES_PATH)
df

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37977000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,34540000.0,1360771077
2,Delhi,Delhi,28.6600,77.2300,India,IN,IND,Delhi,admin,29617000.0,1356872604
3,Mumbai,Mumbai,18.9667,72.8333,India,IN,IND,Mahārāshtra,admin,23355000.0,1356226629
4,Manila,Manila,14.6000,120.9833,Philippines,PH,PHL,Manila,primary,23088000.0,1608618140
...,...,...,...,...,...,...,...,...,...,...,...
40996,Tukchi,Tukchi,57.3670,139.5000,Russia,RU,RUS,Khabarovskiy Kray,,10.0,1643472801
40997,Numto,Numto,63.6667,71.3333,Russia,RU,RUS,Khanty-Mansiyskiy Avtonomnyy Okrug-Yugra,,10.0,1643985006
40998,Nord,Nord,81.7166,-17.8000,Greenland,GL,GRL,Sermersooq,,10.0,1304217709
40999,Timmiarmiut,Timmiarmiut,62.5333,-42.2167,Greenland,GL,GRL,Kujalleq,,10.0,1304206491


In [4]:
# Country that receives the bulk of the generated mail, and the share it gets.
target_country_code_iso2 = 'RU'
target_country_part = 0.8
rest_part = 1 - target_country_part

# Total number of mails to generate.
mail_number = 50000
target_country_mail_number = round(target_country_part * mail_number)
# Derive the remainder by subtraction instead of rounding `rest_part * mail_number`
# separately: two independent roundings can add up to mail_number +/- 1, and the final
# zip() over the per-mail lists would then silently truncate to the shortest list.
other_countries_mail_number = mail_number - target_country_mail_number

# Split the cities table by country code.
target_country_df = df.loc[df['iso2'] == target_country_code_iso2]
other_countries_df = df.loc[df['iso2'] != target_country_code_iso2]

# Total population of each split (only displayed for inspection in a later cell).
target_country_population = target_country_df['population'].sum()
other_countries_population = other_countries_df['population'].sum()

In [5]:
target_country_df

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
14,Moscow,Moscow,55.7558,37.6178,Russia,RU,RUS,Moskva,primary,17125000.0,1643318494
131,Saint Petersburg,Saint Petersburg,59.9500,30.3167,Russia,RU,RUS,Sankt-Peterburg,admin,5351935.0,1643616350
466,Novosibirsk,Novosibirsk,55.0333,82.9167,Russia,RU,RUS,Novosibirskaya Oblast’,admin,1602915.0,1643399240
505,Yekaterinburg,Yekaterinburg,56.8356,60.6128,Russia,RU,RUS,Sverdlovskaya Oblast’,admin,1468833.0,1643582706
570,Nizhniy Novgorod,Nizhniy Novgorod,56.3269,44.0075,Russia,RU,RUS,Nizhegorodskaya Oblast’,admin,1264075.0,1643012126
...,...,...,...,...,...,...,...,...,...,...,...
40994,Starorybnoye,Starorybnoye,72.7666,104.8000,Russia,RU,RUS,Krasnoyarskiy Kray,,10.0,1643724242
40995,Agapa,Agapa,71.4504,89.2500,Russia,RU,RUS,Krasnoyarskiy Kray,,10.0,1643009087
40996,Tukchi,Tukchi,57.3670,139.5000,Russia,RU,RUS,Khabarovskiy Kray,,10.0,1643472801
40997,Numto,Numto,63.6667,71.3333,Russia,RU,RUS,Khanty-Mansiyskiy Avtonomnyy Okrug-Yugra,,10.0,1643985006


In [6]:
print(target_country_population, other_countries_population)

108733755.0 4391117396.440001


In [7]:
def get_cumulative_list(df_):
    """Return the running total of the ``population`` column as a plain list.

    Element ``i`` is the sum of the populations of rows ``0..i`` in the frame's
    current row order. Missing (NaN) populations count as 0, so such a row simply
    repeats the previous total.

    Args:
        df_: DataFrame with a numeric ``population`` column.

    Returns:
        A list with one number per row of ``df_`` (empty for an empty frame).
    """
    # Vectorised on purpose: choose_city_by_random_value calls this once per sampled
    # city, so a per-row Python loop here dominates the notebook's run time.
    return df_['population'].fillna(0).cumsum().tolist()

target_country_cumulative_list = get_cumulative_list(target_country_df)
other_countries_cumulative_list = get_cumulative_list(other_countries_df)

In [8]:
target_country_cumulative_list

[17125000.0,
 22476935.0,
 24079850.0,
 25548683.0,
 26812758.0,
 28056258.0,
 29258629.0,
 30437020.0,
 31606739.0,
 32732038.0,
 33847598.0,
 34931463.0,
 35979012.0,
 37027017.0,
 38042603.0,
 38924079.0,
 39769379.0,
 40513933.0,
 41221341.0,
 41867618.0,
 42500919.0,
 43125437.0,
 43749173.0,
 44365415.0,
 44973494.0,
 45580083.0,
 46173059.0,
 46745799.0,
 47310242.0,
 47867162.0,
 48419267.0,
 48956889.0,
 49489393.0,
 50019190.0,
 50542916.0,
 51053355.0,
 51554823.0,
 52044321.0,
 52529542.0,
 53004598.0,
 53455369.0,
 53904432.0,
 54338009.0,
 54769931.0,
 55189294.0,
 55607535.0,
 56019059.0,
 56425992.0,
 56832545.0,
 57223680.0,
 57584270.0,
 57940438.0,
 58296131.0,
 58647619.0,
 58994707.0,
 59336599.0,
 59666452.0,
 59992507.0,
 60314549.0,
 60633405.0,
 60946417.0,
 61261206.0,
 61572831.0,
 61879809.0,
 62187720.0,
 62487380.0,
 62785476.0,
 63075841.0,
 63367528.0,
 63647761.0,
 63926312.0,
 64203960.0,
 64481628.0,
 64752402.0,
 65019077.0,
 65284239.0,
 65549229.0,

In [296]:
@dataclass
class City:
    """A sampled city: its name, country code and coordinates."""
    name: str  # taken from the `city_ascii` column by choose_city_by_random_value
    country_code: str  # taken from the `iso2` column
    coords: CoordsData  # built from the `lat` and `lng` columns, in that order



def choose_city_by_random_value(
    value: float, 
    target_df,
) -> City:
    """Pick a city from ``target_df`` with probability proportional to its population.

    ``value`` is mapped onto the cumulative population of the frame: the chosen row is
    the first one whose running population total reaches ``value * total``.

    Args:
        value: float from 0 to 1 (typically a fresh ``rng.random()`` draw).
        target_df: cities frame with ``population``, ``city_ascii``, ``iso2``,
            ``lat`` and ``lng`` columns.

    Returns:
        The selected ``City``.

    Raises:
        IndexError: if ``target_df`` has no rows.
    """
    cumulative_list = get_cumulative_list(target_df)
    if not cumulative_list:
        raise IndexError('target_df has no rows to choose a city from')

    # Use the last running total as "the" total. A separately computed .sum() may differ
    # from it in the last floating-point bits, which for `value` close to 1 could push
    # the bisect position past the end of the list.
    people_num = value * cumulative_list[-1]
    # Clamp as a second line of defence (e.g. value slightly above 1).
    city_idx = min(bisect_left(cumulative_list, people_num), len(cumulative_list) - 1)
    # Positional one-row slice; .tolist() below yields plain Python values.
    city_row = target_df.iloc[city_idx:city_idx + 1]

    return City(
        name=city_row['city_ascii'].tolist()[0], 
        country_code=city_row['iso2'].tolist()[0], 
        coords=CoordsData(city_row['lat'].tolist()[0], city_row['lng'].tolist()[0]))


In [10]:
people_num = 0.5*target_country_population
city_idx = bisect_left(target_country_cumulative_list, people_num)
city_row = target_country_df[city_idx:city_idx+1]

In [11]:
city_row['city_ascii'].tolist()

['Ulan-Ude']

In [12]:
choose_city_by_random_value(rng.random(), df)

City(name='Yantai', country_code='CN', coords=CoordsData(lat=37.3997, lon=121.2664))

In [13]:
# Sample one population-weighted city per mail: first all target-country mails, then all
# other-country mails. Both comprehensions draw from the shared `rng`, so their order fixes
# the generated data.
# NOTE(review): every choose_city_by_random_value call rebuilds the cumulative population
# list of the whole frame, so this cell is slow for large frames.
target_country_cities = [
    choose_city_by_random_value(rng.random(), target_country_df) 
    for _ in range(target_country_mail_number)
]

other_countries_cities = [
    choose_city_by_random_value(rng.random(), other_countries_df) 
    for _ in range(other_countries_mail_number)
]

In [14]:
print(len(target_country_cities), len(other_countries_cities))

40000 10000


In [15]:
other_countries_cities

[City(name='Ho Chi Minh City', country_code='VN', coords=CoordsData(lat=10.8167, lon=106.6333)),
 City(name='Zhanjiang', country_code='CN', coords=CoordsData(lat=21.1967, lon=110.4031)),
 City(name='Bouar', country_code='CF', coords=CoordsData(lat=5.95, lon=15.6)),
 City(name='Perth', country_code='AU', coords=CoordsData(lat=-31.9522, lon=115.8589)),
 City(name='Zhaoqing', country_code='CN', coords=CoordsData(lat=23.05, lon=112.4667)),
 City(name='Lahore', country_code='PK', coords=CoordsData(lat=31.5497, lon=74.3436)),
 City(name='Songnam', country_code='KR', coords=CoordsData(lat=37.4386, lon=127.1378)),
 City(name='Dazhou', country_code='CN', coords=CoordsData(lat=31.2152, lon=107.4947)),
 City(name='Huaihua', country_code='CN', coords=CoordsData(lat=27.5494, lon=109.9592)),
 City(name='Huanggang', country_code='CN', coords=CoordsData(lat=30.45, lon=114.875)),
 City(name='Padang', country_code='ID', coords=CoordsData(lat=-0.9556, lon=100.3606)),
 City(name='Miami', country_code='US'

In [16]:
def get_file_sizes_bytes(number_of_files: int, sum_bytes: int) -> list:
    """Split ``sum_bytes`` into ``number_of_files`` random pieces, as cumulative boundaries.

    Despite the name, the returned values are *running end offsets*, not individual
    sizes: the list is non-decreasing and its last element is ``sum_bytes`` scaled back
    from the random total. Take ``np.diff([0, *result])`` to obtain per-file sizes.
    Because boundaries are rounded independently, an individual piece can be 0 bytes.

    Args:
        number_of_files: how many pieces to produce.
        sum_bytes: total number of bytes to distribute.

    Returns:
        A list of ``number_of_files`` integer boundaries; an empty list when
        ``number_of_files`` is not positive.
    """
    # Nothing to split: without this guard the [-1] lookup below raises IndexError.
    if number_of_files <= 0:
        return []

    # One uniform draw per file; their running sum gives unscaled boundaries.
    sizes_rnd = [rng.random() for _ in range(number_of_files)]
    sizes_cum_sum = np.cumsum(sizes_rnd)
    # Scale so that the final boundary lands on sum_bytes.
    rate_coef = sum_bytes / sizes_cum_sum[-1]
    boundaries_byte_numbers = [round(rate_coef * el) for el in sizes_cum_sum]
    return boundaries_byte_numbers

In [17]:
get_file_sizes_bytes(100000, 1 * 1024 * 1024 * 1024)

[3876,
 5941,
 9070,
 16400,
 33536,
 35133,
 44505,
 46711,
 62006,
 64295,
 72249,
 85127,
 100495,
 104922,
 124415,
 139591,
 148600,
 158647,
 169910,
 177943,
 180406,
 197528,
 213762,
 233509,
 236361,
 245549,
 253390,
 261685,
 273553,
 284540,
 291446,
 305029,
 324530,
 340908,
 341896,
 344295,
 356325,
 367208,
 370007,
 382987,
 386196,
 396250,
 402583,
 404075,
 413274,
 425421,
 435732,
 444494,
 451986,
 470265,
 472748,
 487473,
 506021,
 512070,
 513927,
 531817,
 546726,
 551946,
 554990,
 558933,
 570583,
 590845,
 593652,
 596423,
 604454,
 613143,
 615236,
 627943,
 640944,
 647145,
 650324,
 654109,
 664292,
 684245,
 690369,
 711828,
 718828,
 722017,
 732274,
 745607,
 765942,
 767279,
 777622,
 789569,
 806434,
 821735,
 828035,
 840632,
 858063,
 862662,
 882724,
 885999,
 896107,
 915037,
 918979,
 926862,
 931991,
 937429,
 955422,
 971479,
 989860,
 993417,
 1008348,
 1009250,
 1022557,
 1022799,
 1036008,
 1037710,
 1043585,
 1061405,
 1081960,
 110114

In [18]:
# 15 "big" files: 1 x 1 GiB, 2 x 512 MiB, 4 x 256 MiB and 8 x 128 MiB.
big_files_sizes = [1024**3, *([512*1024**2]*2), *([256*1024**2]*4), *([128*1024**2]*8)]
# The remaining files (100000 in total minus the big ones) share 1 GiB between them;
# get_file_sizes_bytes returns cumulative boundaries, so np.diff turns them into sizes.
short_files_sizes = list(np.diff([0, *get_file_sizes_bytes(100000 - len(big_files_sizes), 1 * 1024 * 1024 * 1024)]))

In [19]:
def choose_attachments_for_mail(big_attachments: list, short_attachments: list) -> list:
    """Randomly decide which attachments one mail carries.

    A single uniform draw selects one of the following mixes (big ones listed first
    in the result):

        5%  no attachment            70%  1 short
        5%  1 big                     5%  2 short
        2%  2 big                     2%  3 short
        2%  2 big + 1 short           2%  4 short
        1%  2 big + 2 short           1%  5 short
        5%  1 big + 1 short

    Args:
        big_attachments: pool to draw "big" attachments from.
        short_attachments: pool to draw "short" attachments from.

    Returns:
        A list of elements picked from the two pools via the global ``rng``.
        NOTE(review): with a plain numpy generator, multi-element picks are presumably
        made with replacement, so the same attachment may appear twice — confirm.
    """
    def draw(pool: list, count: int) -> list:
        # A single pick uses the scalar form of rng.choice; several picks use the
        # sized form. Zero picks must not touch the generator at all.
        if count == 0:
            return []
        if count == 1:
            return [rng.choice(pool)]
        return list(rng.choice(pool, count))

    # (exclusive upper bound of the bucket, number of big picks, number of short picks)
    buckets = (
        (0.05, 0, 0),
        (0.10, 1, 0),
        (0.12, 2, 0),
        (0.14, 2, 1),
        (0.15, 2, 2),
        (0.20, 1, 1),
        (0.90, 0, 1),
        (0.95, 0, 2),
        (0.97, 0, 3),
        (0.99, 0, 4),
    )

    random_value = rng.random()
    if random_value >= 0:
        for upper_bound, big_count, short_count in buckets:
            if random_value < upper_bound:
                return [*draw(big_attachments, big_count), *draw(short_attachments, short_count)]

    # Everything that matched no bucket: five short attachments.
    return list(rng.choice(short_attachments, 5))

In [20]:
choose_attachments_for_mail(big_files_sizes, short_files_sizes)

[12655]

In [387]:
print(len(big_files_sizes), len(short_files_sizes))

15 99985


In [21]:
def generate_random_datetime(start, end):
    """Return ``start`` shifted forward by a random whole number of seconds.

    The offset is drawn with ``rng.integers(0, span)`` where ``span`` is the number of
    whole seconds between ``start`` and ``end``.
    """
    span_seconds = int((end - start).total_seconds())
    offset_seconds = int(rng.integers(0, span_seconds))
    return start + datetime.timedelta(seconds=offset_seconds)

# Window from which mail creation timestamps are sampled (passed to
# generate_random_datetime as `start` and `end`).
DATETIME_FROM = datetime.datetime(2021, 1, 1)
DATETIME_TO = datetime.datetime(2022, 1, 1)

In [22]:
generate_random_datetime(DATETIME_FROM, DATETIME_TO)

datetime.datetime(2021, 4, 13, 0, 4, 3)

In [31]:
# Load the name list: the file is read as a header-less CSV and only its first column is used.
# NOTE(review): read_csv applies pandas' default NA parsing, so an entry spelled like "NA"
# or "null" would presumably come back as NaN rather than a string — confirm against the file.
names_df = pd.read_csv(FIRST_NAMES_PATH, header=None)
all_names = list(names_df[0])

In [38]:
def generate_name(all_names_: list) -> str:
    """Generate a random sender/recipient identifier.

    One uniform draw picks the format:
      * 60%: an e-mail-like string ``<name>@<name>.com`` (both parts lower-cased),
      * 35%: ``+`` followed by ten random decimal digits,
      * 5%:  two names separated by a space.

    Args:
        all_names_: pool of names to pick from via the global ``rng``.
    """
    roll = rng.random()

    if 0 <= roll < 0.6:
        local_part = rng.choice(all_names_).lower()
        domain = rng.choice(all_names_).lower()
        return f'{local_part}@{domain}.com'

    if 0.6 <= roll < 0.95:
        digits = (str(rng.integers(0, 10)) for _ in range(10))
        return '+' + ''.join(digits)

    first = rng.choice(all_names_)
    second = rng.choice(all_names_)
    return first + ' ' + second

In [220]:
generate_name(all_names)

'valeriy@louazna.com'

In [222]:
def generate_additional_notes(all_names_: list):
    """Return two random names joined by a space in 80% of calls, otherwise ``None``."""
    roll = rng.random()
    if not 0 <= roll < 0.8:
        return None

    first = rng.choice(all_names_)
    second = rng.choice(all_names_)
    return first + ' ' + second

In [268]:
generate_additional_notes(all_names)

'Abenaura Humbert'

In [270]:
def generate_inline_message(all_names_: list):
    """Return a random space-separated run of lower-cased names, or ``None``.

    In 80% of calls the message consists of ``rng.integers(1, 40)`` names picked from
    ``all_names_``; in the remaining calls there is no message.
    """
    roll = rng.random()
    if not 0 <= roll < 0.8:
        return None

    # Draw the length first, then the words, one rng.choice call per word.
    word_count = rng.integers(1, 40)
    words = [rng.choice(all_names_).lower() for _ in range(word_count)]
    return ' '.join(words)

In [288]:
generate_inline_message(all_names)

'fernando cristian samoil modou nely zelia hien soad gancho luka marielle danica ciprian deseado neco matus xiaoling orfelina dorthea victorias xumiao popa natashia bernabea asahel elenora jagoba anke tomasz jordi odessa'

In [295]:
# The 100 most populous cities of the whole table; passed to get_list_of_cities as the
# pool for the extra recipient cities of a mail.
top_100_cities_df = df.sort_values(by=['population'], ascending=False).head(100)
top_100_cities_df

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37977000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,34540000.0,1360771077
2,Delhi,Delhi,28.6600,77.2300,India,IN,IND,Delhi,admin,29617000.0,1356872604
3,Mumbai,Mumbai,18.9667,72.8333,India,IN,IND,Mahārāshtra,admin,23355000.0,1356226629
4,Manila,Manila,14.6000,120.9833,Philippines,PH,PHL,Manila,primary,23088000.0,1608618140
...,...,...,...,...,...,...,...,...,...,...,...
95,Kunming,Kunming,25.0433,102.7061,China,CN,CHN,Yunnan,admin,6250000.0,1156477539
96,Nanchong,Nanchong,30.7991,106.0784,China,CN,CHN,Sichuan,minor,6183000.0,1156762337
97,Zunyi,Zunyi,27.7050,106.9336,China,CN,CHN,Guizhou,,6127009.0,1156539782
99,Lu’an,Lu'an,31.7542,116.5078,China,CN,CHN,Anhui,minor,6090000.0,1156499624


In [333]:
def add_error_to_coords(lat: float, lon: float, min_error_meters=100, max_error_meters=20_000) -> tuple:
    """Move a coordinate by a random distance in a random direction.

    Args:
        lat: latitude of the starting point.
        lon: longitude of the starting point.
        min_error_meters: lower bound of the random displacement.
        max_error_meters: upper bound of the random displacement.

    Returns:
        ``(lat, lon)`` of the displaced point, as returned by geopy's geodesic
        ``destination``.
    """
    # Draw order matters for reproducibility: bearing first, then distance.
    bearing_deg = 360 * rng.random()
    error_span = max_error_meters - min_error_meters
    offset_meters = min_error_meters + error_span * rng.random()

    # The distance is handed over divided by 1000 — presumably metres to the
    # kilometres geopy expects; confirm against the geopy documentation.
    shifted_lat, shifted_lon, _altitude = geopy.distance.geodesic().destination(
        (lat, lon), bearing_deg, offset_meters / 1000
    )
    return shifted_lat, shifted_lon
    

def get_list_of_cities(cities, top_100_cities_df_):
    """Build the recipient coordinate list for every mail.

    Each input city becomes the first recipient of one mail. With probability 5% / 3% / 2%
    the mail additionally gets 1 / 2 / 3 population-weighted cities sampled from
    ``top_100_cities_df_``; the remaining 90% of mails have a single recipient.

    Args:
        cities: iterable of ``City`` objects, one per mail.
        top_100_cities_df_: cities frame the additional recipients are sampled from.

    Returns:
        A list with one entry per input city; each entry is a list of ``(lat, lon)``
        tuples, the original city first.
    """
    result = []
    for city in cities:
        random_value = rng.random()
        if 0 <= random_value < 0.9:
            additional_cities = 0
        elif 0.9 <= random_value < 0.95:
            additional_cities = 1
        elif 0.95 <= random_value < 0.98:
            additional_cities = 2
        else:
            additional_cities = 3

        filtered_cities = [city]
        for _ in range(additional_cities):
            # Sample from the frame that was passed in. Reading the notebook-level
            # `top_100_cities_df` here would ignore the argument and fail whenever
            # that global is not defined.
            filtered_cities.append(choose_city_by_random_value(rng.random(), top_100_cities_df_))

        result.append([
            (filtered_city.coords.lat, filtered_city.coords.lon)
            for filtered_city in filtered_cities
        ])

    return result

In [334]:
add_error_to_coords(other_countries_cities[0].coords.lat, other_countries_cities[0].coords.lon)

(10.787182970792903, 106.60824751298813)

In [340]:
# One recipient-coordinate list per mail: target-country mails first, then the others.
list_of_cities = get_list_of_cities([*target_country_cities, *other_countries_cities], top_100_cities_df)

In [342]:
list_of_cities[:15]

[[(56.0167, 92.8667)],
 [(50.9167, 128.4833)],
 [(54.9, 52.3)],
 [(54.3167, 48.3667)],
 [(58.0139, 56.2489)],
 [(43.4833, 43.6167), (38.3037, 116.8452), (17.3667, 78.4667)],
 [(59.95, 30.3167)],
 [(55.8, 38.45)],
 [(54.7667, 20.6)],
 [(55.7908, 49.1144)],
 [(54.95, 20.4833)],
 [(56.4, 61.9333)],
 [(55.7558, 37.6178)],
 [(53.75, 87.1167)],
 [(58.05, 65.2667)]]

In [343]:
len(list_of_cities)

50000

In [464]:
def create_attachment(adps_attachments_path_: str, size_bytes: int) -> FileAttachment:
    """Write a file of random bytes and describe it as a ``FileAttachment``.

    The bytes are first written to ``tmp.bin`` inside the target folder, hashed with
    ``calculate_hashsum``, and the file is then renamed to the first ten characters of
    the hex digest plus ``.bin``.

    Args:
        adps_attachments_path_: folder in which the file is created.
        size_bytes: number of random bytes to generate via the global ``rng``.

    Returns:
        ``FileAttachment`` built from the final file name and the size and hex digest
        reported by ``calculate_hashsum``.
    """
    scratch_path = f'{adps_attachments_path_}/tmp.bin'

    with open(scratch_path, 'wb') as sink:
        sink.write(rng.bytes(size_bytes))

    with open(scratch_path, 'rb') as source:
        digest_info = calculate_hashsum(source)

    # The file name is derived from the content hash.
    hex_digest = digest_info.hex_digest
    final_name = f'{hex_digest[:10]}.bin'
    os.rename(scratch_path, f'{adps_attachments_path_}/{final_name}')

    return FileAttachment(final_name, digest_info.size_bytes, hex_digest)
    

In [467]:
# rng.integers(256)
# (55).to_bytes(1, 'big')

# Root output folder and its two sub-folders (serialised mails / attachment files).
ADPS_FOLDER = PYADPS_PATH + '/tmp'

adps_messages_path = ADPS_FOLDER + '/adps_messages'
adps_attachments_path = ADPS_FOLDER + '/adps_attachments'

# makedirs(..., exist_ok=True) also creates ADPS_FOLDER when needed and, unlike a bare
# os.mkdir, does not raise FileExistsError when this cell is run a second time.
# Files left over from an earlier run are NOT removed.
os.makedirs(adps_messages_path, exist_ok=True)
os.makedirs(adps_attachments_path, exist_ok=True)

In [468]:
# Materialise every attachment on disk, one create_attachment call per requested size.
# The big files alone add up to 4 GiB (1 GiB + 2 x 512 MiB + 4 x 256 MiB + 8 x 128 MiB),
# and each file's bytes are generated in memory before being written.
big_attachments = [create_attachment(adps_attachments_path, file_size_bytes) 
                   for file_size_bytes in big_files_sizes]
small_attachments = [create_attachment(adps_attachments_path, file_size_bytes) 
                     for file_size_bytes in short_files_sizes]

In [480]:
# NOTE(review): scratch cell. No later cell visible here reads this list (the name `l` is
# only rebound as a loop variable further down), but the 1,000,000 draws do advance the
# shared `rng`, so keeping or removing this cell changes all data generated after it.
l = [rng.random() for _ in range(1000000)]

In [483]:
# Decide, independently for each of the `mail_number` mails, which of the attachment
# files created above it carries.
attachments_lists = [choose_attachments_for_mail(big_attachments, small_attachments)
                    for _ in range(mail_number)]
attachments_lists[:15]

[[FileAttachment(filename='21a99a033a.bin', size_bytes=2735, hashsum_hex='21a99a033a37a23a7a95d3b48c2733320b1e8a6431a5b9628d174dabd19cdd183419eda9e4ee4c9ed5f74507b0784a5c75b5a492dee8eff0e5993ece1638979b', hashsum_alg='sha512')],
 [FileAttachment(filename='e6175d93c6.bin', size_bytes=11424, hashsum_hex='e6175d93c65518e96ca3a977a3bf5a1c0c127d5e3877f304e890b01e436b46fa320798b7b189674521ac5200981800ecb84af65cca0995ed8494f328c3bfd2bb', hashsum_alg='sha512')],
 [FileAttachment(filename='8d8aec8312.bin', size_bytes=9318, hashsum_hex='8d8aec8312fd812e5a7f627aedf2f93a7722e3a01e59d683c4613a4cc625bbd0541918ffca47926ecce1b2f226e963502aea4c12867ccf5c8ebafddcccbf374a', hashsum_alg='sha512')],
 [FileAttachment(filename='6fd9720661.bin', size_bytes=19831, hashsum_hex='6fd9720661e72c004549f081bbc2a8749f09225508a6098b55aac35e3037a4d318c9bdc33cd27443749043dc0dcc573b6a15b20b6fabc0b499de542d81a26377', hashsum_alg='sha512')],
 [],
 [FileAttachment(filename='c2fcfa18b1.bin', size_bytes=10359, hashsum_hex='c2

In [484]:
len(attachments_lists)

50000

In [493]:
[generate_random_datetime(DATETIME_FROM, DATETIME_TO) for _ in range(mail_number)]

[datetime.datetime(2021, 8, 9, 17, 4, 9),
 datetime.datetime(2021, 9, 13, 11, 9, 13),
 datetime.datetime(2021, 9, 12, 9, 15, 43),
 datetime.datetime(2021, 6, 8, 22, 8, 20),
 datetime.datetime(2021, 2, 23, 8, 39, 21),
 datetime.datetime(2021, 11, 9, 17, 37, 19),
 datetime.datetime(2021, 9, 5, 18, 31, 3),
 datetime.datetime(2021, 8, 3, 7, 5, 42),
 datetime.datetime(2021, 1, 6, 5, 43, 52),
 datetime.datetime(2021, 12, 27, 3, 28, 33),
 datetime.datetime(2021, 6, 16, 6, 48, 26),
 datetime.datetime(2021, 8, 5, 18, 25, 6),
 datetime.datetime(2021, 12, 28, 17, 24, 10),
 datetime.datetime(2021, 5, 11, 17, 19, 31),
 datetime.datetime(2021, 6, 1, 4, 55, 29),
 datetime.datetime(2021, 7, 5, 9, 8, 18),
 datetime.datetime(2021, 12, 7, 21, 50, 42),
 datetime.datetime(2021, 8, 29, 17, 32, 42),
 datetime.datetime(2021, 12, 19, 8, 38, 2),
 datetime.datetime(2021, 8, 26, 17, 19, 30),
 datetime.datetime(2021, 9, 15, 17, 46, 3),
 datetime.datetime(2021, 5, 29, 2, 23, 3),
 datetime.datetime(2021, 4, 25, 1, 4

In [540]:
# Generate the remaining per-mail fields, one value per mail. All four comprehensions
# draw from the shared `rng`, so their order fixes the generated data.
list_of_names = [generate_name(all_names) for _ in range(mail_number)]
list_of_additional_notes = [generate_additional_notes(all_names) for _ in range(mail_number)]
list_of_inline_messages = [generate_inline_message(all_names) for _ in range(mail_number)]
list_of_date_created = [generate_random_datetime(DATETIME_FROM, DATETIME_TO) for _ in range(mail_number)]

# Sanity check: the final cell zips these lists together, and zip() stops at the
# shortest one, so all six lengths printed here should be equal.
for l in [list_of_date_created, list_of_cities, list_of_names, list_of_additional_notes,
         list_of_inline_messages, attachments_lists]:
    print(len(l))

50000
50000
50000
50000
50000
50000


In [539]:
[generate_inline_message(all_names) for _ in range(1000)]

['arsenio fala gurdeep',
 None,
 'uraitz ezzahrae mae abderahman kora satwinder emelia shandi yoro',
 None,
 'todor cristian levent abdelhanin nicodemus estanis estanisla bonny muhammad jitka ahcene mozella shahzad revaz shafaqat',
 'haizea emilienne umme oriana roksana yaxuan eutiquiano haritz shamira mirco bernadette',
 'deyan brain hegoa',
 None,
 'sofka',
 'subhadra hades therese zhiguo suzy olavo davinia mounya leonisa armelina merle salomon fayssal ilargi petria alcazar liga gabriel guarda claudinei',
 None,
 'arlen balbina ouasima nayim edmund khouloud',
 'khalifa banesa una ivelisse rode alaitz pabla dwayne rahul oier orion athanasios yaru niceforo branimir eliodora ariam embarec sashko waltraut abdelfatah setti anyi antima carey somiya doltza fortunata edey',
 'sari masako kabir daylos espiridion takeshi costin leoncia clarita ivanca irene agustina gerhard morad lyndsey tomeu gheorge ellis cherno fahima juanma encarna',
 'geraldin idris',
 None,
 'yaisa xiumei anai',
 None,
 '

In [537]:
%%time
x = [rng.choice(all_names, 2) for _ in range(1000)]

CPU times: user 16.3 ms, sys: 14 µs, total: 16.3 ms
Wall time: 15.4 ms


In [509]:
%%time
x = [all_names[rnd_val] for rnd_val in [rng.integers(0, len(all_names)) for _ in range(1000)]]

CPU times: user 9.29 ms, sys: 12 µs, total: 9.3 ms
Wall time: 8.13 ms


In [507]:
# https://stackoverflow.com/questions/18622781/why-is-numpy-random-choice-so-slow
class CustomRng:
    """Wrapper around a numpy ``Generator`` with a cheaper ``choice`` for long lists.

    ``integers``, ``bytes`` and ``random`` are forwarded unchanged. ``choice`` avoids
    handing a long Python list to numpy (which converts the whole list on every call)
    by drawing random indexes and indexing the list directly.
    """

    # Lists shorter than this are passed to numpy's own ``choice``.
    _SMALL_LIST_THRESHOLD = 100
    # Maximum number of index draws spent on finding one not-yet-used index.
    _MAX_ATTEMPTS = 100

    def __init__(self, np_rng):
        """Store the wrapped generator (anything offering integers/bytes/random/choice)."""
        self._rng = np_rng

    def integers(self, *args, **kwargs):
        """Forward to the wrapped generator's ``integers``."""
        return self._rng.integers(*args, **kwargs)

    def bytes(self, *args, **kwargs):
        """Forward to the wrapped generator's ``bytes``."""
        return self._rng.bytes(*args, **kwargs)

    def random(self, *args, **kwargs):
        """Forward to the wrapped generator's ``random``."""
        return self._rng.random(*args, **kwargs)

    def choice(self, l: list, *args, **kwargs):
        """Pick one or several elements of ``l``.

        Fast path (list of at least ``_SMALL_LIST_THRESHOLD`` elements, called either
        with no extra argument or with a single integer count):

        * no count given  -> one bare element of ``l``;
        * count ``n``     -> a list of ``n`` elements taken from ``n`` distinct positions,
          in the order they were drawn (also for ``n == 1``, matching the length-1
          sequence numpy returns on the short-list path).

        Every other call (short list, keyword arguments, extra positional arguments,
        non-integer size) is delegated unchanged to the wrapped generator's ``choice``.

        Raises:
            Exception: when no unused position was found within ``_MAX_ATTEMPTS`` draws,
                e.g. because more distinct elements were requested than ``l`` holds.
        """
        has_plain_count = len(args) == 1 and isinstance(args[0], (int, np.integer))
        use_fast_path = (
            not kwargs
            and len(l) >= self._SMALL_LIST_THRESHOLD
            and (not args or has_plain_count)
        )
        if not use_fast_path:
            return self._rng.choice(l, *args, **kwargs)

        if not args:
            # A single pick needs exactly one index draw.
            return l[self.integers(0, len(l))]

        count = int(args[0])
        drawn_indexes = []       # keeps the draw order for the result
        used_indexes = set()     # O(1) "already taken?" lookup
        for _ in range(count):
            for _attempt in range(self._MAX_ATTEMPTS):
                idx = int(self.integers(0, len(l)))
                if idx not in used_indexes:
                    used_indexes.add(idx)
                    drawn_indexes.append(idx)
                    break
            else:
                # The attempt loop ended without finding a free index.
                raise Exception('Could not build unique set')

        return [l[idx_] for idx_ in drawn_indexes]

# Rebind the notebook-wide `rng` to the wrapper, seeded again from SEED.
# NOTE(review): this restarts the random stream, and only cells executed after this one
# use the wrapper; in top-to-bottom order the generation cells above still run with the
# plain numpy generator.
rng = CustomRng(np.random.default_rng(SEED))

In [549]:
# Assemble one Mail per position of the six per-mail lists and write it to disk as JSON.
# zip() stops at the shortest list, so any length mismatch silently drops mails.
for created_date, coords_list, name, additional_notes, inline_message, attachment_infos in zip(
    list_of_date_created, 
    list_of_cities, 
    list_of_names, 
    list_of_additional_notes,
    list_of_inline_messages,
    attachments_lists
):
    mail = Mail(
        date_created=created_date,
        # Each (lat, lon) tuple becomes one CoordsData.
        recipient_coords=[CoordsData(*one_coords) for one_coords in coords_list],
        name=name,
        additional_notes=additional_notes,
        inline_message=inline_message,
        attachments=attachment_infos
    )
    # Serialise through the Mail schema, then to JSON bytes with sorted keys.
    mail_serialized = Mail.Schema().dump(mail)
    mail_json_bytes = json.dumps(mail_serialized, indent=4, sort_keys=True).encode()
    hashsum = calculate_hashsum(BytesIO(mail_json_bytes))
    
    # The file is named after the first 10 hex characters of the hash of its content.
    # NOTE(review): two mails whose digests share that prefix would overwrite each other.
    mail_path = adps_messages_path + '/' + hashsum.hex_digest[:10] + '.json'
    with open(mail_path, 'wb') as output_json_file:
        output_json_file.write(mail_json_bytes)