In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 50)

import os
# Use this to see how much memory the dataframes use
from sys import getsizeof

import yaml
import re

import pseudopeople as psp
from pseudopeople.configuration import get_configuration
from vivarium.framework.randomness import RandomnessStream
from vivarium.config_tree import ConfigTree

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import datatypes, alpha, data_loading
from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data


!date
!whoami
!uname -a
!pwd

Wed 03 May 2023 03:47:25 PM PDT
ndbs
Linux int-slurm-sarchive-p0002 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


In [2]:
%load_ext autoreload
%autoreload 2

# Test functions for zero noise configuration

In [3]:
config = psp.get_config()
config.keys()

dict_keys(['decennial_census', 'american_community_survey', 'current_population_survey', 'women_infants_and_children', 'social_security', 'taxes_w2_and_1099'])

In [5]:
zero_config = alpha.get_zero_noise_config()
zero_config

{'decennial_census': {'row_noise': {'omit_row': {'row_probability': 0}},
  'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0},
    'use_fake_name': {'cell_probability': 0},
    'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
   'middle_initial': {'leave_blank': {'cell_probability': 0},
    'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
   'last_name': {'leave_blank': {'cell_probability': 0},
    'use_fake_name': {'cell_probability': 0},
    'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
   'age': {'leave_blank': {'cell_probability': 0},
    'misreport_age': {'cell_probability': 0,
     'possible_age_differences': {-2: 0.1, -1: 0.4, 1: 0.4, 2: 0.1}},
    'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
   'date_of_birth': {'leave_blank': {'cell_probability': 0},
    'write_wrong_digits': {'cell_probability': 0, 'token_probability': 0.1},
    'make_typos': {'cell_probability': 0, 'token_probabili

In [6]:
census = psp.generate_decennial_census(config=zero_config)
census

                                                                                                     

Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity
0,0_2,Melanie,L,Herrod,26,08/05/1993,10233,north burgher avenue,,Anytown,US,00000,Reference person,Female,White
1,0_3,Jordan,C,Herrod,26,12/29/1993,10233,north burgher avenue,,Anytown,US,00000,Other relative,Female,White
2,0_923,John,E,Mckeever,77,06/29/1942,147-153,browning ave,,Anytown,US,00000,Reference person,Male,Black
3,0_2641,Sharon,T,Schmidt,59,10/10/1960,107,stallion st,,Anytown,US,00000,Reference person,Female,White
4,0_2801,Ronnie,A,Arthur,73,12/05/1946,214,s vine lane,,Anytown,US,00000,Reference person,Male,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10404,0_19008,James,G,Halsey,56,06/12/1963,1113,times square blvd,,Anytown,US,00000,Reference person,Male,Black
10405,0_20161,Nannette,D,Hoffman,61,11/09/1958,4123,nw 13th ave,no 207r,Anytown,US,00000,Reference person,Female,White
10406,0_20162,Cynthia,L,Hoffman,65,01/20/1955,4123,nw 13th ave,no 207r,Anytown,US,00000,Same-sex spouse,Female,White
10407,0_19669,Anthony,B,Cowan,59,10/06/1960,84101,inkberry drive,,Anytown,US,00000,Reference person,Male,Black


In [7]:
census.isna().sum()

simulant_id                        0
first_name                         0
middle_initial                     0
last_name                          0
age                                0
date_of_birth                      0
street_number                    438
street_name                        0
unit_number                     9857
city                               0
state                              0
zipcode                            0
relation_to_reference_person       0
sex                                0
race_ethnicity                     0
dtype: int64

In [8]:
zero_config2 = psp.get_config()
alpha.recursive_zero(zero_config2)
zero_config2

{'decennial_census': {'row_noise': {'omit_row': {'row_probability': 0.0}},
  'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.0},
    'use_fake_name': {'cell_probability': 0.0},
    'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
   'middle_initial': {'leave_blank': {'cell_probability': 0.0},
    'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
   'last_name': {'leave_blank': {'cell_probability': 0.0},
    'use_fake_name': {'cell_probability': 0.0},
    'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
   'age': {'leave_blank': {'cell_probability': 0.0},
    'misreport_age': {'cell_probability': 0.0,
     'possible_age_differences': {-2: 0.1, -1: 0.4, 1: 0.4, 2: 0.1}},
    'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
   'date_of_birth': {'leave_blank': {'cell_probability': 0.0},
    'write_wrong_digits': {'cell_probability': 0.0, 'token_probability': 0.0},
    'make_typos': {'cell_proba

# Test code for merging categories

In [18]:
s = pd.Series([1, 10, 5, '10', np.nan, 9, 10, '1', np.nan])
s

0      1
1     10
2      5
3     10
4    NaN
5      9
6     10
7      1
8    NaN
dtype: object

In [19]:
s_cat = s.astype('category')
s_cat

0      1
1     10
2      5
3     10
4    NaN
5      9
6     10
7      1
8    NaN
dtype: category
Categories (6, object): [1, 5, 9, 10, '1', '10']

In [20]:
cat_map = dict(zip(s_cat.cat.categories, s_cat.cat.categories.astype(str)))
cat_map

{1: '1', 5: '5', 9: '9', 10: '10', '1': '1', '10': '10'}

In [63]:
# def cat_func(cat):
#     return str(cat)

cat_func = str

list(map(cat_func, s_cat.cat.categories))

['1', '5', '9', '10', '1', '10']

In [59]:
def merge_series_categories(series, category_mapping):
    # https://stackoverflow.com/questions/32262982/pandas-combining-multiple-categories-into-one
    return series.map(category_mapping).astype('category')

def merge_categories1(categorical: pd.Categorical, old_cat_to_new_cat: dict):
    # Oops, old_cat_to_new_cat doesn't need to be a dict -- it can be a function or Series too
    new_cat_array = categorical.categories.map(old_cat_to_new_cat)
    new_cats = new_cat_array.unique()
    if len(new_cats) == len(new_cat_array):
        # one-to-one mapping -> no merging is necessary, just renaming of categories
        # Note: Index.unique() returns values in order of appearance, not sorted
        new_categorical = categorical.rename_categories(new_cats)
    else:
        # Map each new category to its index in the categories array, i.e., its code
        new_cat_to_new_code = dict(zip(new_cats, range(len(new_cats))))
        # This array replaces each old category with the index (code) of the new category
        new_code_array = new_cat_array.map(new_cat_to_new_code)
        # The index (code) of the old category is mapped to the index (code) of the new category
        old_code_to_new_code = dict(zip(range(len(new_code_array)), new_code_array))
        # -1 indicates NaN and needs to stay the same in the new codes
        old_code_to_new_code.update({-1: -1})
        new_codes = categorical.codes.map(old_code_to_new_code)
        new_categorical = pd.Categorical.from_codes(new_codes, new_cats)
    return new_categorical

def merge_categories2(categorical: pd.Categorical, old_cat_to_new_cat):
    new_cat_array = categorical.categories.map(old_cat_to_new_cat)
    new_cats = new_cat_array.unique()
    if len(new_cats) == len(new_cat_array):
        # one-to-one mapping -> no merging is necessary, just renaming of categories
        # Note: Index.unique() returns values in order of appearance, not sorted,
        # therefore is guaranteed to equal new_cat_array
        new_categorical = categorical.rename_categories(new_cats)
    else:
        # Map each new category to its index in the categories array, i.e., its code
        new_cat_to_new_code = dict(zip(new_cats, range(len(new_cats))))
        # This array replaces each old category with the index (code) of the new category
        new_code_array = new_cat_array.map(new_cat_to_new_code)
        # The index (code) of the old category is mapped to the index (code) of the new category
        new_codes = categorical.codes.map(lambda old_code: new_code_array[old_code] if old_code != -1 else -1)
        new_categorical = pd.Categorical.from_codes(new_codes, new_cats)
    return new_categorical

def merge_categories3(categorical: pd.Categorical, old_cat_to_new_cat):
    new_cat_array = categorical.categories.map(old_cat_to_new_cat)
    new_cats = new_cat_array.unique()
    if len(new_cats) == len(new_cat_array):
        # one-to-one mapping -> no merging is necessary, just renaming of categories
        # Note: Index.unique() returns values in order of appearance, not sorted,
        # therefore is guaranteed to equal new_cat_array
        new_categorical = categorical.rename_categories(new_cats)
    else:
        # Map each new category to its index in the categories array, i.e., its code
        new_cat_to_new_code = dict(zip(new_cats, range(len(new_cats))))
        # This array replaces each old category with the index (code) of the new category
        new_code_array = new_cat_array.map(new_cat_to_new_code).to_list()
        # -1 indicates NaN and needs to stay the same in the new codes
        new_code_array.append(-1)
        # The index (code) of the old category is mapped to the index (code) of the new category
        new_codes = categorical.codes.map(lambda old_code: new_code_array[old_code])
        new_categorical = pd.Categorical.from_codes(new_codes, new_cats)
    return new_categorical

%time merge_categories1(s_cat.cat, cat_map)

CPU times: user 5.13 ms, sys: 162 µs, total: 5.29 ms
Wall time: 5.18 ms


['1', '10', '5', '10', NaN, '9', '10', '1', NaN]
Categories (4, object): ['1', '5', '9', '10']

In [31]:
%time merge_categories1(s_cat.cat, cat_func)

CPU times: user 4.05 ms, sys: 769 µs, total: 4.82 ms
Wall time: 4.67 ms


['1', '10', '5', '10', NaN, '9', '10', '1', NaN]
Categories (4, object): ['1', '5', '9', '10']

In [48]:
s_cat.cat.categories

Index([1, 5, 9, 10, '1', '10'], dtype='object')

In [26]:
s_cat.cat.categories.unique()

Index([1, 5, 9, 10, '1', '10'], dtype='object')

In [27]:
new_cat_array = s_cat.cat.categories.map(cat_func)
new_cat_array

Index(['1', '5', '9', '10', '1', '10'], dtype='object')

In [28]:
new_cat_array.unique()

Index(['1', '5', '9', '10'], dtype='object')

In [55]:
temp = new_cat_array.to_list()
temp

['1', '5', '9', '10', '1', '10']

In [56]:
temp.append(-1)
temp

['1', '5', '9', '10', '1', '10', -1]

In [58]:
new_cat_array.append(pd.Index([-1]))

Index(['1', '5', '9', '10', '1', '10', -1], dtype='object')

In [32]:
%timeit merge_categories1(s_cat.cat, cat_map)
%timeit merge_categories1(s_cat.cat, cat_func)

1.19 ms ± 3.17 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
949 µs ± 2.48 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [33]:
%timeit merge_categories2(s_cat.cat, cat_map)
%timeit merge_categories2(s_cat.cat, cat_func)

949 µs ± 4.39 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
685 µs ± 1.43 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [34]:
merge_categories1(s_cat.cat, cat_func).equals(merge_categories2(s_cat.cat, cat_func))

True

# Test with a long Series

In [35]:
size = 1_000_000
rng = np.random.default_rng(99887)
a = rng.choice([1, 12, 3, 4, 4], size=size)
b = rng.choice(['1', '1', '12', '3'], size=size)
ab = pd.concat([pd.Series(a), pd.Series(b)], ignore_index=True).astype('category')
ab

0           4
1           4
2          12
3           4
4           1
           ..
1999995     1
1999996     1
1999997     1
1999998     1
1999999     3
Length: 2000000, dtype: category
Categories (7, object): [1, 3, 4, 12, '1', '12', '3']

In [36]:
ab.value_counts()

1     499691
4     399736
3     250291
12    250018
12    200469
3     199909
1     199886
dtype: int64

In [60]:
ab_str_s = merge_series_categories(ab, cat_func)
ab_str1 = pd.Series(merge_categories1(ab.cat, cat_func))
ab_str2 = pd.Series(merge_categories2(ab.cat, cat_func))
ab_str3 = pd.Series(merge_categories3(ab.cat, cat_func))
for series in [ab_str1, ab_str2, ab_str3]:
    print(series.equals(ab_str_s))

True
True
True


In [64]:
%timeit merge_series_categories(ab, cat_func)
%timeit pd.Series(merge_categories1(ab.cat, cat_func))
%timeit pd.Series(merge_categories2(ab.cat, cat_func))
%timeit pd.Series(merge_categories3(ab.cat, cat_func))

97.3 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
17.3 ms ± 18.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.6 s ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
431 ms ± 517 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [65]:
4600/17.3

265.8959537572254

In [40]:
%time pd.Series(merge_categories2(ab.cat, cat_func))

CPU times: user 4.65 s, sys: 39.7 ms, total: 4.69 s
Wall time: 4.68 s


0           4
1           4
2          12
3           4
4           1
           ..
1999995     1
1999996     1
1999997     1
1999998     1
1999999     3
Length: 2000000, dtype: category
Categories (4, object): ['1', '3', '4', '12']

In [42]:
d = dict(zip(ab.cat.categories, ab.cat.categories.astype(str)))
d

{1: '1', 3: '3', 4: '4', 12: '12', '1': '1', '12': '12', '3': '3'}

In [43]:
%time pd.Series(merge_categories2(ab.cat, d))

CPU times: user 4.77 s, sys: 60.8 ms, total: 4.83 s
Wall time: 4.83 s


0           4
1           4
2          12
3           4
4           1
           ..
1999995     1
1999996     1
1999997     1
1999998     1
1999999     3
Length: 2000000, dtype: category
Categories (4, object): ['1', '3', '4', '12']

In [44]:
%time pd.Series(merge_categories1(ab.cat, cat_func))

CPU times: user 34.5 ms, sys: 263 µs, total: 34.7 ms
Wall time: 32.4 ms


0           4
1           4
2          12
3           4
4           1
           ..
1999995     1
1999996     1
1999997     1
1999998     1
1999999     3
Length: 2000000, dtype: category
Categories (4, object): ['1', '3', '4', '12']

In [45]:
%time pd.Series(merge_categories1(ab.cat, d))

CPU times: user 35.2 ms, sys: 0 ns, total: 35.2 ms
Wall time: 32.9 ms


0           4
1           4
2          12
3           4
4           1
           ..
1999995     1
1999996     1
1999997     1
1999998     1
1999999     3
Length: 2000000, dtype: category
Categories (4, object): ['1', '3', '4', '12']

# Test `merge_categories` function after copying to module

### First make sure we get the same Series as above

In [66]:
ab_str_m = pd.Series(datatypes.merge_categories(ab.cat, cat_func))
ab_str_m.equals(ab_str_s)

True

In [67]:
%timeit pd.Series(datatypes.merge_categories(ab.cat, cat_func))

16.9 ms ± 26.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [69]:
s_cat

0      1
1     10
2      5
3     10
4    NaN
5      9
6     10
7      1
8    NaN
dtype: category
Categories (6, object): [1, 5, 9, 10, '1', '10']

In [68]:
datatypes.merge_categories(s_cat.cat, cat_func)

['1', '10', '5', '10', NaN, '9', '10', '1', NaN]
Categories (4, object): ['1', '5', '9', '10']

### Now make sure it works with a one-to-one function

In [70]:
def double(x):
    return 2*x

datatypes.merge_categories(s_cat.cat, double)

0       2
1      20
2      10
3    1010
4     NaN
5      18
6      20
7      11
8     NaN
dtype: category
Categories (6, object): [2, 10, 18, 20, '11', '1010']

In [74]:
s_cat.map(double).equals(datatypes.merge_categories(s_cat.cat, double))

True

In [76]:
ab.map(double)

0           8
1           8
2          24
3           8
4           2
           ..
1999995    11
1999996    11
1999997    11
1999998    11
1999999    33
Length: 2000000, dtype: category
Categories (7, object): [2, 6, 8, 24, '11', '1212', '33']

In [77]:
ab.map(double).equals(datatypes.merge_categories(ab.cat, double))

True

In [78]:
%timeit datatypes.merge_categories(ab.cat, double)

543 µs ± 601 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Now test `convert_category_dtype` function after refactoring

In [80]:
pd.Categorical(s_cat)

[1, 10, 5, '10', NaN, 9, 10, '1', NaN]
Categories (6, object): [1, 5, 9, 10, '1', '10']

In [83]:
pd.Categorical(s)

[1, 10, 5, '10', NaN, 9, 10, '1', NaN]
Categories (6, object): [1, 5, 9, 10, '1', '10']

In [84]:
df = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7,8,9]})
df

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [85]:
df.dtypes

a    int64
b    int64
c    int64
dtype: object

In [86]:
df['c'] = pd.Categorical(df['c'])
df.dtypes

a       int64
b       int64
c    category
dtype: object

In [87]:
df

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [90]:
df.c

0    7
1    8
2    9
Name: c, dtype: category
Categories (3, int64): [7, 8, 9]

In [91]:
datatypes.convert_category_dtype(df, str)
df.c

0    7
1    8
2    9
Name: c, dtype: category
Categories (3, object): ['7', '8', '9']

In [92]:
df.dtypes

a       int64
b       int64
c    category
dtype: object

In [93]:
df

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [96]:
df['d'] = pd.Categorical([1,2,'1'])
df.d

0    1
1    2
2    1
Name: d, dtype: category
Categories (3, object): [1, 2, '1']

In [97]:
datatypes.convert_category_dtype(df, str)
df.d

0    1
1    2
2    1
Name: d, dtype: category
Categories (2, object): ['1', '2']

In [98]:
df.dtypes

a       int64
b       int64
c    category
d    category
dtype: object

In [99]:
df

Unnamed: 0,a,b,c,d
0,1,4,7,1
1,2,5,8,2
2,3,6,9,1


# Test the functions when there are more categories

In [100]:
int(3.5)

3

In [103]:
%%time
rng = np.random.default_rng(456456)
size = 1_000_000
frac = 1/3
num = int(size*frac)
ints = range(num)
strs = map(str, ints)
remaining = rng.choice(['1', '456', '789', 1, 456, 789, 789], size=size-2*num)
many_cats = pd.concat([pd.Series(ints), pd.Series(strs), pd.Series(remaining)], ignore_index=True).astype('category')
many_cats

CPU times: user 931 ms, sys: 39.1 ms, total: 970 ms
Wall time: 967 ms


0           0
1           1
2           2
3           3
4           4
         ... 
999995    789
999996    456
999997    456
999998    789
999999    456
Length: 1000000, dtype: category
Categories (666666, object): [0, 1, 2, 3, ..., '99996', '99997', '99998', '99999']

In [104]:
%%time
many_cats_str_s = merge_series_categories(many_cats, cat_func)
many_cats_str1 = pd.Series(merge_categories1(many_cats.cat, cat_func))
many_cats_str2 = pd.Series(merge_categories2(many_cats.cat, cat_func))
many_cats_str3 = pd.Series(merge_categories3(many_cats.cat, cat_func))
for series in [many_cats_str1, many_cats_str2, many_cats_str3]:
    print(series.equals(many_cats_str_s))

True
True
True
CPU times: user 5.64 s, sys: 175 ms, total: 5.81 s
Wall time: 5.81 s


In [105]:
%timeit merge_series_categories(many_cats, cat_func)
%timeit pd.Series(merge_categories1(many_cats.cat, cat_func))
%timeit pd.Series(merge_categories2(many_cats.cat, cat_func))
%timeit pd.Series(merge_categories3(many_cats.cat, cat_func))

540 ms ± 686 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
996 ms ± 3.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.93 s ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
868 ms ± 718 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [106]:
sizemb(many_cats)

53.797325

In [107]:
sizemb(pd.concat([pd.Series(ints), pd.Series(strs), pd.Series(remaining)], ignore_index=True))

31.808854

# Write functions to check runtime and memory usage for a range of numbers of categories

In [109]:
4/2, 5//2

(2.0, 2)

In [112]:
temp = rng.choice([1,2,3], 10)
temp

array([2, 1, 1, 3, 1, 2, 1, 1, 3, 1])

In [113]:
temp.astype(str)

array(['2', '1', '1', '3', '1', '2', '1', '1', '3', '1'], dtype='<U21')

In [None]:
rng.choice()

In [121]:
%%time
def get_series(cat_frac, size=1_000_000, random_state=None):
    rng = np.random.default_rng(random_state)
    num_repeated = 20 # could pass this as a parameter, but it shouldn't be very important
    num_cats = int(size*cat_frac)
    if num_cats % 2:
        num_cats -= 1
    ints = range(num_cats//2)
    strs = map(str, ints)
    repeated = rng.choice(ints, size=num_repeated)
    repeated = [*repeated, *repeated.astype(str)]
    remaining = rng.choice(repeated, size=size-num_cats)
    series = pd.concat(
        [pd.Series(ints), pd.Series(strs), pd.Series(remaining)],
        ignore_index=True)
    return series

seed=456456
t = get_series(1/4, random_state=seed)
t

CPU times: user 131 ms, sys: 44.2 ms, total: 175 ms
Wall time: 173 ms


0              0
1              1
2              2
3              3
4              4
           ...  
999995    119278
999996     62213
999997    119682
999998    119682
999999     13572
Length: 1000000, dtype: object

In [122]:
t_cat = t.astype('category')
t_cat

0              0
1              1
2              2
3              3
4              4
           ...  
999995    119278
999996     62213
999997    119682
999998    119682
999999     13572
Length: 1000000, dtype: category
Categories (250000, object): [0, 1, 2, 3, ..., '99996', '99997', '99998', '99999']

In [123]:
def measure_stuff(series, cat_func):
    cat_series = series.astype('category')
    print("Series:", sizemb(series), "MB")
    print("Categ :", sizemb(cat_series), "MB")
    %timeit merge_series_categories(cat_series, cat_func)
    %timeit pd.Series(merge_categories1(cat_series.cat, cat_func))
    %timeit pd.Series(merge_categories2(cat_series.cat, cat_func))
    %timeit pd.Series(merge_categories3(cat_series.cat, cat_func))

measure_stuff(t, cat_func)

Series: 58.951222 MB
Categ : 24.718214 MB
202 ms ± 403 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
306 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.56 s ± 8.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
430 ms ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [124]:
seed = 890890
for cat_frac in [1/100, 1/10, 1/4, 1/3]:
    print("Cat fraction:", cat_frac)
    temp = get_series(cat_frac, random_state=seed)
    measure_stuff(temp, cat_func)
    print()

Cat fraction: 0.01
Series: 60.57726 MB
Categ : 2.748262 MB
69.4 ms ± 398 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
17.4 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.37 s ± 8.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
246 ms ± 952 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

Cat fraction: 0.1
Series: 60.41923 MB
Categ : 11.002606 MB
106 ms ± 381 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
119 ms ± 260 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.43 s ± 6.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
303 ms ± 719 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

Cat fraction: 0.25
Series: 58.763883 MB
Categ : 24.718214 MB
204 ms ± 1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
314 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.54 s ± 4.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
435 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [125]:
seed = 890890
for cat_frac in [1/2, 2/3]:
    print("Cat fraction:", cat_frac)
    temp = get_series(cat_frac, random_state=seed)
    measure_stuff(temp, cat_func)
    print()

Cat fraction: 0.5
Series: 55.888951 MB
Categ : 45.547358 MB
417 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
733 ms ± 2.76 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.79 s ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
682 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Cat fraction: 0.6666666666666666
Series: 53.789261 MB
Categ : 53.797325 MB
535 ms ± 867 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
993 ms ± 3.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.96 s ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
859 ms ± 3.92 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

