In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 50)

import os
# Use this to see how much memory the dataframes use
from sys import getsizeof

import yaml
import re

import pseudopeople as pp
from pseudopeople.configuration import get_configuration
from vivarium.framework.randomness import RandomnessStream
from vivarium.config_tree import ConfigTree

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import datatypes, alpha, data_loading
from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data


# Record provenance for this run: timestamp, user, host/kernel info, and working directory.
!date
!whoami
!uname -a
!pwd

Fri 14 Apr 2023 02:52:32 PM PDT
ndbs
Linux gen-slurm-sarchive-p0016 5.4.0-135-generic #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell executes.
%load_ext autoreload
%autoreload 2

In [3]:
# Root of the synthetic-population project and the specific model run used below.
project_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
model_dir = project_dir + '/results' + '/v2.0_three_censuses/united_states_of_america/2023_04_08_14_07_04'

# The final results are stored once per file format.
parquet_dir = model_dir + '/final_results/parquet'
hdf_dir = model_dir + '/final_results/hdf'

# Per-format directories for the Rhode Island subset and the full USA output.
rhode_island_par_dir = parquet_dir + '/states/rhode_island'
usa_par_dir = parquet_dir + '/usa'
rhode_island_hdf_dir = hdf_dir + '/states/rhode_island'
usa_hdf_dir = hdf_dir + '/usa'

# List the contents of the HDF results directory (newest first, human-readable sizes).
!ls -halt $hdf_dir

total 96K
drwxrwsr-x  3 rmudambi IHME-Simulationscience  512 Apr  9 18:49 states
drwxrwsr-x  5 rmudambi IHME-Simulationscience 1.5K Apr  9 18:49 .
drwxrwsr-x 10 rmudambi IHME-Simulationscience 4.0K Apr  9 15:35 usa
drwxrwsr-x  4 rmudambi IHME-Simulationscience 2.0K Apr  9 15:21 ..
drwxrwsr-x  2 rmudambi IHME-Simulationscience 335K Apr  9 11:55 logs


In [5]:
# List the per-shard HDF files written for the USA social_security_observer output.
!ls -halt $usa_hdf_dir/social_security_observer

total 8.8G
drwxrwsr-x 10 rmudambi IHME-Simulationscience 4.0K Apr  9 15:35 ..
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 12:22 social_security_observer_5399.hdf
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:22 .
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 11:50 social_security_observer_9847.hdf
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 11:50 social_security_observer_9971.hdf
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 11:49 social_security_observer_9911.hdf
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 11:49 social_security_observer_9888.hdf
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 11:49 social_security_observer_99.hdf
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 11:49 social_security_observer_9772.hdf
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 11:49 social_security_observer_9901.hdf
-rw-r--r--  1 rmudambi IHME-Simulationscience  27M Apr  9 11:49

In [16]:
# Example shard filename; the digits between the final underscore and the dot
# are what we want to pull out.
filename = 'social_security_observer_9871.hdf'
matches = re.compile(r'_(\d+)\.').findall(filename)
matches

['9871']

In [19]:
ext = '.hdf'
# Anchor the pattern on the full filename and require the given extension.
# The extension is escaped so that its leading '.' matches only a literal dot
# (unescaped, '.' would match any character, e.g. '..._9871xhdf').
matches = re.findall(r'^.*_(\d+)' + re.escape(ext) + '$', filename)
matches

['9871']

In [35]:
ext = '.hdf'
# Same pattern as with findall, but re.match returns a Match object whose
# groups can be inspected. The extension is escaped so that its '.' is treated
# as a literal dot rather than a wildcard.
match = re.match(r'^.*_(\d+)' + re.escape(ext) + '$', filename)
match

<re.Match object; span=(0, 33), match='social_security_observer_9871.hdf'>

In [38]:
# All capture groups as a tuple; the pattern has exactly one group, (\d+).
match.groups()

('9871',)

In [39]:
# Group 0 is the entire matched text.
match.group(0)

'social_security_observer_9871.hdf'

In [40]:
# Group 1 is the first (and only) capture group: the digits before the extension.
match.group(1)

'9871'

In [41]:
# The pattern defines only one capture group, so asking for group 2 raises
# IndexError. Catch and print it so the demonstration does not stop a
# "Restart & Run All" of the notebook.
try:
    display(match.group(2))
except IndexError as err:
    print(f'{type(err).__name__}: {err}')

IndexError: no such group

In [42]:
ext = '.hdf'
# Raw f-string version of the pattern. The extension is passed through
# re.escape so that its '.' matches a literal dot instead of any character.
match = re.match(fr'^.*_(\d+){re.escape(ext)}$', filename)
match.groups()

('9871',)

In [44]:
# The digits captured by the single group in the f-string version of the pattern.
match.group(1)

'9871'

In [27]:
%%time
# Load the social_security_observer HDF shards for the listed seeds into one dataframe.
# NOTE(review): start/stop are presumably forwarded to the HDF reader to limit
# the rows read from each shard -- confirm against data_loading.
seeds = [93, 9840, 8221]
df_ssa = data_loading.load_shards_and_concatenate(
    f'{usa_hdf_dir}/social_security_observer',
    '.hdf',
    seeds,
    start=0,
    stop=100,
)
df_ssa

CPU times: user 8.56 s, sys: 408 ms, total: 8.97 s
Wall time: 9.08 s


Unnamed: 0,event_type,date_of_birth,ssn,simulant_id,event_date,last_name,first_name,middle_initial
0,creation,1919-12-04,214-56-6837,9840_219390,1919-12-04,Rogers,Marion,M
1,creation,1920-01-30,299-87-0024,9840_494513,1920-01-30,Vodden,Tommy,J
2,creation,1920-02-14,483-38-1957,9840_223673,1920-02-14,Schiffer,Marion,M
3,creation,1920-07-09,887-09-5982,9840_132695,1920-07-09,Gilbert,Michael,N
4,creation,1920-08-16,177-43-1462,9840_987002,1920-08-16,Hari,Ruth,H
...,...,...,...,...,...,...,...,...
295,creation,1921-10-13,885-07-2280,93_192637,1921-10-13,Pardue,Jean,E
296,creation,1921-10-13,017-89-2312,93_793070,1921-10-13,Chandler,John,M
297,creation,1921-10-15,699-09-0125,93_637237,1921-10-15,Jacques,Gertrude,P
298,creation,1921-10-17,063-79-2162,93_986128,1921-10-17,Gray,Thelma,D


In [28]:
# List the observer subdirectories available in the Rhode Island parquet output.
!ls -halt $rhode_island_par_dir

total 256K
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 10 11:30 tax_dependents_observer
drwxrwsr-x 9 rmudambi IHME-Simulationscience 3.5K Apr 10 11:10 .
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 10 11:10 tax_1040_observer
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr  9 21:10 tax_w2_observer
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr  9 19:17 wic_observer
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr  9 19:13 household_survey_observer_cps
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr  9 19:10 household_survey_observer_acs
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr  9 19:09 decennial_census_observer
drwxrwsr-x 3 rmudambi IHME-Simulationscience  512 Apr  9 18:48 ..


In [30]:
%%time
# Load the Rhode Island WIC parquet shards for the listed seeds.
# With ignore_index=True the displayed result has a single flat integer index.
seeds = [93, 9840, 8221]
df_wic = data_loading.load_shards_and_concatenate(
    f'{rhode_island_par_dir}/wic_observer',
    '.parquet',
    seeds,
    ignore_index=True,
)
df_wic

CPU times: user 1.12 s, sys: 38.6 ms, total: 1.16 s
Wall time: 1.16 s


Unnamed: 0,age,first_name,race_ethnicity,relation_to_household_head,simulant_id,unit_number,year,guardian_1,street_name,household_id,guardian_2,street_number,middle_initial,city,last_name,zipcode,sex,state,housing_type,date_of_birth
0,26,Katelynn,Latino,Reference person,93_25455,,2019,93_-1,rochester drive,93_10191,93_-1,2121,C,north providence,Salvador,02857,Female,RI,Standard,1992-03-19
1,29,Emily,White,Reference person,93_285143,,2019,93_-1,kingsdale ave,93_114625,93_-1,4505,M,pawtucket,Hitchcock,02865,Female,RI,Standard,1989-03-31
2,25,Brittany,White,Opp-sex spouse,93_442398,,2019,93_-1,e rosemont ln,93_177607,93_-1,14736,A,south kingstown,Doyle,02893,Female,RI,Standard,1993-09-24
3,33,Angela,White,Reference person,93_446556,,2019,93_-1,grove street,93_179324,93_-1,153,L,wst warwick,Moore,02818,Female,RI,Standard,1985-11-27
4,29,Noemi,White,Biological child,93_459409,,2019,93_-1,homer circle,93_184491,93_-1,1345-1353,C,warwick,Schumacher,02864,Female,RI,Standard,1989-03-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3171,0,Samuel,Asian,Grandchild,9840_1389306,,2041,9840_1314628,forest ave,9840_778954,9840_-1,12235,G,barrington,Chang,02915,Male,RI,Standard,2040-10-01
3172,0,Maddie,Latino,Other nonrelative,9840_1391442,,2041,9840_472521,beverly dr,9840_339441,9840_-1,11439,E,lincoln,Pace,02863,Female,RI,Standard,2040-11-08
3173,0,Hunter,White,Biological child,9840_1391863,,2041,9840_1089594,turtle shell dr,9840_236932,9840_-1,6207,Z,east providence,Steiner,02889,Male,RI,Standard,2040-11-29
3174,0,Ameera,Multiracial or Other,Other nonrelative,9840_1393912,flt 5,2041,9840_94970,roosevelt drive,9840_220288,9840_-1,3721,M,tiverton,Osborne,02806,Female,RI,Standard,2041-01-18


In [31]:
%%time
# Same load, but with ignore_index=False: the displayed result has a two-level
# index whose first level holds values matching the seeds (e.g. 93, 9840).
# Note that this overwrites the df_wic assigned by the ignore_index=True load.
seeds = [93, 9840, 8221]
df_wic = data_loading.load_shards_and_concatenate(
    f'{rhode_island_par_dir}/wic_observer',
    '.parquet',
    seeds,
    ignore_index=False,
)
df_wic

CPU times: user 1.08 s, sys: 74.9 ms, total: 1.15 s
Wall time: 1.15 s


Unnamed: 0,Unnamed: 1,age,first_name,race_ethnicity,relation_to_household_head,simulant_id,unit_number,year,guardian_1,street_name,household_id,guardian_2,street_number,middle_initial,city,last_name,zipcode,sex,state,housing_type,date_of_birth
93,86,26,Katelynn,Latino,Reference person,93_25455,,2019,93_-1,rochester drive,93_10191,93_-1,2121,C,north providence,Salvador,02857,Female,RI,Standard,1992-03-19
93,1084,29,Emily,White,Reference person,93_285143,,2019,93_-1,kingsdale ave,93_114625,93_-1,4505,M,pawtucket,Hitchcock,02865,Female,RI,Standard,1989-03-31
93,1678,25,Brittany,White,Opp-sex spouse,93_442398,,2019,93_-1,e rosemont ln,93_177607,93_-1,14736,A,south kingstown,Doyle,02893,Female,RI,Standard,1993-09-24
93,1701,33,Angela,White,Reference person,93_446556,,2019,93_-1,grove street,93_179324,93_-1,153,L,wst warwick,Moore,02818,Female,RI,Standard,1985-11-27
93,1754,29,Noemi,White,Biological child,93_459409,,2019,93_-1,homer circle,93_184491,93_-1,1345-1353,C,warwick,Schumacher,02864,Female,RI,Standard,1989-03-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9840,340856,0,Samuel,Asian,Grandchild,9840_1389306,,2041,9840_1314628,forest ave,9840_778954,9840_-1,12235,G,barrington,Chang,02915,Male,RI,Standard,2040-10-01
9840,341308,0,Maddie,Latino,Other nonrelative,9840_1391442,,2041,9840_472521,beverly dr,9840_339441,9840_-1,11439,E,lincoln,Pace,02863,Female,RI,Standard,2040-11-08
9840,341468,0,Hunter,White,Biological child,9840_1391863,,2041,9840_1089594,turtle shell dr,9840_236932,9840_-1,6207,Z,east providence,Steiner,02889,Male,RI,Standard,2040-11-29
9840,341900,0,Ameera,Multiracial or Other,Other nonrelative,9840_1393912,flt 5,2041,9840_94970,roosevelt drive,9840_220288,9840_-1,3721,M,tiverton,Osborne,02806,Female,RI,Standard,2041-01-18


In [32]:
%%time
# Load only WIC rows for 2020 using a query string.
# NOTE(review): filter_query looks like a DataFrame.query-style expression,
# presumably applied to each shard after it is read -- confirm in data_loading.
seeds = [93, 9840, 8221]
df_wic_2020 = data_loading.load_shards_and_concatenate(
    f'{rhode_island_par_dir}/wic_observer',
    '.parquet',
    seeds,
    filter_query="year==2020",
)
df_wic_2020

CPU times: user 1.77 s, sys: 41 ms, total: 1.82 s
Wall time: 1.82 s


Unnamed: 0,age,first_name,race_ethnicity,relation_to_household_head,simulant_id,unit_number,year,guardian_1,street_name,household_id,guardian_2,street_number,middle_initial,city,last_name,zipcode,sex,state,housing_type,date_of_birth
0,32,Felicia,Latino,Reference person,93_290266,,2020,93_-1,north c street,93_116659,93_-1,19322,J,cranston,Valles,02903,Female,RI,Standard,1987-03-25
1,35,Natalie,White,Opp-sex partner,93_318116,,2020,93_-1,pleasant street,93_127859,93_-1,12,R,providence,Woody,02886,Female,RI,Standard,1984-10-11
2,25,Emily,White,Reference person,93_368789,,2020,93_-1,w sunset blvd,93_148138,93_-1,3333,P,coventry,Grebe,02903,Female,RI,Standard,1994-11-20
3,33,Nicole,White,Reference person,93_417764,,2020,93_-1,cherokee st,93_167780,93_-1,32558,S,portsmouth,Roberts,02861,Female,RI,Standard,1986-06-22
4,34,Jamie,White,Opp-sex spouse,93_457001,,2020,93_-1,sr 20 w,93_183516,93_-1,225,K,coventry,Campbell,02835,Female,RI,Standard,1985-12-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,0,Kalani,Latino,Biological child,9840_1011274,,2020,9840_182088,lemon ave,9840_73213,9840_-1,,A,warwick,Fuentes,02906,Female,RI,Standard,2019-08-28
132,0,Nathaniel,White,Biological child,9840_1013021,2nd fl aptmnt # 2017,2020,9840_627276,asbury park,9840_251997,9840_-1,1286,D,coventry,Smith,02920,Male,RI,Standard,2019-09-23
133,0,Noah,White,Biological child,9840_1013852,,2020,9840_126644,jason street,9840_50995,9840_-1,21,O,sth kingstown,Dickson,02905,Male,RI,Standard,2019-10-27
134,0,Ares,Latino,Grandchild,9840_1013968,,2020,9840_256240,prospect st,9840_102949,9840_-1,10543,N,smithfield,Davis,02908,Male,RI,Standard,2019-10-27


In [33]:
%%time
# Load only WIC rows for 2020 using a list of (column, op, value) tuples.
# NOTE(review): this looks like the parquet reader's `filters` argument,
# presumably passed through so filtering happens while reading -- confirm.
seeds = [93, 9840, 8221]
df_wic_2020_filtered = data_loading.load_shards_and_concatenate(
    f'{rhode_island_par_dir}/wic_observer',
    '.parquet',
    seeds,
    filters=[('year', '=', 2020)],
)
df_wic_2020_filtered

CPU times: user 1.76 s, sys: 74.5 ms, total: 1.83 s
Wall time: 1.88 s


Unnamed: 0,age,first_name,race_ethnicity,relation_to_household_head,simulant_id,unit_number,year,guardian_1,street_name,household_id,guardian_2,street_number,middle_initial,city,last_name,zipcode,sex,state,housing_type,date_of_birth
0,32,Felicia,Latino,Reference person,93_290266,,2020,93_-1,north c street,93_116659,93_-1,19322,J,cranston,Valles,02903,Female,RI,Standard,1987-03-25
1,35,Natalie,White,Opp-sex partner,93_318116,,2020,93_-1,pleasant street,93_127859,93_-1,12,R,providence,Woody,02886,Female,RI,Standard,1984-10-11
2,25,Emily,White,Reference person,93_368789,,2020,93_-1,w sunset blvd,93_148138,93_-1,3333,P,coventry,Grebe,02903,Female,RI,Standard,1994-11-20
3,33,Nicole,White,Reference person,93_417764,,2020,93_-1,cherokee st,93_167780,93_-1,32558,S,portsmouth,Roberts,02861,Female,RI,Standard,1986-06-22
4,34,Jamie,White,Opp-sex spouse,93_457001,,2020,93_-1,sr 20 w,93_183516,93_-1,225,K,coventry,Campbell,02835,Female,RI,Standard,1985-12-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,0,Kalani,Latino,Biological child,9840_1011274,,2020,9840_182088,lemon ave,9840_73213,9840_-1,,A,warwick,Fuentes,02906,Female,RI,Standard,2019-08-28
132,0,Nathaniel,White,Biological child,9840_1013021,2nd fl aptmnt # 2017,2020,9840_627276,asbury park,9840_251997,9840_-1,1286,D,coventry,Smith,02920,Male,RI,Standard,2019-09-23
133,0,Noah,White,Biological child,9840_1013852,,2020,9840_126644,jason street,9840_50995,9840_-1,21,O,sth kingstown,Dickson,02905,Male,RI,Standard,2019-10-27
134,0,Ares,Latino,Grandchild,9840_1013968,,2020,9840_256240,prospect st,9840_102949,9840_-1,10543,N,smithfield,Davis,02908,Male,RI,Standard,2019-10-27


In [34]:
# Check that the `filters` and `filter_query` approaches produce identical dataframes.
df_wic_2020_filtered.equals(df_wic_2020)

True

# I can't find good documentation of the syntax for `where` queries when reading HDF with pandas

Here are the most helpful links I found so far:

- https://pandas.pydata.org/docs/reference/api/pandas.read_hdf.html
- https://pandas.pydata.org/docs/user_guide/io.html#io-hdf5
- https://stackoverflow.com/questions/30483754/pandas-get-specific-rows-from-hdf5-by-index
- https://stackoverflow.com/questions/28754265/pandas-read-hdf-with-where-condition-limitation
- https://pandas.pydata.org/docs/user_guide/cookbook.html#cookbook-hdf

When I try to filter by year by passing `where=["year=2020"]`, I get this:

```
ValueError: The passed where expression: [year=2020]
            contains an invalid variable reference
            all of the variable references must be a reference to
            an axis (e.g. 'index' or 'columns'), or a data_column
            The currently defined references are: index,columns
```
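I can filter by index value or select columns by name, but filtering on the *values* in a column only works for columns that were declared as data columns (the `data_columns` argument of `to_hdf` / `HDFStore.append`, with `format='table'`) when the HDF file was written. As the error message shows, these files define only `index` and `columns` as references, so `year` cannot be used in a `where` expression.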

In [59]:
%%time
# Load the Rhode Island WIC HDF shards, restricting rows with a `where` clause on the index.
# NOTE(review): despite the variable name, this is NOT filtered to year 2020 --
# the year filter fails (see the commented-out line), so only `index<5000` is applied.
# NOTE(review): the zipcode values displayed for this HDF load lack the leading
# zero shown in the parquet loads (2857 vs 02857) -- worth checking the stored dtype.
seeds = [93, 9840, 8221]
df_wic_2020_hdf = data_loading.load_shards_and_concatenate(
    f'{rhode_island_hdf_dir}/wic_observer',
    '.hdf',
    seeds,
#     where=["year=2020"], # Fails with ValueError: The passed where expression: [year=2020]
#                          #                        contains an invalid variable reference
    where=["index<5000"],
    ignore_index=False,
)
df_wic_2020_hdf

CPU times: user 2.57 s, sys: 51 ms, total: 2.62 s
Wall time: 3.04 s


Unnamed: 0,Unnamed: 1,city,middle_initial,guardian_2,sex,year,street_name,last_name,zipcode,simulant_id,date_of_birth,age,guardian_1,unit_number,race_ethnicity,first_name,housing_type,street_number,relation_to_household_head,state,household_id
93,86,north providence,C,93_-1,Female,2019,rochester drive,Salvador,2857,93_25455,1992-03-19,26,93_-1,,Latino,Katelynn,Standard,2121,Reference person,RI,93_10191
93,1084,pawtucket,M,93_-1,Female,2019,kingsdale ave,Hitchcock,2865,93_285143,1989-03-31,29,93_-1,,White,Emily,Standard,4505,Reference person,RI,93_114625
93,1678,south kingstown,A,93_-1,Female,2019,e rosemont ln,Doyle,2893,93_442398,1993-09-24,25,93_-1,,White,Brittany,Standard,14736,Opp-sex spouse,RI,93_177607
93,1701,wst warwick,L,93_-1,Female,2019,grove street,Moore,2818,93_446556,1985-11-27,33,93_-1,,White,Angela,Standard,153,Reference person,RI,93_179324
93,1754,warwick,C,93_-1,Female,2019,homer circle,Schumacher,2864,93_459409,1989-03-18,29,93_-1,,White,Noemi,Standard,1345-1353,Biological child,RI,93_184491
93,1939,east providence,A,93_-1,Female,2019,sweet bay dr,Marks,2910,93_514171,1976-03-19,42,93_-1,,White,Rebecca,Standard,17335,Opp-sex spouse,RI,93_206638
93,2007,johnston,C,93_-1,Female,2019,s northern blvd,Nichols,2816,93_534358,1990-03-22,28,93_-1,,White,Jasmine,Standard,3075,Reference person,RI,93_214784
93,2523,hopkinton,A,93_-1,Female,2019,clstr ridge trl,Valladares,2895,93_674341,1990-03-21,28,93_-1,,Latino,Chelsea,Standard,409,Reference person,RI,93_270974
93,2775,providence,N,93_-1,Female,2019,biddle lane,Lopez,2908,93_750831,1992-12-10,26,93_-1,,Latino,Kari,Standard,112,Reference person,RI,93_301757
93,3072,providence,S,93_-1,Female,2019,w marcus rd,Berkey,2910,93_834264,1978-12-27,40,93_-1,,White,Shaneka,Standard,19140,Opp-sex spouse,RI,93_335463


In [62]:
%%time
# A `where` clause on `columns` selects a subset of columns from the HDF shards;
# the displayed result contains only the three columns named here.
seeds = [93, 9840, 8221]
data_loading.load_shards_and_concatenate(
    f'{rhode_island_hdf_dir}/wic_observer',
    '.hdf',
    seeds,
    where=["columns = ['city', 'middle_initial', 'date_of_birth']"],
    ignore_index=False,
)

CPU times: user 2.26 s, sys: 9.02 ms, total: 2.27 s
Wall time: 2.65 s


Unnamed: 0,Unnamed: 1,city,middle_initial,date_of_birth
93,86,north providence,C,1992-03-19
93,1084,pawtucket,M,1989-03-31
93,1678,south kingstown,A,1993-09-24
93,1701,wst warwick,L,1985-11-27
93,1754,warwick,C,1989-03-18
...,...,...,...,...
8221,342014,smithfield,K,2040-12-25
8221,342084,providence,C,2041-01-19
8221,342250,pawtucket,N,2041-01-05
8221,342329,warwick,W,2041-01-12


# Test stuff with categoricals

In [63]:
# Object-dtype Series that mixes integers with look-alike strings (2 vs '2')
# plus one non-numeric string.
s = pd.Series(data=[1, 2, 3] + ['2', '1', 'a'])
s

0    1
1    2
2    3
3    2
4    1
5    a
dtype: object

In [65]:
# Convert to categorical: the int 1 and the string '1' stay distinct categories,
# so the result has six categories.
s_cat = s.astype('category')
s_cat

0    1
1    2
2    3
3    2
4    1
5    a
dtype: category
Categories (6, object): [1, 2, 3, '1', '2', 'a']

In [67]:
# Stringifying the categories produces duplicate labels ('1' and '2' each appear twice).
s_cat.cat.categories.astype(str)

Index(['1', '2', '3', '1', '2', 'a'], dtype='object')

In [71]:
str_cats = s_cat.cat.categories.astype(str)
# Renaming fails because the stringified labels contain duplicates ('1', '2').
# Catch and print the error so this demonstration does not stop a
# "Restart & Run All" of the notebook.
try:
    display(s_cat.cat.rename_categories(str_cats))
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Categorical categories must be unique

In [69]:
# De-duplicating the stringified categories leaves four labels.
s_cat.cat.categories.astype(str).unique()

Index(['1', '2', '3', 'a'], dtype='object')

In [72]:
str_cats = s_cat.cat.categories.astype(str).unique()
# Renaming still fails: there are now four new labels for six existing
# categories. Catch and print the error so this demonstration does not stop a
# "Restart & Run All" of the notebook.
try:
    display(s_cat.cat.rename_categories(str_cats))
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: new categories need to have the same number of items as the old categories!

In [73]:
str_cats = s_cat.cat.categories.astype(str).unique()
# set_categories does not raise, but values whose old category is not among the
# new labels (the ints 1, 2, 3) become NaN instead of being converted to strings.
s_cat.cat.set_categories(str_cats)

0    NaN
1    NaN
2    NaN
3      2
4      1
5      a
dtype: category
Categories (4, object): ['1', '2', '3', 'a']

In [74]:
# s_cat itself still has its original six mixed-type categories.
s_cat.cat.categories

Index([1, 2, 3, '1', '2', 'a'], dtype='object')

In [76]:
# Pair every existing category with its string form; 1 and '1' both map to '1'.
m = {
    old_label: str_label
    for old_label, str_label in zip(s_cat.cat.categories, s_cat.cat.categories.astype(str))
}
m

{1: '1', 2: '2', 3: '3', '1': '1', '2': '2', 'a': 'a'}

In [92]:
# Approach 1: map each value through `m`, then re-cast to category. This merges
# 1 with '1' and 2 with '2', giving four string categories.
# Idea adapted from:
# https://stackoverflow.com/questions/32262982/pandas-combining-multiple-categories-into-one
# df.cat.map(m2).astype("category", categories=set(m2.values()))
s_cat.map(m).astype("category")

0    1
1    2
2    3
3    2
4    1
5    a
dtype: category
Categories (4, object): ['1', '2', '3', 'a']

In [93]:
# Approach 2: cast to str, then back to category; the displayed result is the
# same four string categories as the mapping approach.
s_cat.astype(str).astype('category')

0    1
1    2
2    3
3    2
4    1
5    a
dtype: category
Categories (4, object): ['1', '2', '3', 'a']

In [94]:
# Time both approaches; in the recorded run the astype(str) version took roughly
# half as long on this six-element series.
%timeit s_cat.map(m).astype("category")
%timeit s_cat.astype(str).astype('category')

738 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
351 µs ± 3.93 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [105]:
# Try the project helper on a one-column frame built from s_cat.
# The df[0] displayed after the call has four string categories, so the helper
# appears to modify `df` in place -- confirm against datatypes.convert_category_dtype.
# NOTE(review): the recorded output also shows a printed
# ('Categorical categories must be unique',) tuple, presumably emitted by the helper.
df = pd.DataFrame(s_cat)
datatypes.convert_category_dtype(df, str)
df[0]

('Categorical categories must be unique',)


0    1
1    2
2    3
3    2
4    1
5    a
Name: 0, dtype: category
Categories (4, object): ['1', '2', '3', 'a']

In [108]:
# One-column numeric frame with missing values (float64 because of the NaNs).
df = pd.DataFrame(np.array([1, np.nan, 4, 6, np.nan, 1, 6]))
df[0]

0    1.0
1    NaN
2    4.0
3    6.0
4    NaN
5    1.0
6    6.0
Name: 0, dtype: float64

In [114]:
# Categorical copy of the float frame: NaN stays missing and is not a category
# (three float categories in the output).
df2 = df.astype('category')
df2[0]

  output = repr(obj)


0    1.0
1    NaN
2    4.0
3    6.0
4    NaN
5    1.0
6    6.0
Name: 0, dtype: category
Categories (3, float64): [1.0, 4.0, 6.0]

In [115]:
# Convert the float categories to strings with the project helper; in the
# recorded output the categories become '1.0', '4.0', '6.0' and NaN stays missing.
datatypes.convert_category_dtype(df2, str)
df2[0]

0    1.0
1    NaN
2    4.0
3    6.0
4    NaN
5    1.0
6    6.0
Name: 0, dtype: category
Categories (3, object): ['1.0', '4.0', '6.0']

In [116]:
# A plain astype(str) does not handle NaNs the way I want: missing values
# become the literal string 'nan' instead of staying missing.
df3 = df.astype(str)
df3[0]

0    1.0
1    nan
2    4.0
3    6.0
4    nan
5    1.0
6    6.0
Name: 0, dtype: object

In [119]:
# Same problem when casting the categorical column: NaN turns into the string 'nan'.
df2[0].astype(str)

0    1.0
1    nan
2    4.0
3    6.0
4    nan
5    1.0
6    6.0
Name: 0, dtype: object

In [121]:
# One-column object frame combining ints, look-alike strings ('6', '4') and NaNs.
df = pd.DataFrame(np.array([1, np.nan, 6, 4, '6', np.nan, 1, 6, '4'], dtype=object))
df[0]

0      1
1    NaN
2      6
3      4
4      6
5    NaN
6      1
7      6
8      4
Name: 0, dtype: object

In [123]:
# Categorical version of the mixed frame: five categories, with 4/'4' and 6/'6'
# kept distinct and NaN left as missing.
df_cat = df.astype('category')
df_cat[0]

0      1
1    NaN
2      6
3      4
4      6
5    NaN
6      1
7      6
8      4
Name: 0, dtype: category
Categories (5, object): [1, 4, 6, '4', '6']

In [125]:
# Convert categories to strings with the project helper. In the recorded output
# the missing values have become a literal 'nan' category, which is not wanted.
datatypes.convert_category_dtype(df_cat, str)
df_cat[0]

0      1
1    nan
2      6
3      4
4      6
5    nan
6      1
7      6
8      4
Name: 0, dtype: category
Categories (4, object): ['1', '4', '6', 'nan']

In [None]:
# Try fixing NaNs and run again

In [127]:
# Rebuild the categorical frame and convert again. The recorded output now keeps
# NaN as missing with only three string categories.
# NOTE(review): presumably run after editing convert_category_dtype (picked up
# via autoreload) -- the notebook itself does not show the change.
df_cat = df.astype('category')
datatypes.convert_category_dtype(df_cat, str)
df_cat[0]

0      1
1    NaN
2      6
3      4
4      6
5    NaN
6      1
7      6
8      4
Name: 0, dtype: category
Categories (3, object): ['1', '4', '6']

In [131]:
# Re-display the mixed int/str categorical series used in the experiments below.
s_cat

0    1
1    2
2    3
3    2
4    1
5    a
dtype: category
Categories (6, object): [1, 2, 3, '1', '2', 'a']

In [133]:
# Old-category -> string-label mapping; distinct old categories such as 1 and '1'
# share the same target label.
cat_map = {
    old_label: str_label
    for old_label, str_label in zip(s_cat.cat.categories, s_cat.cat.categories.astype(str))
}
cat_map

{1: '1', 2: '2', 3: '3', '1': '1', '2': '2', 'a': 'a'}

In [135]:
# Translate the existing categories through cat_map; the result contains
# duplicate labels because several old categories share a target.
old_categories = s_cat.cat.categories
new_categories = old_categories.map(cat_map)
new_categories

Index(['1', '2', '3', '1', '2', 'a'], dtype='object')

In [136]:
# The union keeps the duplicated labels from new_categories ('1' and '2' appear twice).
old_categories.union(new_categories)

Index([1, 2, 3, '1', '1', '2', '2', '3', 'a'], dtype='object')

In [137]:
# De-duplicating new_categories first gives a union with unique labels.
old_categories.union(new_categories.unique())

Index([1, 2, 3, '1', '2', '3', 'a'], dtype='object')

In [139]:
# Run the project helper with pandas copy-on-write enabled for the duration of the call.
# NOTE(review): the recorded result has dtype object rather than category --
# check whether merge_categories is meant to return a categorical.
with pd.option_context("mode.copy_on_write", True):
    s_new = datatypes.merge_categories(s_cat, cat_map)
s_new

0    1
1    2
2    3
3    2
4    1
5    a
dtype: object

In [140]:
# Display s_cat again after the merge_categories call; the recorded output shows
# it still has its original six categories.
s_cat

0    1
1    2
2    3
3    2
4    1
5    a
dtype: category
Categories (6, object): [1, 2, 3, '1', '2', 'a']