In [1]:
import numpy as np
import pandas as pd

# Validated data

In [2]:
# Load the validated neonate records. The path is relative, so it resolves
# against the directory the notebook kernel is running in.
neonate_validated = pd.read_csv('../../smartva/data/validated-neonate.csv')

In [3]:
# Preview the first rows to eyeball the columns that were loaded
neonate_validated.head()

Unnamed: 0,sid,site,gs_code34,gs_text34,gs_code46,gs_text46,gs_code55,gs_text55,gs_comorbid1,gs_comorbid2,...,s999928,s999929,s999930,s999931,s999932,s999933,s999934,s999935,s999936,s999937
0,A-21360006,AP,Z37,Stillbirth,Z37,Stillbirth,Z37,Stillbirth,,,...,0,0,0,0,0,0,0,0,0,0
1,A-21360451,AP,Z37,Stillbirth,Z37,Stillbirth,Z37,Stillbirth,,,...,0,0,0,0,0,0,0,0,0,0
2,A-21360452,AP,Z37,Stillbirth,Z37,Stillbirth,Z37,Stillbirth,,,...,0,0,0,0,0,0,0,0,0,0
3,A-21360453,AP,Z37,Stillbirth,Z37,Stillbirth,Z37,Stillbirth,,,...,0,0,0,0,0,0,0,0,0,0
4,A-21360454,AP,Z37,Stillbirth,Z37,Stillbirth,Z37,Stillbirth,,,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Tally the rows carrying each distinct gs_text34 label; value_counts
# returns the labels ordered from most to least frequent.
counts_by_cause = neonate_validated['gs_text34'].value_counts()
counts_by_cause

Stillbirth                 1005
Birth asphyxia              551
Preterm Delivery            461
Congenital malformation     249
Meningitis/Sepsis           242
Pneumonia                    83
Name: gs_text34, dtype: int64

# Up-Sample
Observations are sampled such that all the original observations have a frequency of at least 1. This ensures that any rare combination of endorsements occurs in the training data. Each cause is then up-sampled (with replacement) to a frequency equal to that of the most frequent cause.

In [5]:
# Fixed seed so the resampling below is repeatable from run to run
rs = np.random.RandomState(777)

# Upsampling only ever adds rows: every label that currently exists in the
# frame is kept once, and extra draws are appended per cause until each cause
# is as large as the most frequent one.
target_count = counts_by_cause.max()
row_labels = neonate_validated.index.values
extra_draws = []

for cause in counts_by_cause.index:
    in_cause = (neonate_validated.gs_text34 == cause).values
    shortfall = target_count - counts_by_cause.loc[cause]
    # choice samples with replacement by default; the largest cause has a
    # shortfall of zero and contributes an empty draw.
    extra_draws.append(rs.choice(row_labels[in_cause], shortfall))

new_index = np.concatenate([row_labels] + extra_draws)

In [6]:
# Build the balanced frame by label lookup: a label that appears several
# times in new_index yields that row several times.
neonate_uniform = neonate_validated.loc[new_index]

# Every cause should now report the same count
neonate_uniform['gs_text34'].value_counts()

Pneumonia                  1005
Birth asphyxia             1005
Stillbirth                 1005
Congenital malformation    1005
Preterm Delivery           1005
Meningitis/Sepsis          1005
Name: gs_text34, dtype: int64

# Frequencies Dict
In an effort to make a clean diff, I'm going to list the frequencies in the order in which they currently appear in the tariff data file. Since the dict will be unordered once imported, I'm going to read the Python file as plain text.

In [7]:
# How many times each sid occurs in the upsampled frame (at least once,
# since every original row is retained)
freqs = neonate_uniform['sid'].value_counts()
freqs.head()

D-5000598001    21
B-3197          18
P-G4206         18
D-5000838002    17
B-10369         17
Name: sid, dtype: int64

In [8]:
# Collect the sids in the order they are listed inside the FREQUENCIES dict
# of the tariff data module. The module is parsed as plain text (not
# imported) so the on-disk ordering of the entries is preserved.
sids = []
with open('../../smartva/data/neonate_tariff_data.py', 'r') as f:
    # Skip ahead to the line that opens the dict literal. Trailing whitespace
    # is ignored so an otherwise valid file is not rejected.
    for line in f:
        if line.rstrip() == 'FREQUENCIES = {':
            break
    else:
        # The loop ran off the end of the file without finding the marker.
        # Fail loudly: otherwise `sids` stays empty, the later checks pass
        # vacuously, and an empty frequencies file gets written.
        raise ValueError(
            "Could not find 'FREQUENCIES = {' in neonate_tariff_data.py")

    # Continue from the line after the marker. Each entry line looks like
    # `    "sid": count,`; stop at the closing brace, which may or may not be
    # followed by a newline.
    for line in f:
        if line.strip() == '}':
            break
        sid = line.strip().split(': ')[0].strip('"')
        sids.append(sid)

In [9]:
# Spot-check the first few sids parsed from the tariff file
sids[:5]

['U-784', 'U-785', 'P-F9014', 'B-609', 'U-780']

In [10]:
# Every sid listed in the tariff file must have a frequency after
# upsampling. `in` on a Series tests membership in its index, which here
# holds the sids. On failure the message is the first missing sid.
missing_sids = [listed for listed in sids if listed not in freqs]
assert not missing_sids, '{}'.format(missing_sids[0])

In [11]:
# Write one `    "sid": count,` line per sid, in the same order as the
# tariff file, so the result can be pasted over the existing dict body.
entry_template = '    "{}": {},\n'
with open('new_neonate_freqs.txt', 'w') as f:
    f.writelines(entry_template.format(sid, freqs.loc[sid]) for sid in sids)