In [4]:
import pandas as pd
import numpy as np
import sys
import os
import math
from sklearn.model_selection import train_test_split

# Add the directory named by the CMS_ROOT environment variable to the import
# search path (raises KeyError if the variable is not set).
sys.path += [os.environ['CMS_ROOT']]

# Let pandas render up to 150 columns and 150 rows before truncating output.
pd.options.display.max_columns = 150
pd.options.display.max_rows = 150

### Definitions

In [5]:
# Shared data (raw).
# The directory can be overridden without editing the notebook by setting the
# MEDICARE_DATA_DIR environment variable, e.g. to
# '/home/groups/fau-bigdata-datasets/medicare/combined'. When the variable is
# unset, the original local path is used, so existing runs behave as before.
shared_medicare_dir = os.environ.get(
    'MEDICARE_DATA_DIR',
    '/Users/jujohnson/cms-data/aggregated',
)
csv_file = '20190814_NPI-level_2013_to_2017_Medicare_COMBINED_aggregated_with_LEIE_one-hot_encoding.csv'

In [6]:
# Path of the HDF5 file (relative to the notebook's working directory).
hdf5_file = '../data/combined-minmax-scaled.hdf5'

# Keys used to address the individual datasets stored in that file.
raw_key, train_key, test_key = 'raw', 'train_normalized', 'test_normalized'

### Load Training Data

In [7]:
# Read the dataset stored under `train_key` from the HDF5 file.
data = pd.read_hdf(hdf5_file, key=train_key)

In [8]:
# Count the rows of each class by summing boolean masks over the 'class'
# column (1 = positive, 0 = negative); int() keeps the results plain Python ints.
pos_count = int((data['class'] == 1).sum())
neg_count = int((data['class'] == 0).sum())
total = pos_count + neg_count
print(pos_count, neg_count)

5317 2616175


In [9]:
# Positive rows as a percentage of all counted rows.
minority_pct = pos_count / total * 100
print('Minority ratio:\n', minority_pct)

Minority ratio:
 0.202823430321359


### RUS (Random Undersampling) Rates

In [10]:
# Target positive-class shares, written as percentages (each value is divided
# by 100 before it is passed to the rate function).
positive_ratios = [
    0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1,
    5, 10, 20, 30, 40, 50, 60, 70, 80, 90,
]

In [11]:
# Recount the positive (class == 1) and negative (class == 0) rows; these
# notebook-level values are what get_rus_rate reads.
pos_count = int((data['class'] == 1).sum())
neg_count = int((data['class'] == 0).sum())

def get_rus_rate(pos_ratio, n_pos=None, n_neg=None):
    """Return the fraction of negative rows to keep so positives reach `pos_ratio`.

    Keeping ``rate * n_neg`` negatives together with all ``n_pos`` positives
    yields a positive share of ``n_pos / (n_pos + rate * n_neg) == pos_ratio``.
    A result above 1 means the requested share is lower than the share the
    data already has, so removing negatives alone cannot reach it.

    Parameters
    ----------
    pos_ratio : float
        Desired positive-class share as a fraction in [0, 1], not a percentage
        (pass ``5 / 100`` for 5%).
    n_pos, n_neg : number, optional
        Positive and negative row counts. When omitted, the notebook-level
        ``pos_count`` / ``neg_count`` variables are used, which is the
        original behaviour.

    Returns
    -------
    float
        ``(n_pos * (1 - pos_ratio)) / (pos_ratio * n_neg)``.

    Raises
    ------
    ValueError
        If `pos_ratio` is outside [0, 1]; the formula would otherwise silently
        return a negative rate (typically because a percentage was passed).
    ZeroDivisionError
        For plain Python numbers, when `pos_ratio` or the negative count is 0.
    """
    # np.asarray keeps the check valid for scalars and arrays alike.
    ratio = np.asarray(pos_ratio)
    if np.any((ratio < 0) | (ratio > 1)):
        raise ValueError(
            'pos_ratio must be a fraction between 0 and 1, got %r' % (pos_ratio,)
        )

    # Fall back to the notebook globals only when no explicit count is given.
    if n_pos is None:
        n_pos = pos_count
    if n_neg is None:
        n_neg = neg_count

    return (n_pos * (1 - pos_ratio)) / (pos_ratio * n_neg)

In [15]:
# positive_ratios holds percentages, so each one is turned into a fraction
# before its RUS rate is computed.
rus_rates = list(map(lambda pct: get_rus_rate(pct / 100), positive_ratios))

# Show every target percentage next to the rate computed for it.
[(pct, rate) for pct, rate in zip(positive_ratios, rus_rates)]

[(0.01, 20.3215316253691),
 (0.025, 8.127393236308732),
 (0.05, 4.062680439955279),
 (0.1, 2.0303240417785506),
 (0.25, 0.8109102028725144),
 (0.5, 0.4044389232371688),
 (0.75, 0.2689484966920536),
 (1, 0.201203283419496),
 (5, 0.03861477156535782),
 (10, 0.018291207583590548),
 (20, 0.00812942559270691),
 (30, 0.00474216492907903),
 (40, 0.003048534597265091),
 (50, 0.0020323563981767276),
 (60, 0.0013549042654511518),
 (70, 0.0008710098849328833),
 (80, 0.0005080890995441818),
 (90, 0.0002258173775751919)]

### ROS (Random Oversampling) Rates

In [16]:
# Target positive-class shares (percentages) for the ROS rates; this rebinds
# the `positive_ratios` name to a shorter list.
positive_ratios = [
    0.1, 0.5, 1,
    20, 40, 50, 60,
]

In [17]:
# Recount the positive (class == 1) and negative (class == 0) rows; these
# notebook-level values are what get_ros_rate reads.
pos_count = int((data['class'] == 1).sum())
neg_count = int((data['class'] == 0).sum())

def get_ros_rate(pos_ratio, n_pos=None, n_neg=None):
    """Return the factor by which the positive count must grow to reach `pos_ratio`.

    Growing the positives to ``rate * n_pos`` while keeping all ``n_neg``
    negatives yields a positive share of
    ``rate * n_pos / (rate * n_pos + n_neg) == pos_ratio``.
    A result below 1 means the requested share is lower than the share the
    data already has.

    Parameters
    ----------
    pos_ratio : float
        Desired positive-class share as a fraction in [0, 1], not a percentage
        (pass ``50 / 100`` for 50%).
    n_pos, n_neg : number, optional
        Positive and negative row counts. When omitted, the notebook-level
        ``pos_count`` / ``neg_count`` variables are used, which is the
        original behaviour.

    Returns
    -------
    float
        ``(pos_ratio * n_neg) / (n_pos * (1 - pos_ratio))``.

    Raises
    ------
    ValueError
        If `pos_ratio` is outside [0, 1]; the formula would otherwise silently
        return a negative rate (typically because a percentage was passed).
    ZeroDivisionError
        For plain Python numbers, when `pos_ratio` is 1 or the positive count
        is 0.
    """
    # np.asarray keeps the check valid for scalars and arrays alike.
    ratio = np.asarray(pos_ratio)
    if np.any((ratio < 0) | (ratio > 1)):
        raise ValueError(
            'pos_ratio must be a fraction between 0 and 1, got %r' % (pos_ratio,)
        )

    # Fall back to the notebook globals only when no explicit count is given.
    if n_pos is None:
        n_pos = pos_count
    if n_neg is None:
        n_neg = neg_count

    return (pos_ratio * n_neg) / (n_pos * (1 - pos_ratio))

In [18]:
# positive_ratios holds percentages, so each one is turned into a fraction
# before its ROS rate is computed.
ros_rates = list(map(lambda pct: get_ros_rate(pct / 100), positive_ratios))

# Display the computed rates.
ros_rates

[0.4925322162485977,
 2.4725612262932115,
 4.9700978185085765,
 123.00992100808726,
 328.02645602156605,
 492.03968403234904,
 738.0595260485236]

### ROS-RUS (Combined Oversampling and Undersampling) Rates

In [19]:
# Fractions of the negative class that remain after undersampling.
negative_ratios = [0.5, 0.25, 0.1]

pos_count = len(data.loc[data['class'] == 1])
# Count the negatives once; the count does not change between iterations.
full_neg_count = len(data.loc[data['class'] == 0])
ros_rates = []

# get_ros_rate reads the notebook-level `neg_count`, so it is temporarily set
# to the undersampled negative count for each ratio. The `finally` clause puts
# the full count back so that re-running the earlier RUS/ROS cells afterwards
# does not silently use 10% of the real negative count.
try:
    for neg_ratio in negative_ratios:
        neg_count = full_neg_count * neg_ratio
        # ROS rate needed to reach a 50% positive share on the reduced data.
        ros_rates.append(get_ros_rate(0.5))
finally:
    neg_count = full_neg_count

In [20]:
# Display the ROS rates collected for each entry of negative_ratios.
ros_rates

[246.01984201617452, 123.00992100808726, 49.20396840323491]