# Extract Labels

- Pull out labels and save them for loading later

In [136]:
# Reload the `dataset` variable from IPython's %store persistence; it must have been
# saved with `%store dataset` in an earlier session or notebook.
%store -r dataset

# NOTE(review): keeps only the first 10 rows, so every later cell in this notebook works
# on a 10-row sample. This looks like a debugging shortcut -- confirm before relying on
# the stored outputs for the full data.
dataset = dataset[:10]

# Summarize columns, non-null counts and dtypes of the (truncated) frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 25 columns):
abstract                   10 non-null object
pmid                       10 non-null object
allocation                 7 non-null category
endpoint_classification    8 non-null category
intervention_model         10 non-null category
masking                    10 non-null category
primary_purpose            10 non-null category
condition                  10 non-null object
gender                     10 non-null category
healthy_volunteers         10 non-null category
maximum_age                10 non-null object
minimum_age                10 non-null object
phase                      10 non-null category
study_type                 10 non-null category
primary_outcome_measure    10 non-null category
ec_bio                     10 non-null category
ec_pharmaco                10 non-null category
ec_efficacy_study          10 non-null category
ec_safety_study            10 non-null categor

### Select all of the Labels Suitable for Prediction

In [137]:
# Columns kept as prediction targets. Entries that are commented out are excluded;
# uncomment one to include it.
# Note: `dataset` is rebound to the selected columns below, so after this cell has run
# once, newly enabled columns are only available after reloading `dataset`.
labels = [
    # 'allocation',
    # 'ec_efficacy_study',
    # 'ec_safety_study',
    # 'intervention_model',
    # 'masking',
    # 'primary_purpose',
    'gender',
    # 'healthy_volunteers',
    # 'phase_NA',
    # 'phase_1',
    'phase_2',
    # 'phase_3',
    # 'phase_4',
]

# Keep every row, but only the selected label columns.
dataset = dataset.loc[:, labels]

dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 2 columns):
gender     10 non-null category
phase_2    10 non-null category
dtypes: category(2)
memory usage: 140.0 bytes


### Binarize the Data (Encode Each Label as Integer Category Codes)

In [138]:
# Replace each selected label column with its integer category codes.
# Codes follow the order of the column's categories, so a column with more than two
# categories yields values beyond 0/1, and pandas encodes missing values as -1.
encoded_columns = {}
for label in labels:
    encoded_columns[label] = dataset[label].cat.codes

# `assign` returns a new frame, leaving `dataset` itself untouched.
binarized_dataset = dataset.assign(**encoded_columns)

binarized_dataset

Unnamed: 0,gender,phase_2
0,0,0
1,0,0
2,0,1
3,0,0
4,0,1
5,0,1
6,0,0
7,1,0
8,1,0
9,1,1


### Save It!

In [143]:
# Expose the label-only (not integer-encoded) frame under its own name for storing.
# This binds a second name to the same DataFrame object; no copy is made.
pruned_dataset = dataset

# Persist both frames with IPython's %store so they can be reloaded elsewhere
# with `%store -r`.
%store pruned_dataset
%store binarized_dataset

Stored 'pruned_dataset' (DataFrame)
Stored 'binarized_dataset' (DataFrame)


### Take Only a Minimal Subset of the Dataset

This is useful if we want to debug a minimal subset of the data which contains at least one instance of each class. If you're not interested in excluding some of the data, then skip this cell.

In [14]:
def examples_generator(dataset, target='gender', num_examples=None):
    """Yield index labels of `dataset` rows, taking the first rows of every class.

    Classes are the distinct values of ``dataset[target]``, visited in order of first
    appearance. For each class the index labels of its first `num_examples` rows
    (in frame order) are yielded; a class with fewer rows contributes all of them.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the `target` column.
    target : str
        Name of the column holding the class labels.
    num_examples : int or None
        Maximum number of rows to take per class. When falsy (``None`` or ``0``),
        the size of the smallest class actually present in `target` is used, which
        gives perfect class balance.

    Yields
    ------
    Index labels usable with ``dataset.loc[...]``. Nothing is yielded when the
    target column holds no non-missing values.
    """
    column = dataset[target]

    if not num_examples:
        # Count rows per class on the *requested* column (this used to be hard-coded
        # to 'gender', which broke every other target).
        class_sizes = column.value_counts()
        # Categorical columns also report declared-but-unused categories with a count
        # of 0; ignoring them keeps the minimum from collapsing to zero.
        class_sizes = class_sizes[class_sizes > 0]
        if class_sizes.empty:
            return
        # Size of the rarest observed class -> perfect class balance.
        num_examples = int(class_sizes.min())

    for label in column.unique():
        # Only the index labels are needed, so avoid building a Series per row.
        for idx in dataset[column == label][:num_examples].index:
            yield idx

# Rebind `dataset` to the selected rows: at most 50 rows per class of the default
# target column ('gender'). A class with fewer than 50 rows contributes all of its
# rows, so this explicit cap does not by itself guarantee balanced classes.
dataset = dataset.loc[list(examples_generator(dataset, num_examples=50))]

# Show the resulting number of rows per gender class.
dataset.groupby('gender').size()

gender
Both      50
Female    50
Male      50
dtype: int64