# Learning objectives
- Orient on the Oregon dataset
- Recode variables using .replace()
- Define ordered categorical variables
- Rename variables

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Make Google Drive available to the script
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Let's load the Oregon Health Insurance Experiment dataset
filename = 'drive/MyDrive/Colab Notebooks/Intro to Python for Epidemiologists/Data/OHIE_12m.csv'
OHIE = pd.read_csv(filename)
OHIE.head()

Unnamed: 0,person_id,household_id,treatment,draw_treat,draw_lottery,applied_app,approved_app,dt_notify_lottery,dt_retro_coverage,birthyear_list,...,live_partner_12m,live_parents_12m,live_friends_12m,live_relatives_12m,live_other_12m,hhsize_12m,PHQ2_1,PHQ2_2,PHQ2_sum,PHQ2_cutoff
0,64350,164350,Not selected,,Lottery Draw 6,,,2008-07-14,2008-08-08,1974,...,No,Yes,No,No,No,2.0,3.0,3.0,6.0,True
1,55655,155655,Not selected,,Lottery Draw 7,,,2008-08-12,2008-09-08,1987,...,Yes,No,No,No,No,2.0,1.0,1.0,2.0,False
2,20087,128134,Selected,Draw 6: selected in lottery 07/01/2008,Lottery Draw 6,Submitted an Application to OHP,No,2008-07-14,2008-08-08,1963,...,No,No,No,Yes,No,7.0,0.0,1.0,1.0,False
3,70998,170998,Not selected,,Lottery Draw 7,,,2008-08-12,2008-09-08,1954,...,Yes,No,No,No,No,2.0,3.0,2.0,5.0,True
4,8839,108839,Selected,Draw 8: selected in lottery 09/02/2008,Lottery Draw 8,Did NOT submit an application to OHP,No,2008-09-11,2008-10-08,1964,...,No,No,Yes,No,No,4.0,2.0,2.0,4.0,True


# Orient on a new dataset

In [4]:
OHIE.shape

(4000, 44)

In [5]:
OHIE.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 44 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   person_id            4000 non-null   int64  
 1   household_id         4000 non-null   int64  
 2   treatment            4000 non-null   object 
 3   draw_treat           2011 non-null   object 
 4   draw_lottery         4000 non-null   object 
 5   applied_app          2010 non-null   object 
 6   approved_app         2010 non-null   object 
 7   dt_notify_lottery    4000 non-null   object 
 8   dt_retro_coverage    4000 non-null   object 
 9   birthyear_list       4000 non-null   int64  
 10  female_list          4000 non-null   object 
 11  ins_any_12m          3939 non-null   object 
 12  weight_12m           4000 non-null   float64
 13  employ_12m           3868 non-null   object 
 14  edu_12m              3853 non-null   object 
 15  dep_sad_12m          3936 non-null   o

# Recode variables using .replace

In [7]:
# Treatment: recode this to 0 and 1
OHIE['treatment'].head(3)

Unnamed: 0,treatment
0,Not selected
1,Not selected
2,Selected


In [9]:
OHIE['treatment'] = OHIE['treatment'].replace({'Not selected':0, 'Selected':1}).astype(int)
OHIE['treatment'].head(3)

  OHIE['treatment'] = OHIE['treatment'].replace({'Not selected':0, 'Selected':1}).astype(int)


Unnamed: 0,treatment
0,0
1,0
2,1


# Ordered categorical variables

In [10]:
OHIE['edu_12m'].head()

Unnamed: 0,edu_12m
0,4-year degree
1,hs diploma or GED
2,hs diploma or GED
3,hs diploma or GED
4,hs diploma or GED


In [11]:
OHIE['edu_12m'].value_counts()

Unnamed: 0_level_0,count
edu_12m,Unnamed: 1_level_1
hs diploma or GED,1884
vocational or 2-year degree,855
less than hs,665
4-year degree,449


In [16]:
edu_order = ['less than hs','hs diploma or GED','vocational or 2-year degree','4-year degree']

In [17]:
OHIE['EducationOrdered'] = pd.Categorical(
    OHIE['edu_12m'],
    categories = edu_order,
    ordered = True
)
OHIE['EducationOrdered'].dtype

CategoricalDtype(categories=['less than hs', 'hs diploma or GED',
                  'vocational or 2-year degree', '4-year degree'],
, ordered=True, categories_dtype=object)

In [21]:
OHIE['EduNum'] = OHIE['EducationOrdered'].cat.codes

In [22]:
OHIE[['edu_12m','EducationOrdered','EduNum']].head(10)

Unnamed: 0,edu_12m,EducationOrdered,EduNum
0,4-year degree,4-year degree,3
1,hs diploma or GED,hs diploma or GED,1
2,hs diploma or GED,hs diploma or GED,1
3,hs diploma or GED,hs diploma or GED,1
4,hs diploma or GED,hs diploma or GED,1
5,hs diploma or GED,hs diploma or GED,1
6,hs diploma or GED,hs diploma or GED,1
7,4-year degree,4-year degree,3
8,less than hs,less than hs,0
9,hs diploma or GED,hs diploma or GED,1


# Rename columns

In [23]:
OHIE.columns

Index(['person_id', 'household_id', 'treatment', 'draw_treat', 'draw_lottery',
       'applied_app', 'approved_app', 'dt_notify_lottery', 'dt_retro_coverage',
       'birthyear_list', 'female_list', 'ins_any_12m', 'weight_12m',
       'employ_12m', 'edu_12m', 'dep_sad_12m', 'dep_interest_12m',
       'dep_rx_12m', 'smk_curr_12m', 'smk_ever_12m', 'race_white_12m',
       'race_black_12m', 'race_hisp_12m', 'race_asian_12m',
       'race_amerindian_12m', 'race_pacific_12m', 'race_other_qn_12m',
       'chl_chk_12m', 'dia_chk_12m', 'mam_chk_12m', 'pap_chk_12m',
       'hhinc_cat_12m', 'hhinc_pctfpl_12m', 'live_alone_12m',
       'live_partner_12m', 'live_parents_12m', 'live_friends_12m',
       'live_relatives_12m', 'live_other_12m', 'hhsize_12m', 'PHQ2_1',
       'PHQ2_2', 'PHQ2_sum', 'PHQ2_cutoff', 'EducationLevel',
       'EducationOrdered', 'EduNum'],
      dtype='object')

In [25]:
OHIE = OHIE.rename(columns = {'birthyear_list':'birthyear'})
OHIE.columns

Index(['person_id', 'household_id', 'treatment', 'draw_treat', 'draw_lottery',
       'applied_app', 'approved_app', 'dt_notify_lottery', 'dt_retro_coverage',
       'birthyear', 'female_list', 'ins_any_12m', 'weight_12m', 'employ_12m',
       'edu_12m', 'dep_sad_12m', 'dep_interest_12m', 'dep_rx_12m',
       'smk_curr_12m', 'smk_ever_12m', 'race_white_12m', 'race_black_12m',
       'race_hisp_12m', 'race_asian_12m', 'race_amerindian_12m',
       'race_pacific_12m', 'race_other_qn_12m', 'chl_chk_12m', 'dia_chk_12m',
       'mam_chk_12m', 'pap_chk_12m', 'hhinc_cat_12m', 'hhinc_pctfpl_12m',
       'live_alone_12m', 'live_partner_12m', 'live_parents_12m',
       'live_friends_12m', 'live_relatives_12m', 'live_other_12m',
       'hhsize_12m', 'PHQ2_1', 'PHQ2_2', 'PHQ2_sum', 'PHQ2_cutoff',
       'EducationLevel', 'EducationOrdered', 'EduNum'],
      dtype='object')