Import the pandas library and load in the data file
---

In [14]:
import pandas as pd

demo = pd.read_csv('Demographics.csv')

Replace strings with numerical code
---

In [15]:
replace_dict = {'DMQMILIT': {
                              'Yes':1, 
                              'No':2,
                              'Refused':7,
                              "Don't know":9
                            },
                'DMDCITZN': {
                              'Citizen by birth or naturalization':1, 
                              'Not a citizen of the US':2,
                              'Refused':7,
                              "Don't know":9
                            }
               }

demo.replace(replace_dict, inplace=True)

print(demo['DMQMILIT'].unique())
print(demo['DMDCITZN'].unique())

[nan  1.  2.  9.  7.]
[ 1.  2.  7.  9. nan]


Replace column names with human-readable strings
---

In [11]:
column_dict = {
    'DMQMILIT':'Veteran/Military Status',
    'DMDCITZN':'Citizenship Status'}


demo.rename(columns=column_dict, inplace=True)

demo.columns[10:15]

Index(['Veteran/Military Status', 'DMDBORN', 'Citizenship Status', 'DMDYRSUS',
       'DMDEDUC3'],
      dtype='object')

View new column types
---

In [12]:
demo.loc[:,['Veteran/Military Status', 'Citizenship Status']].dtypes

Veteran/Military Status    float64
Citizenship Status         float64
dtype: object

Individual Practice
---

1. Convert the columns with string entries to the numeric codes listed in the demographics codebook. 
2. Return all column names to the original ones used in the codebook.

In [19]:
typeCol=(demo.dtypes==object)

typeCol.loc[typeCol==True]

DMDSCHOL    True
dtype: bool

In [22]:
print(demo['DMDSCHOL'].unique())

[nan '1' 3 '2' '3' '9']


In [21]:
replace_dict = {'DMDSCHOL': {
                              'Neither':3
                            }
               }

demo.replace(replace_dict, inplace=True)

In [17]:
print(demo['RIDSTATR'].unique())

['Both Interviewed and MEC examined' 'Interviewed Only' nan]


In [18]:
replace_dict = {'RIDSTATR': {
                              'Interviewed Only':1, 
                              'Both Interviewed and MEC examined':2
                            },
                'RIAGENDR': {
                              'Male':1, 
                              'Female':2
                            }
               }

demo.replace(replace_dict, inplace=True)

print(demo['RIDSTATR'].unique())
print(demo['RIAGENDR'].unique())

[ 2.  1. nan]
[ 2.  1. nan]


In [16]:
demo.loc[:,['RIDSTATR', 'RIAGENDR']].dtypes

RIDSTATR    float64
RIAGENDR    float64
dtype: object

In [3]:
replace_dict = {'RIDRETH1': {
                              'Mexican American':1, 
                              'Other Hispanic':2,
                              'Non-Hispanic White':3,
                              'Non-Hispanic Black':4,
                              'Other Race - Including Multi-Racial':5  
                            },
                'DMDBORN': {
                              'Born in 50 US States or Washington, DC':1, 
                              'Born in Mexico':2,
                              'Born Elsewhere':3,  
                              'Refused':7,
                              "Don't know":9
                            }
               }

demo.replace(replace_dict, inplace=True)

print(demo['RIDRETH1'].unique())
print(demo['DMDBORN'].unique())

[ 4.  3.  5.  1.  2. nan]
[ 1.  3.  2. nan  7.  9.]


In [4]:
demo.loc[:,['RIDRETH1', 'DMDBORN']].dtypes

RIDRETH1    float64
DMDBORN     float64
dtype: object

In [9]:
replace_dict = { #'DMDYRSUS': {
#                               'Less than 1 year':1, 
#                               '1 yr., less than 5 yrs.':2,
#                               '5 yrs., less than 10 yrs.':3,
#                               '10 yrs., less than 15 yrs.':4,
#                               '15 yrs., less than 20 yrs.':5,
#                               '20 yrs., less than 30 yrs.':6, 
#                               '30 yrs., less than 40 yrs.':7,
#                               '40 yrs., less than 50 yrs.':8,
#                               '50 years or more':9,
#                               'Refused':77,
#                               'Could not determine':88,
#                               "Don't know":99
#                             },
                'DMDEDUC3': {
#                               'Never Attended / Kindergarten Only':0,
#                               '1st Grade':1, 
#                               '2nd Grade':2,
#                               '3rd Grade':3,
#                               '4th Grade':4,
#                               '5th Grade':5,
#                               '6th Grade':6, 
#                               '7th Grade':7,
#                               '8th Grade':8,
#                               '9th Grade':9,
#                               '10th Grade':10, 
#                               '11th Grade':11,
                              '12th Grade, No Diploma':12,
#                               'High School Graduate':13,
#                               'GED or Equivalent':14,
#                               'More than high school':15, 
#                               'Less Than 5th Grade':55,
#                               'Less Than 9th Grade':66,
#                               'Refused':77,
#                               "Don't know":99
                            }
                }

demo.replace(replace_dict, inplace=True)

print(demo['DMDYRSUS'].unique())
print(demo['DMDEDUC3'].unique())

[nan  2.  3.  6.  8.  4.  7.  9.  5.  1. 77. 99. 88.]
[nan  3. 15.  5.  8. 13.  7. 11.  1.  9.  6.  0. 12. 10.  4.  2. 14. 66.
 55. 77. 99.]


In [12]:
replace_dict = {'DMDEDUC2': {
#                               'Less Than 9th Grade':1, 
#                               '9-11th Grade (Includes 12th grade with no diploma)':2,
#                               'High School Grad/GED or Equivalent':3,
#                               'Some College or AA degree':4,
#                               'College Graduate or above':5,
#                               'Refused':7,
                              "Don't Know":9
                            },
                'DMDSCHOL': {
#                               'In school':1, 
#                               'On vacation from school (between grades)':2,
#                               'Neither in school or on vacation from school (between grades)':3,
#                               'Refused':7,
                              "Don't Know":9
                            },
                'DMDMARTL': {
#                               'Married':1, 
#                               'Widowed':2,
#                               'Divorced':3,
#                               'Separated':4, 
#                               'Never married':5,
#                               'Living with partner':6,
#                               'Refused':77,
                              "Don't Know":99
                            }
               }

demo.replace(replace_dict, inplace=True)

print(demo['DMDEDUC2'].unique())
print(demo['DMDSCHOL'].unique())
print(demo['DMDMARTL'].unique())

[nan  5.  2.  3.  4.  1.  9.  7.]
[nan 1 'Neither' 2 3 9]
[nan  1.  5.  4.  3.  2.  6. 77. 99.]


In [11]:
grouped = demo.groupby('DMDSCHOL')
grouped.size()

DMDSCHOL
1             2453
2              645
3               83
Don't Know       2
Neither        282
dtype: int64

Save data files
---

In [24]:
demo.to_csv('Demographics.csv', index=False)