In [1]:
# import pandas, numpy, and pyreadstat
import pandas as pd
import numpy as np
import pyreadstat

In [2]:
# Notebook-wide display settings: wrap output at 75 characters, render floats
# with thousands separators and two decimals, and show at most 5 columns.
pd.set_option('display.width', 75)
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', 5)

In [3]:
# Retrieve the SPSS data along with its metadata: read_sav hands back two
# objects, unpacked here as the data (nls97spss) and the metadata (metaspss).
# No value formats are applied at this point, so coded variables stay numeric.
nls97spss, metaspss = pyreadstat.read_sav('data/nls97.sav')

In [4]:
# Show the dtype of every column in the freshly loaded SPSS frame.
nls97spss.dtypes

R0000100    float64
R0536300    float64
R0536401    float64
R0536402    float64
R1235800    float64
R1482600    float64
R9793800    float64
R9793900    float64
R9871900    float64
R9872000    float64
R9872200    float64
R9872400    float64
S8646900    float64
S8647000    float64
S8647100    float64
S8647200    float64
S8647300    float64
S8647400    float64
S8647500    float64
S8647600    float64
S8647700    float64
S8647800    float64
T6651700    float64
U1836800    float64
U1836900    float64
U1837000    float64
U1837100    float64
U1837200    float64
U1837300    float64
U1845400    float64
U1852400    float64
U1852600    float64
U1852700    float64
U2166200    float64
U2166300    float64
U2166400    float64
U2166500    float64
U2857300    float64
U2962800    float64
U2962900    float64
U2963000    float64
Z9063900    float64
dtype: object

In [5]:
# Preview the first rows; column names are still the raw variable codes.
nls97spss.head()

Unnamed: 0,R0000100,R0536300,...,U2963000,Z9063900
0,1.0,2.0,...,,52.0
1,2.0,1.0,...,6.0,0.0
2,3.0,2.0,...,6.0,0.0
3,4.0,2.0,...,6.0,4.0
4,5.0,1.0,...,5.0,12.0


In [6]:
# Relative frequency of each raw numeric code in R0536300 (no labels yet).
nls97spss['R0536300'].value_counts(normalize=True)

1.00   0.51
2.00   0.49
Name: R0536300, dtype: float64

In [7]:
# Use the metadata's value labels: variable_value_labels is indexed by the
# original variable name and yields a {code: label} mapping for that variable.
metaspss.variable_value_labels['R0536300']

{0.0: 'No Information', 1.0: 'Male', 2.0: 'Female'}

In [8]:
# Translate the numeric codes to their labels via the metadata mapping, then
# report the share of each label. Codes absent from the mapping become NaN
# and are left out of the counts.
(
    nls97spss['R0536300']
    .map(metaspss.variable_value_labels['R0536300'])
    .value_counts(normalize=True)
)

Male     0.51
Female   0.49
Name: R0536300, dtype: float64

In [9]:
# Attach the value labels to the whole frame (formats_as_category=True turns
# labelled variables into categoricals), then replace the variable codes used
# as column names with the descriptive column labels from the metadata.
# Presumably the labels must be applied before the rename, because the value
# labels are looked up by the original variable names (e.g. 'R0536300').
nls97spss = pyreadstat.set_value_labels(nls97spss, metaspss, formats_as_category=True)
nls97spss.columns = metaspss.column_labels
# Same distribution as before, now read from the labelled column.
nls97spss['KEY!SEX (SYMBOL) 1997'].value_counts(normalize=True)

Male     0.51
Female   0.49
Name: KEY!SEX (SYMBOL) 1997, dtype: float64

In [10]:
# Re-check dtypes after applying value labels and renaming the columns.
nls97spss.dtypes

PUBID - YTH ID CODE 1997                        float64
KEY!SEX (SYMBOL) 1997                          category
KEY!BDATE M/Y (SYMBOL) 1997                     float64
KEY!BDATE M/Y (SYMBOL) 1997                     float64
CV_SAMPLE_TYPE 1997                            category
KEY!RACE_ETHNICITY (SYMBOL) 1997               category
TRANS_SAT_VERBAL HSTR                           float64
TRANS_SAT_MATH HSTR                             float64
TRANS CRD GPA OVERALL HSTR                      float64
TRANS CRD GPA ENG HSTR                          float64
TRANS CRD GPA MATH HSTR                         float64
TRANS CRD GPA LP SCI HSTR                       float64
GOVT RESPONSIBILITY - PROVIDE JOBS 2006        category
GOVT RESPNSBLTY - KEEP PRICES UND CTRL 2006    category
GOVT RESPNSBLTY - HLTH CARE FOR SICK 2006      category
GOVT RESPNSBLTY -PROV ELD LIV STAND 2006       category
GOVT RESPNSBLTY -PROV IND HELP 2006            category
GOVT RESPNSBLTY -PROV UNEMP LIV STAND 2006     c

In [11]:
# Turn the descriptive column labels into identifier-friendly names:
# lower-case, spaces -> underscores, then drop every character that is not
# a-z, 0-9 or underscore.
# regex is passed explicitly: pandas 2.0 changed the default of str.replace to
# regex=False, under which '[^a-z0-9_]' would be searched for as a literal
# string and the punctuation would silently stay in the names.
nls97spss.columns = (
    nls97spss.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-z0-9_]', '', regex=True)
)

In [12]:
# Make the respondent id column the index of the frame.
nls97spss = nls97spss.set_index('pubid__yth_id_code_1997')

In [13]:
# Apply the value formats at read time instead of in a separate step:
# apply_value_formats=True substitutes labels for codes while loading, and
# formats_as_category=True stores the labelled variables as categoricals.
nls97spss, metaspss = pyreadstat.read_sav('data/nls97.sav', apply_value_formats=True, formats_as_category=True)
nls97spss.columns = metaspss.column_labels
# Clean the labels into usable column names: lower-case, spaces -> underscores,
# then strip everything except a-z, 0-9 and underscore.
# regex=True must be explicit: since pandas 2.0 str.replace defaults to
# regex=False and would treat the character class as a literal string.
nls97spss.columns = (
    nls97spss.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-z0-9_]', '', regex=True)
)
nls97spss.dtypes

pubid__yth_id_code_1997                        float64
keysex_symbol_1997                            category
keybdate_my_symbol_1997                        float64
keybdate_my_symbol_1997                        float64
cv_sample_type_1997                           category
keyrace_ethnicity_symbol_1997                 category
trans_sat_verbal_hstr                          float64
trans_sat_math_hstr                            float64
trans_crd_gpa_overall_hstr                     float64
trans_crd_gpa_eng_hstr                         float64
trans_crd_gpa_math_hstr                        float64
trans_crd_gpa_lp_sci_hstr                      float64
govt_responsibility__provide_jobs_2006        category
govt_respnsblty__keep_prices_und_ctrl_2006    category
govt_respnsblty__hlth_care_for_sick_2006      category
govt_respnsblty_prov_eld_liv_stand_2006       category
govt_respnsblty_prov_ind_help_2006            category
govt_respnsblty_prov_unemp_liv_stand_2006     category
govt_respn

In [14]:
# Preview the first rows of the frame loaded with value formats applied.
nls97spss.head()

Unnamed: 0,pubid__yth_id_code_1997,keysex_symbol_1997,...,hrsnight_r_sleeps_2017,cvc_wkswk_yr_all_l99
0,1.0,Female,...,,52.0
1,2.0,Male,...,6.0,0.0
2,3.0,Female,...,6.0,0.0
3,4.0,Female,...,6.0,4.0
4,5.0,Male,...,5.0,12.0


In [15]:
# Frequency of each response label; sort=False keeps the counts from being
# ordered by frequency.
(
    nls97spss['govt_responsibility__provide_jobs_2006']
    .value_counts(sort=False)
)

Definitely should be        454
Definitely should not be    300
Probably should be          617
Probably should not be      462
Name: govt_responsibility__provide_jobs_2006, dtype: int64

In [16]:
# Index the re-loaded SPSS frame by the respondent id column.
nls97spss = nls97spss.set_index('pubid__yth_id_code_1997')

In [17]:
# Do the same for the Stata file: load with value formats applied (labelled
# variables stored as categoricals) and use the column labels as names.
nls97stata, metastata = pyreadstat.read_dta('data/nls97.dta', apply_value_formats=True, formats_as_category=True)
nls97stata.columns = metastata.column_labels
# Clean the labels into usable column names: lower-case, spaces -> underscores,
# then strip everything except a-z, 0-9 and underscore.
# regex=True must be explicit: since pandas 2.0 str.replace defaults to
# regex=False and would treat the character class as a literal string.
nls97stata.columns = (
    nls97stata.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-z0-9_]', '', regex=True)
)
nls97stata.dtypes

pubid__yth_id_code_1997                        float64
keysex_symbol_1997                            category
keybdate_my_symbol_1997                        float64
keybdate_my_symbol_1997                        float64
cv_sample_type_1997                           category
keyrace_ethnicity_symbol_1997                 category
trans_sat_verbal_hstr                          float64
trans_sat_math_hstr                            float64
trans_crd_gpa_overall_hstr                     float64
trans_crd_gpa_eng_hstr                         float64
trans_crd_gpa_math_hstr                        float64
trans_crd_gpa_lp_sci_hstr                      float64
govt_responsibility__provide_jobs_2006        category
govt_respnsblty__keep_prices_und_ctrl_2006    category
govt_respnsblty__hlth_care_for_sick_2006      category
govt_respnsblty_prov_eld_liv_stand_2006       category
govt_respnsblty_prov_ind_help_2006            category
govt_respnsblty_prov_unemp_liv_stand_2006     category
govt_respn

In [18]:
# Preview the first rows of the Stata frame.
nls97stata.head()

Unnamed: 0,pubid__yth_id_code_1997,keysex_symbol_1997,...,hrsnight_r_sleeps_2017,cvc_wkswk_yr_all_l99
0,1.0,Female,...,-5.0,52.0
1,2.0,Male,...,6.0,0.0
2,3.0,Female,...,6.0,0.0
3,4.0,Female,...,6.0,4.0
4,5.0,Male,...,5.0,12.0


In [19]:
# Frequency of each response value in the Stata frame; sort=False keeps the
# counts from being ordered by frequency.
(
    nls97stata['govt_responsibility__provide_jobs_2006']
    .value_counts(sort=False)
)

-5.0                        1425
-4.0                        5665
-2.0                          56
-1.0                           5
Definitely should be         454
Definitely should not be     300
Probably should be           617
Probably should not be       462
Name: govt_responsibility__provide_jobs_2006, dtype: int64

In [20]:
# Column minimums, used to spot negative values in the numeric columns.
# numeric_only=True restricts the reduction to numeric columns: the frame also
# holds category columns, and since pandas 2.0 (numeric_only defaults to False)
# min() raises TypeError on unordered categoricals instead of skipping them.
nls97stata.min(numeric_only=True)

pubid__yth_id_code_1997          1.00
keybdate_my_symbol_1997          1.00
keybdate_my_symbol_1997      1,980.00
trans_sat_verbal_hstr           -4.00
trans_sat_math_hstr             -4.00
trans_crd_gpa_overall_hstr      -9.00
trans_crd_gpa_eng_hstr          -9.00
trans_crd_gpa_math_hstr         -9.00
trans_crd_gpa_lp_sci_hstr       -9.00
cv_ba_credits_l1_2011           -5.00
cv_bio_child_hh_2017            -5.00
cv_bio_child_nr_2017            -5.00
hrsnight_r_sleeps_2017          -5.00
cvc_wkswk_yr_all_l99            -4.00
dtype: float64

In [21]:
# Replace every value from -9 through -1 with NaN, across all columns and in
# place. The negative values visible in the min() and value_counts() output
# look like missing-value codes rather than real measurements -- assumption,
# confirm against the survey codebook before relying on it.
nls97stata.replace(list(range(-9,0)), np.nan, inplace=True)

In [22]:
# Re-check the column minimums after the negative values were set to NaN.
# numeric_only=True restricts the reduction to numeric columns: the frame also
# holds category columns, and since pandas 2.0 (numeric_only defaults to False)
# min() raises TypeError on unordered categoricals instead of skipping them.
nls97stata.min(numeric_only=True)

pubid__yth_id_code_1997          1.00
keybdate_my_symbol_1997          1.00
keybdate_my_symbol_1997      1,980.00
trans_sat_verbal_hstr           14.00
trans_sat_math_hstr              7.00
trans_crd_gpa_overall_hstr      10.00
trans_crd_gpa_eng_hstr           0.00
trans_crd_gpa_math_hstr          0.00
trans_crd_gpa_lp_sci_hstr        0.00
cv_ba_credits_l1_2011            0.00
cv_bio_child_hh_2017             0.00
cv_bio_child_nr_2017             0.00
hrsnight_r_sleeps_2017           0.00
cvc_wkswk_yr_all_l99             0.00
dtype: float64

In [23]:
# Index the Stata frame by the respondent id column.
nls97stata = nls97stata.set_index('pubid__yth_id_code_1997')

In [24]:
# Pull the SAS data, passing the SAS catalog file so the reader can resolve
# value labels; formats_as_category=True stores labelled variables as
# categoricals. Column labels then replace the variable names.
nls97sas, metasas = pyreadstat.read_sas7bdat('data/nls97.sas7bdat', catalog_file='data/nlsformats3.sas7bcat', formats_as_category=True)
nls97sas.columns = metasas.column_labels
# Clean the labels into usable column names: lower-case, spaces -> underscores,
# then strip everything except a-z, 0-9 and underscore.
# regex=True must be explicit: since pandas 2.0 str.replace defaults to
# regex=False and would treat the character class as a literal string.
nls97sas.columns = (
    nls97sas.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-z0-9_]', '', regex=True)
)
nls97sas.head()

Unnamed: 0,pubid__yth_id_code_1997,keysex_symbol_1997,...,hrsnight_r_sleeps_2017,cvc_wkswk_yr_all_l99
0,1.0,Female,...,,52.0
1,2.0,Male,...,6.0,0.0
2,3.0,Female,...,6.0,0.0
3,4.0,Female,...,6.0,4.0
4,5.0,Male,...,5.0,12.0


In [25]:
# Count respondents per label in the sex column of the SAS frame.
nls97sas.keysex_symbol_1997.value_counts()

Male      4599
Female    4385
Name: keysex_symbol_1997, dtype: int64

In [26]:
# Index the SAS frame by the respondent id column.
nls97sas = nls97sas.set_index('pubid__yth_id_code_1997')

In [27]:
# Preview the SAS frame again now that the respondent id is the index.
nls97sas.head()

Unnamed: 0_level_0,keysex_symbol_1997,keybdate_my_symbol_1997,...,hrsnight_r_sleeps_2017,cvc_wkswk_yr_all_l99
pubid__yth_id_code_1997,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,Female,9.0,...,,52.0
2.0,Male,7.0,...,6.0,0.0
3.0,Female,9.0,...,6.0,0.0
4.0,Female,2.0,...,6.0,4.0
5.0,Male,10.0,...,5.0,12.0
