In [1]:
import pandas as pd
import numpy as np

# Demographic Variables and Sample Weights (DEMO_H)
RIAGENDR - Gender
RIDAGEYR - Age in years at screening
female = 2

# Medical Conditions (MCQ_H)
SEQN - Respondent sequence number
- MCQ230A - What kind of cancer (Code 14 is Breast, many codes many different cancers)
- MCQ240E - Age when breast cancer first diagnosed
- MCQ220 - Ever told you had cancer or malignancy
no = 2


# Body Measures (BMX_H)
SEQN - Respondent sequence number
BMXBMI - Body Mass Index (kg/m**2)


# Complete Blood Count with 5-part Differential - Whole Blood (CBC_H)
SEQN - Respondent sequence number
LBXWBCSI - White blood cell count (1000 cells/uL)
LBXLYPCT - Lymphocyte percent (%)
LBXMOPCT - Monocyte percent (%)
LBXNEPCT - Segmented neutrophils percent (%)
LBXEOPCT - Eosinophils percent (%)
LBXBAPCT - Basophils percent (%)
LBDLYMNO - Lymphocyte number (1000 cells/uL)
LBDMONO - Monocyte number (1000 cells/uL)
LBDNENO - Segmented neutrophils num (1000 cell/uL)
LBDEONO - Eosinophils number (1000 cells/uL)
LBDBANO - Basophils number (1000 cells/uL)
LBXRBCSI - Red blood cell count (million cells/uL)
LBXHGB - Hemoglobin (g/dL)
LBXHCT - Hematocrit (%)
LBXMCVSI - Mean cell volume (fL)
LBXMCHSI - Mean cell hemoglobin (pg)
LBXMC - MCHC (g/dL)
LBXRDW - Red cell distribution width (%)
LBXPLTSI - Platelet count (1000 cells/uL)
LBXMPSI - Mean platelet volume (fL)
 

# Standard Biochemistry Profile (BIOPRO_H)
SEQN - Respondent sequence number
LBDSALSI - Albumin (g/L)
LBXSAPSI - Alkaline phosphatase (IU/L)
LBXSASSI - Aspartate aminotransferase AST (IU/L)
LBXSATSI - Alanine aminotransferase ALT (IU/L)
LBXSBU - Blood urea nitrogen (mg/dL)
LBDSBUSI - Blood urea nitrogen (mmol/L)
LBXSC3SI - Bicarbonate (mmol/L)
LBDSCASI - Total calcium (mmol/L)
LBDSCHSI - Cholesterol (mmol/L)
LBXSCK - Creatine Phosphokinase(CPK) (IU/L)
LBXSCLSI - Chloride (mmol/L)
LBDSCRSI - Creatinine (umol/L)
LBDSGBSI - Globulin (g/L)
LBDSGLSI - Glucose, refrigerated serum (mmol/L)
LBXSGTSI - Gamma glutamyl transferase (U/L)
LBDSIRSI - Iron, refrigerated serum (umol/L)
LBXSKSI - Potassium (mmol/L)
LBXSLDSI - Lactate dehydrogenase (U/L)
LBXSNASI - Sodium (mmol/L)
LBXSOSSI - Osmolality (mmol/Kg)
LBDSPHSI - Phosphorus (mmol/L)
LBDSTBSI - Total bilirubin (umol/L)
LBDSTPSI - Total protein (g/L)
LBDSTRSI - Triglycerides, refrigerated (mmol/L)
LBDSUASI - Uric acid (umol/L)

# GET MORE DATA

In [2]:
for i in ['A']:    
    df_demo= pd.read_sas('./NHANES 2013-2014/other NHANES/DEMO_'+i+'.XPT')
    # selecting specific columns
    # selecting gender: women SEQN and Age
    df_demo = df_demo[(df_demo['RIAGENDR'] == 2 ) & (df_demo['RIDAGEYR'] > 10) ][['SEQN', 'RIDAGEYR']]
    df_demo = df_demo.rename(columns={'RIDAGEYR': 'Age'})

    df_med = pd.read_sas('./NHANES 2013-2014/other NHANES/MCQ_'+i+'.XPT')
    # match only dose SEQN matching the one inf df_demo (seqn variable)
    df_med = df_med.loc[df_med['SEQN'].isin(df_demo['SEQN'])]
    df_med = df_med[['SEQN', 'MCQ230A', 'MCQ240E', 'MCQ220']]
    # add column breast cancer 'Diagnosis' 1 no 0
    df_med['Breast Cancer'] = np.where(df_med['MCQ230A'] == 14,1,0)

    # subset to have women that were diagnosed with breast cancer 
    df_med = df_med[(df_med['Breast Cancer'] == 1)]
    # changing the column name to diagnosis age and saving the dataset
    df_med = df_med.rename(columns={'MCQ240E' : 'Diagnosis Age'})[['SEQN', 'Diagnosis Age', 'Breast Cancer']]
    # merge df_med with df_demo
    df = pd.merge(df_demo, df_med, on='SEQN', how='inner')

    df_bmx = pd.read_sas('./NHANES 2013-2014/other NHANES/BMX_'+i+'.XPT')
    # chose the ones with SEQN from df_med
    df_bmx = df_bmx.loc[df_bmx['SEQN'].isin(df_med['SEQN'].tolist())][['SEQN', 'BMXBMI']]
    # rename the column
    df_bmx = df_bmx.rename(columns={'BMXBMI': 'BMI' })
    # join df_bmx and df_med into one df
    df = pd.merge(df, df_bmx, how='inner', on=['SEQN'])
    df.info()

    df_blood = pd.read_sas('./NHANES 2013-2014/other NHANES/CBC_'+i+'.XPT')
    # subset to SEQN from df
    df_blood = df_blood.loc[df_blood['SEQN'].isin(df['SEQN'])]
    col_dict = {
    'LBXWBCSI' : 'BL_White blood cell count (1000 cells/uL)',
    'LBXLYPCT' : 'BL_Lymphocyte percent (%)',
    'LBXMOPCT' : 'BL_Monocyte percent (%)',
    'LBXNEPCT' : 'BL_Segmented neutrophils percent (%)',
    'LBXEOPCT' : 'BL_Eosinophils percent (%)',
    'LBXBAPCT' : 'BL_Basophils percent (%)',
    'LBDLYMNO' : 'BL_Lymphocyte number (1000 cells/uL)',
    'LBDMONO' : 'BL_Monocyte number (1000 cells/uL)',
    'LBDNENO' : 'BL_Segmented neutrophils num (1000 cell/uL)',
    'LBDEONO' : 'BL_Eosinophils number (1000 cells/uL)',
    'LBDBANO' : 'BL_Basophils number (1000 cells/uL)',
    'LBXRBCSI' : 'BL_Red blood cell count (million cells/uL)',
    'LBXHGB' : 'BL_Hemoglobin (g/dL)',
    'LBXHCT' : 'BL_Hematocrit (%)',
    'LBXMCVSI' : 'BL_Mean cell volume (fL)',
    'LBXMCHSI' : 'BL_Mean cell hemoglobin (pg)',
    'LBXMC' : 'BL_MCHC (g/dL)',
    'LBXRDW' : 'BL_Red cell distribution width (%)',
    'LBXPLTSI' : 'BL_Platelet count (1000 cells/uL)',
    'LBXMPSI' : 'BL_Mean platelet volume (fL)'
    }
    df_blood = df_blood.rename(columns = col_dict)
    # merge df_blood to df
    df = pd.merge(df, df_blood, how='inner', on=['SEQN'])

    df_bio = pd.read_sas('./NHANES 2013-2014/other NHANES/BIOPRO_'+i+'.XPT')
    col_dict = {
    'SEQN' : 'SEQN',
    'LBDSALSI' : 'BIO_Albumin (g/L)',
    #'LBXSAPSI' : 'BIO_Alkaline phosphatase (IU/L)',
    'LBXSASSI' : 'BIO_Aspartate aminotransferase AST (IU/L)',
    'LBXSATSI' : 'BIO_Alanine aminotransferase ALT (IU/L)',
    'LBXSBU' : 'BIO_Blood urea nitrogen (mg/dL)',
    'LBDSBUSI' : 'BIO_Blood urea nitrogen (mmol/L)',
    'LBXSC3SI' : 'BIO_Bicarbonate (mmol/L)',
    'LBDSCASI' : 'BIO_Total calcium (mmol/L)',
    'LBDSCHSI' : 'BIO_Cholesterol (mmol/L)',
    #'LBXSCK' : 'BIO_Creatine Phosphokinase(CPK) (IU/L)',
    'LBXSCLSI' : 'BIO_Chloride (mmol/L)',
    'LBDSCRSI' : 'BIO_Creatinine (umol/L)',
    'LBDSGBSI' : 'BIO_Globulin (g/L)',
    'LBDSGLSI' : 'BIO_Glucose, refrigerated serum (mmol/L)',
    'LBXSGTSI' : 'BIO_Gamma glutamyl transferase (U/L)',
    'LBDSIRSI' : 'BIO_Iron, refrigerated serum (umol/L)',
    'LBXSKSI' : 'BIO_Potassium (mmol/L)',
   # 'LBXSLDSI' : 'BIO_Lactate dehydrogenase (U/L)',
    'LBXSNASI' : 'BIO_Sodium (mmol/L)',
    'LBXSOSSI' : 'BIO_Osmolality (mmol/Kg)',
    'LBDSPHSI' : 'BIO_Phosphorus (mmol/L)',
    'LBDSTBSI' : 'BIO_Total bilirubin (umol/L)',
    'LBDSTPSI' : 'BIO_Total protein (g/L)',
    'LBDSTRSI' : 'BIO_Triglycerides, refrigerated (mmol/L)',
    'LBDSUASI' : 'BIO_Uric acid (umol/L)'}
    # select the columns i want to keep
    df_bio = df_bio[list(col_dict.keys())].rename(columns=col_dict)
    # merge df_bio with df
    df = pd.merge(df, df_bio, how='inner', on=['SEQN'])
    
    df_final = df.copy()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54 entries, 0 to 53
Data columns (total 5 columns):
SEQN             54 non-null float64
Age              54 non-null float64
Diagnosis Age    54 non-null float64
Breast Cancer    54 non-null int64
BMI              53 non-null float64
dtypes: float64(4), int64(1)
memory usage: 2.5 KB


In [3]:
for i in ['B','C','D','E','F','G','H','I']:    
    df_demo= pd.read_sas('./NHANES 2013-2014/other NHANES/DEMO_'+i+'.XPT')
    # selecting specific columns
    # selecting gender: women SEQN and Age
    df_demo = df_demo[(df_demo['RIAGENDR'] == 2 ) & (df_demo['RIDAGEYR'] > 10) ][['SEQN', 'RIDAGEYR']]
    df_demo = df_demo.rename(columns={'RIDAGEYR': 'Age'})

    df_med = pd.read_sas('./NHANES 2013-2014/other NHANES/MCQ_'+i+'.XPT')
    # match only dose SEQN matching the one inf df_demo (seqn variable)
    df_med = df_med.loc[df_med['SEQN'].isin(df_demo['SEQN'])]
    df_med = df_med[['SEQN', 'MCQ230A', 'MCQ240E', 'MCQ220']]
    # add column breast cancer 'Diagnosis' 1 no 0
    df_med['Breast Cancer'] = np.where(df_med['MCQ230A'] == 14,1,0)

    # subset to have women that were diagnosed with breast cancer 
    df_med = df_med[(df_med['Breast Cancer'] == 1)]
    # changing the column name to diagnosis age and saving the dataset
    df_med = df_med.rename(columns={'MCQ240E' : 'Diagnosis Age'})[['SEQN', 'Diagnosis Age', 'Breast Cancer']]
    # merge df_med with df_demo
    df = pd.merge(df_demo, df_med, on='SEQN', how='inner')

    df_bmx = pd.read_sas('./NHANES 2013-2014/other NHANES/BMX_'+i+'.XPT')
    # chose the ones with SEQN from df_med
    df_bmx = df_bmx.loc[df_bmx['SEQN'].isin(df_med['SEQN'].tolist())][['SEQN', 'BMXBMI']]
    # rename the column
    df_bmx = df_bmx.rename(columns={'BMXBMI': 'BMI' })
    # join df_bmx and df_med into one df
    df = pd.merge(df, df_bmx, how='inner', on=['SEQN'])
    df.info()

    df_blood = pd.read_sas('./NHANES 2013-2014/other NHANES/CBC_'+i+'.XPT')
    # subset to SEQN from df
    df_blood = df_blood.loc[df_blood['SEQN'].isin(df['SEQN'])]
    col_dict = {
    'LBXWBCSI' : 'BL_White blood cell count (1000 cells/uL)',
    'LBXLYPCT' : 'BL_Lymphocyte percent (%)',
    'LBXMOPCT' : 'BL_Monocyte percent (%)',
    'LBXNEPCT' : 'BL_Segmented neutrophils percent (%)',
    'LBXEOPCT' : 'BL_Eosinophils percent (%)',
    'LBXBAPCT' : 'BL_Basophils percent (%)',
    'LBDLYMNO' : 'BL_Lymphocyte number (1000 cells/uL)',
    'LBDMONO' : 'BL_Monocyte number (1000 cells/uL)',
    'LBDNENO' : 'BL_Segmented neutrophils num (1000 cell/uL)',
    'LBDEONO' : 'BL_Eosinophils number (1000 cells/uL)',
    'LBDBANO' : 'BL_Basophils number (1000 cells/uL)',
    'LBXRBCSI' : 'BL_Red blood cell count (million cells/uL)',
    'LBXHGB' : 'BL_Hemoglobin (g/dL)',
    'LBXHCT' : 'BL_Hematocrit (%)',
    'LBXMCVSI' : 'BL_Mean cell volume (fL)',
    'LBXMCHSI' : 'BL_Mean cell hemoglobin (pg)',
    'LBXMC' : 'BL_MCHC (g/dL)',
    'LBXRDW' : 'BL_Red cell distribution width (%)',
    'LBXPLTSI' : 'BL_Platelet count (1000 cells/uL)',
    'LBXMPSI' : 'BL_Mean platelet volume (fL)'
    }
    df_blood = df_blood.rename(columns = col_dict)
    # merge df_blood to df
    df = pd.merge(df, df_blood, how='inner', on=['SEQN'])

    df_bio = pd.read_sas('./NHANES 2013-2014/other NHANES/BIOPRO_'+i+'.XPT')
    col_dict = {
    'SEQN' : 'SEQN',
    'LBDSALSI' : 'BIO_Albumin (g/L)',
    #'LBXSAPSI' : 'BIO_Alkaline phosphatase (IU/L)',
    'LBXSASSI' : 'BIO_Aspartate aminotransferase AST (IU/L)',
    'LBXSATSI' : 'BIO_Alanine aminotransferase ALT (IU/L)',
    'LBXSBU' : 'BIO_Blood urea nitrogen (mg/dL)',
    'LBDSBUSI' : 'BIO_Blood urea nitrogen (mmol/L)',
    'LBXSC3SI' : 'BIO_Bicarbonate (mmol/L)',
    'LBDSCASI' : 'BIO_Total calcium (mmol/L)',
    'LBDSCHSI' : 'BIO_Cholesterol (mmol/L)',
    #'LBXSCK' : 'BIO_Creatine Phosphokinase(CPK) (IU/L)',
    'LBXSCLSI' : 'BIO_Chloride (mmol/L)',
    'LBDSCRSI' : 'BIO_Creatinine (umol/L)',
    'LBDSGBSI' : 'BIO_Globulin (g/L)',
    'LBDSGLSI' : 'BIO_Glucose, refrigerated serum (mmol/L)',
    'LBXSGTSI' : 'BIO_Gamma glutamyl transferase (U/L)',
    'LBDSIRSI' : 'BIO_Iron, refrigerated serum (umol/L)',
    'LBXSKSI' : 'BIO_Potassium (mmol/L)',
   # 'LBXSLDSI' : 'BIO_Lactate dehydrogenase (U/L)',
    'LBXSNASI' : 'BIO_Sodium (mmol/L)',
    'LBXSOSSI' : 'BIO_Osmolality (mmol/Kg)',
    'LBDSPHSI' : 'BIO_Phosphorus (mmol/L)',
    'LBDSTBSI' : 'BIO_Total bilirubin (umol/L)',
    'LBDSTPSI' : 'BIO_Total protein (g/L)',
    'LBDSTRSI' : 'BIO_Triglycerides, refrigerated (mmol/L)',
    'LBDSUASI' : 'BIO_Uric acid (umol/L)'}
    # select the columns i want to keep
    df_bio = df_bio[list(col_dict.keys())].rename(columns=col_dict)
    # merge df_bio with df
    df = pd.merge(df, df_bio, how='inner', on=['SEQN'])
    
    df_final = df_final.append(df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 0 to 55
Data columns (total 5 columns):
SEQN             56 non-null float64
Age              56 non-null float64
Diagnosis Age    56 non-null float64
Breast Cancer    56 non-null int64
BMI              51 non-null float64
dtypes: float64(4), int64(1)
memory usage: 2.6 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 62 entries, 0 to 61
Data columns (total 5 columns):
SEQN             62 non-null float64
Age              62 non-null float64
Diagnosis Age    62 non-null float64
Breast Cancer    62 non-null int64
BMI              61 non-null float64
dtypes: float64(4), int64(1)
memory usage: 2.9 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73 entries, 0 to 72
Data columns (total 5 columns):
SEQN             73 non-null float64
Age              73 non-null float64
Diagnosis Age    73 non-null float64
Breast Cancer    73 non-null int64
BMI              69 non-null float64
dtypes: float64(4), int64(1)
memory usage: 3.

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [4]:
df_final['Breast Cancer'].value_counts()

1    656
Name: Breast Cancer, dtype: int64

# do the same for the other women

In [5]:
for i in ['A']:    
    df_demo= pd.read_sas('./NHANES 2013-2014/other NHANES/DEMO_'+i+'.XPT')
    # selecting specific columns
    # selecting gender: women SEQN and Age
    df_demo = df_demo[(df_demo['RIAGENDR'] == 2 ) & (df_demo['RIDAGEYR'] > 10) ][['SEQN', 'RIDAGEYR']]
    df_demo = df_demo.rename(columns={'RIDAGEYR': 'Age'})

    df_med = pd.read_sas('./NHANES 2013-2014/other NHANES/MCQ_'+i+'.XPT')
    # match only dose SEQN matching the one inf df_demo (seqn variable)
    df_med = df_med.loc[df_med['SEQN'].isin(df_demo['SEQN'])]
    df_med = df_med[['SEQN', 'MCQ230A', 'MCQ240E', 'MCQ220']]
    # add column breast cancer 'Diagnosis' 1 no 0
    df_med['Breast Cancer'] = np.where(df_med['MCQ230A'] == 14,1,0)

    # subset to have women that were diagnosed with breast cancer 
    df_med = df_med[df_med['Breast Cancer'] == 0]
    # changing the column name to diagnosis age and saving the dataset
    df_med = df_med.rename(columns={'MCQ240E' : 'Diagnosis Age'})[['SEQN', 'Diagnosis Age', 'Breast Cancer']]
    # merge df_med with df_demo
    df = pd.merge(df_demo, df_med, on='SEQN', how='inner')

    df_bmx = pd.read_sas('./NHANES 2013-2014/other NHANES/BMX_'+i+'.XPT')
    # chose the ones with SEQN from df_med
    df_bmx = df_bmx.loc[df_bmx['SEQN'].isin(df_med['SEQN'].tolist())][['SEQN', 'BMXBMI']]
    # rename the column
    df_bmx = df_bmx.rename(columns={'BMXBMI': 'BMI' })
    # join df_bmx and df_med into one df
    df = pd.merge(df, df_bmx, how='inner', on=['SEQN'])
    df.info()

    df_blood = pd.read_sas('./NHANES 2013-2014/other NHANES/CBC_'+i+'.XPT')
    # subset to SEQN from df
    df_blood = df_blood.loc[df_blood['SEQN'].isin(df['SEQN'])]
    col_dict = {
    'LBXWBCSI' : 'BL_White blood cell count (1000 cells/uL)',
    'LBXLYPCT' : 'BL_Lymphocyte percent (%)',
    'LBXMOPCT' : 'BL_Monocyte percent (%)',
    'LBXNEPCT' : 'BL_Segmented neutrophils percent (%)',
    'LBXEOPCT' : 'BL_Eosinophils percent (%)',
    'LBXBAPCT' : 'BL_Basophils percent (%)',
    'LBDLYMNO' : 'BL_Lymphocyte number (1000 cells/uL)',
    'LBDMONO' : 'BL_Monocyte number (1000 cells/uL)',
    'LBDNENO' : 'BL_Segmented neutrophils num (1000 cell/uL)',
    'LBDEONO' : 'BL_Eosinophils number (1000 cells/uL)',
    'LBDBANO' : 'BL_Basophils number (1000 cells/uL)',
    'LBXRBCSI' : 'BL_Red blood cell count (million cells/uL)',
    'LBXHGB' : 'BL_Hemoglobin (g/dL)',
    'LBXHCT' : 'BL_Hematocrit (%)',
    'LBXMCVSI' : 'BL_Mean cell volume (fL)',
    'LBXMCHSI' : 'BL_Mean cell hemoglobin (pg)',
    'LBXMC' : 'BL_MCHC (g/dL)',
    'LBXRDW' : 'BL_Red cell distribution width (%)',
    'LBXPLTSI' : 'BL_Platelet count (1000 cells/uL)',
    'LBXMPSI' : 'BL_Mean platelet volume (fL)'
    }
    df_blood = df_blood.rename(columns = col_dict)
    # merge df_blood to df
    df = pd.merge(df, df_blood, how='inner', on=['SEQN'])

    df_bio = pd.read_sas('./NHANES 2013-2014/other NHANES/BIOPRO_'+i+'.XPT')
    col_dict = {
    'SEQN' : 'SEQN',
    'LBDSALSI' : 'BIO_Albumin (g/L)',
    #'LBXSAPSI' : 'BIO_Alkaline phosphatase (IU/L)',
    'LBXSASSI' : 'BIO_Aspartate aminotransferase AST (IU/L)',
    'LBXSATSI' : 'BIO_Alanine aminotransferase ALT (IU/L)',
    'LBXSBU' : 'BIO_Blood urea nitrogen (mg/dL)',
    'LBDSBUSI' : 'BIO_Blood urea nitrogen (mmol/L)',
    'LBXSC3SI' : 'BIO_Bicarbonate (mmol/L)',
    'LBDSCASI' : 'BIO_Total calcium (mmol/L)',
    'LBDSCHSI' : 'BIO_Cholesterol (mmol/L)',
    #'LBXSCK' : 'BIO_Creatine Phosphokinase(CPK) (IU/L)',
    'LBXSCLSI' : 'BIO_Chloride (mmol/L)',
    'LBDSCRSI' : 'BIO_Creatinine (umol/L)',
    'LBDSGBSI' : 'BIO_Globulin (g/L)',
    'LBDSGLSI' : 'BIO_Glucose, refrigerated serum (mmol/L)',
    'LBXSGTSI' : 'BIO_Gamma glutamyl transferase (U/L)',
    'LBDSIRSI' : 'BIO_Iron, refrigerated serum (umol/L)',
    'LBXSKSI' : 'BIO_Potassium (mmol/L)',
   # 'LBXSLDSI' : 'BIO_Lactate dehydrogenase (U/L)',
    'LBXSNASI' : 'BIO_Sodium (mmol/L)',
    'LBXSOSSI' : 'BIO_Osmolality (mmol/Kg)',
    'LBDSPHSI' : 'BIO_Phosphorus (mmol/L)',
    'LBDSTBSI' : 'BIO_Total bilirubin (umol/L)',
    'LBDSTPSI' : 'BIO_Total protein (g/L)',
    'LBDSTRSI' : 'BIO_Triglycerides, refrigerated (mmol/L)',
    'LBDSUASI' : 'BIO_Uric acid (umol/L)'}
    # select the columns i want to keep
    df_bio = df_bio[list(col_dict.keys())].rename(columns=col_dict)
    # merge df_bio with df
    df = pd.merge(df, df_bio, how='inner', on=['SEQN'])
    
    df_final_h = df.copy()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3561 entries, 0 to 3560
Data columns (total 5 columns):
SEQN             3561 non-null float64
Age              3561 non-null float64
Diagnosis Age    0 non-null float64
Breast Cancer    3561 non-null int64
BMI              3515 non-null float64
dtypes: float64(4), int64(1)
memory usage: 166.9 KB


In [6]:
for i in ['B','C','D','E','F','G','H','I']:    
    df_demo= pd.read_sas('./NHANES 2013-2014/other NHANES/DEMO_'+i+'.XPT')
    # selecting specific columns
    # selecting gender: women SEQN and Age
    df_demo = df_demo[(df_demo['RIAGENDR'] == 2 ) & (df_demo['RIDAGEYR'] > 10) ][['SEQN', 'RIDAGEYR']]
    df_demo = df_demo.rename(columns={'RIDAGEYR': 'Age'})

    df_med = pd.read_sas('./NHANES 2013-2014/other NHANES/MCQ_'+i+'.XPT')
    # match only dose SEQN matching the one inf df_demo (seqn variable)
    df_med = df_med.loc[df_med['SEQN'].isin(df_demo['SEQN'])]
    df_med = df_med[['SEQN', 'MCQ230A', 'MCQ240E', 'MCQ220']]
    # add column breast cancer 'Diagnosis' 1 no 0
    df_med['Breast Cancer'] = np.where(df_med['MCQ230A'] == 14,1,0)

    # subset to have women that were diagnosed with breast cancer 
    df_med = df_med[df_med['Breast Cancer'] == 0]
    # changing the column name to diagnosis age and saving the dataset
    df_med = df_med.rename(columns={'MCQ240E' : 'Diagnosis Age'})[['SEQN', 'Diagnosis Age', 'Breast Cancer']]
    # merge df_med with df_demo
    df = pd.merge(df_demo, df_med, on='SEQN', how='inner')

    df_bmx = pd.read_sas('./NHANES 2013-2014/other NHANES/BMX_'+i+'.XPT')
    # chose the ones with SEQN from df_med
    df_bmx = df_bmx.loc[df_bmx['SEQN'].isin(df_med['SEQN'].tolist())][['SEQN', 'BMXBMI']]
    # rename the column
    df_bmx = df_bmx.rename(columns={'BMXBMI': 'BMI' })
    # join df_bmx and df_med into one df
    df = pd.merge(df, df_bmx, how='inner', on=['SEQN'])
    df.info()

    df_blood = pd.read_sas('./NHANES 2013-2014/other NHANES/CBC_'+i+'.XPT')
    # subset to SEQN from df
    df_blood = df_blood.loc[df_blood['SEQN'].isin(df['SEQN'])]
    col_dict = {
    'LBXWBCSI' : 'BL_White blood cell count (1000 cells/uL)',
    'LBXLYPCT' : 'BL_Lymphocyte percent (%)',
    'LBXMOPCT' : 'BL_Monocyte percent (%)',
    'LBXNEPCT' : 'BL_Segmented neutrophils percent (%)',
    'LBXEOPCT' : 'BL_Eosinophils percent (%)',
    'LBXBAPCT' : 'BL_Basophils percent (%)',
    'LBDLYMNO' : 'BL_Lymphocyte number (1000 cells/uL)',
    'LBDMONO' : 'BL_Monocyte number (1000 cells/uL)',
    'LBDNENO' : 'BL_Segmented neutrophils num (1000 cell/uL)',
    'LBDEONO' : 'BL_Eosinophils number (1000 cells/uL)',
    'LBDBANO' : 'BL_Basophils number (1000 cells/uL)',
    'LBXRBCSI' : 'BL_Red blood cell count (million cells/uL)',
    'LBXHGB' : 'BL_Hemoglobin (g/dL)',
    'LBXHCT' : 'BL_Hematocrit (%)',
    'LBXMCVSI' : 'BL_Mean cell volume (fL)',
    'LBXMCHSI' : 'BL_Mean cell hemoglobin (pg)',
    'LBXMC' : 'BL_MCHC (g/dL)',
    'LBXRDW' : 'BL_Red cell distribution width (%)',
    'LBXPLTSI' : 'BL_Platelet count (1000 cells/uL)',
    'LBXMPSI' : 'BL_Mean platelet volume (fL)'
    }
    df_blood = df_blood.rename(columns = col_dict)
    # merge df_blood to df
    df = pd.merge(df, df_blood, how='inner', on=['SEQN'])

    df_bio = pd.read_sas('./NHANES 2013-2014/other NHANES/BIOPRO_'+i+'.XPT')
    col_dict = {
    'SEQN' : 'SEQN',
    'LBDSALSI' : 'BIO_Albumin (g/L)',
    #'LBXSAPSI' : 'BIO_Alkaline phosphatase (IU/L)',
    'LBXSASSI' : 'BIO_Aspartate aminotransferase AST (IU/L)',
    'LBXSATSI' : 'BIO_Alanine aminotransferase ALT (IU/L)',
    'LBXSBU' : 'BIO_Blood urea nitrogen (mg/dL)',
    'LBDSBUSI' : 'BIO_Blood urea nitrogen (mmol/L)',
    'LBXSC3SI' : 'BIO_Bicarbonate (mmol/L)',
    'LBDSCASI' : 'BIO_Total calcium (mmol/L)',
    'LBDSCHSI' : 'BIO_Cholesterol (mmol/L)',
    #'LBXSCK' : 'BIO_Creatine Phosphokinase(CPK) (IU/L)',
    'LBXSCLSI' : 'BIO_Chloride (mmol/L)',
    'LBDSCRSI' : 'BIO_Creatinine (umol/L)',
    'LBDSGBSI' : 'BIO_Globulin (g/L)',
    'LBDSGLSI' : 'BIO_Glucose, refrigerated serum (mmol/L)',
    'LBXSGTSI' : 'BIO_Gamma glutamyl transferase (U/L)',
    'LBDSIRSI' : 'BIO_Iron, refrigerated serum (umol/L)',
    'LBXSKSI' : 'BIO_Potassium (mmol/L)',
   # 'LBXSLDSI' : 'BIO_Lactate dehydrogenase (U/L)',
    'LBXSNASI' : 'BIO_Sodium (mmol/L)',
    'LBXSOSSI' : 'BIO_Osmolality (mmol/Kg)',
    'LBDSPHSI' : 'BIO_Phosphorus (mmol/L)',
    'LBDSTBSI' : 'BIO_Total bilirubin (umol/L)',
    'LBDSTPSI' : 'BIO_Total protein (g/L)',
    'LBDSTRSI' : 'BIO_Triglycerides, refrigerated (mmol/L)',
    'LBDSUASI' : 'BIO_Uric acid (umol/L)'}
    # select the columns i want to keep
    df_bio = df_bio[list(col_dict.keys())].rename(columns=col_dict)
    # merge df_bio with df
    df = pd.merge(df, df_bio, how='inner', on=['SEQN'])
    
    df_final_h = df_final.append(df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3915 entries, 0 to 3914
Data columns (total 5 columns):
SEQN             3915 non-null float64
Age              3915 non-null float64
Diagnosis Age    3 non-null float64
Breast Cancer    3915 non-null int64
BMI              3709 non-null float64
dtypes: float64(4), int64(1)
memory usage: 183.5 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3589 entries, 0 to 3588
Data columns (total 5 columns):
SEQN             3589 non-null float64
Age              3589 non-null float64
Diagnosis Age    1 non-null float64
Breast Cancer    3589 non-null int64
BMI              3516 non-null float64
dtypes: float64(4), int64(1)
memory usage: 168.2 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3615 entries, 0 to 3614
Data columns (total 5 columns):
SEQN             3615 non-null float64
Age              3615 non-null float64
Diagnosis Age    1 non-null float64
Breast Cancer    3615 non-null int64
BMI              3562 non-null float64
dtypes: 

In [7]:
df_final_h['Breast Cancer'].value_counts()

0    3389
1     656
Name: Breast Cancer, dtype: int64

In [8]:
df_final_final = df_final.append(df_final_h)

In [9]:
df_final_final['Breast Cancer'].value_counts()
df_final_final[df_final_final.duplicated()]['Breast Cancer'].value_counts()

1    656
Name: Breast Cancer, dtype: int64

# Clean up

In [10]:
df = df_final_final.copy()

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4701 entries, 0 to 3388
Data columns (total 48 columns):
Age                                            4701 non-null float64
BIO_Alanine aminotransferase ALT (IU/L)        4268 non-null float64
BIO_Albumin (g/L)                              4267 non-null float64
BIO_Aspartate aminotransferase AST (IU/L)      4268 non-null float64
BIO_Bicarbonate (mmol/L)                       4268 non-null float64
BIO_Blood urea nitrogen (mg/dL)                4266 non-null float64
BIO_Blood urea nitrogen (mmol/L)               4266 non-null float64
BIO_Chloride (mmol/L)                          4268 non-null float64
BIO_Cholesterol (mmol/L)                       4266 non-null float64
BIO_Creatinine (umol/L)                        4266 non-null float64
BIO_Gamma glutamyl transferase (U/L)           4268 non-null float64
BIO_Globulin (g/L)                             4267 non-null float64
BIO_Glucose, refrigerated serum (mmol/L)       4268 non-null floa

In [12]:
#LBXMCH has only null values, therefore it will be dropped
df = df.drop(columns=['LBXMCH' , 'Diagnosis Age'])

In [13]:
# handle missing values
df = df.dropna(thresh = 45)

In [14]:
# clean column names
df.columns = df.columns.str.lower().str.capitalize()

In [15]:
#rename the Seqn column
df = df.rename(columns={'Seqn':'Id'})

In [16]:
# control for duplicates
df[df.duplicated()]['Breast cancer'].value_counts()

1    560
Name: Breast cancer, dtype: int64

In [17]:
# there has to be something off with the code COME BACK TO THIS AND FIX IT LATER
df = df.drop_duplicates()

In [18]:
df['Breast cancer'].value_counts()

0    3087
1     560
Name: Breast cancer, dtype: int64

In this dataset, 560 women have been diagnosed with cancer. I will randomly choose 78 healthy women to have groups of comparable sizes

In [19]:
df_healthy = df.loc[df['Breast cancer'] == 0]
df_healthy = df_healthy.sample(n=560)

In [20]:
df_final = df_healthy.append(df[df['Breast cancer']==1])

In [21]:
df_final = df_final.sort_values(by=['Id'])

In [23]:
df_final.duplicated().sum()

0

In [25]:
df_final.to_csv('nhanes_breast_cancer_all_nhanes.csv', index=False)