# `NHAINESStat` Populator

## Part 1: Set-Up

In [None]:
# Imports
import uuid
from confluent_kafka.admin import AdminClient, NewTopic
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.types import FloatType
from pyspark.sql.types import StringType
import pandas as pd

# Get config
from config import user
from config import password

In [None]:
# Mount the storage container through OAuth (service-principal credentials).
storageAccount = "gen10datafund2205"
storageContainer = "group5container"
# NOTE(security): this client secret is stored in plain text in the notebook.
# It should be rotated and loaded from a Databricks secret scope
# (dbutils.secrets.get) or from the `config` module, the way `user` and
# `password` already are. The value is left in place here only so the
# notebook keeps running unchanged.
clientSecret = "-ZS8Q~NwOKfwEpVOg3Teb1pPtxDbz616XjlXLbuU"
clientid = "2ca50102-5717-4373-b796-39d06568588d"
mount_point = "/mnt/jacklynn/nhaines"

# OAuth settings passed to dbutils.fs.mount as extra_configs
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": clientid,
    "fs.azure.account.oauth2.client.secret": clientSecret,
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
    "fs.azure.createRemoteFileSystemDuringInitialization": "true",
}

# Unmount first so the mount below always uses the configuration above.
# This stays best-effort (e.g. nothing may be mounted there yet), but only
# ordinary errors are ignored: a bare `except:` would also swallow
# KeyboardInterrupt and SystemExit.
try:
    dbutils.fs.unmount(mount_point)
except Exception:
    pass

# Mount the container at mount_point
dbutils.fs.mount(
    source=f"abfss://{storageContainer}@{storageAccount}.dfs.core.windows.net/",
    mount_point=mount_point,
    extra_configs=configs)

# SQL Server connection variables used by the JDBC helper functions
database = "group5database"
server = "gen10-data-fundamentals-22-05-sql-server.database.windows.net"
port = "1433"

In [None]:
%fs 
ls /mnt/jacklynn/nhaines

path,name,size,modificationTime
dbfs:/mnt/jacklynn/nhaines/CGM_Data.csv,CGM_Data.csv,35978185,1659468631000
dbfs:/mnt/jacklynn/nhaines/Diabetes Prevalence in the US by State and Demographic.csv,Diabetes Prevalence in the US by State and Demographic.csv,180068,1659497875000
dbfs:/mnt/jacklynn/nhaines/Education by state.csv,Education by state.csv,3516,1659576860000
dbfs:/mnt/jacklynn/nhaines/ExerciseData_2013_150min.csv,ExerciseData_2013_150min.csv,1544,1659645073000
dbfs:/mnt/jacklynn/nhaines/Food Insecurity.csv,Food Insecurity.csv,6779,1659533925000
dbfs:/mnt/jacklynn/nhaines/Income Brackets by State.csv,Income Brackets by State.csv,4675,1659578726000
dbfs:/mnt/jacklynn/nhaines/U.S. NHANES Survey Data.csv,U.S. NHANES Survey Data.csv,1005266,1659710519000
dbfs:/mnt/jacklynn/nhaines/chinese-diabetes-clean.csv,chinese-diabetes-clean.csv,33367142,1659541878000


In [None]:
# Helper function: load a SQL Server table into a Spark DataFrame
def readInTable(table_name):
    """Read `table_name` over JDBC and return it as a Spark DataFrame.

    The connection is built from the notebook-level `server`, `port`,
    `database`, `user` and `password` values.
    """
    jdbc_options = {
        "url": f"jdbc:sqlserver://{server}:{port};databaseName={database};",
        "dbtable": table_name,
        "user": user,
        "password": password,
        "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    }
    return spark.read.format("jdbc").options(**jdbc_options).load()

# Helper function: load a CSV file into a Spark DataFrame
def readInFile(f):
    """Read the comma-delimited CSV at path `f` and return a Spark DataFrame.

    The first row is used as the header and column types are inferred.
    """
    csv_reader = (
        spark.read
        .option("inferSchema", 'True')
        .option("delimiter", ',')
        .option("header", 'True')
    )
    return csv_reader.csv(f)

# Helper function: write a Spark DataFrame to a SQL Server table
def saveToTable(df, table, change='append'):
    """Write `df` to the SQL Server table `table` over JDBC.

    Args:
        df: Spark DataFrame to write.
        table: Target table name, e.g. "dbo.Demographic".
        change: Save mode passed to the writer's `.mode(...)`; defaults to
            'append'.
    """
    # Use the shared `port` variable (as readInTable does) rather than a
    # hard-coded 1433, so reads and writes always target the same endpoint.
    writer = (
        df.write.format('jdbc')
        .option("url", f"jdbc:sqlserver://{server}:{port};databaseName={database};")
        .mode(change)
        .option("dbtable", table)
        .option("user", user)
        .option("password", password)
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    )
    writer.save()

# Helper function: build a {key column -> value column} lookup from a table
def formDictConverter(table, key, value):
    """Return a dict mapping each distinct `key` in `table` to its `value`.

    Args:
        table: Name of the SQL table to read (via readInTable).
        key: Column whose values become the dictionary keys.
        value: Column whose values become the dictionary values.

    Returns:
        dict of key -> value. When a key appears with several different
        values, the first one in the collected data wins. Rows whose key is
        null are skipped (the previous per-key lookup raised IndexError on
        them).
    """
    data = readInTable(table).select([key, value]).distinct().toPandas()[[key, value]]
    # Keep one row per key (the first) instead of re-scanning the whole frame
    # for every key, which was quadratic in the number of rows.
    data = data.dropna(subset=[key]).drop_duplicates(subset=[key], keep="first")
    # to_list() yields plain Python scalars, matching the previous output.
    return dict(zip(data[key].to_list(), data[value].to_list()))

## Part 2: Populate the `Demographic` Table

In [None]:
# Get starting demoID: one past the largest demoID already stored
table_demo = "dbo.Demographic"
df_demo = readInTable(table_demo)
max_demo_id = df_demo.agg({"demoID": "max"}).collect()[0]['max(demoID)']
# max() over an empty table yields None, and None + 1 raises TypeError,
# so fall back to a fixed first ID in that case.
# NOTE(review): confirm that 1 is the intended first ID for an empty table.
j = 1 if max_demo_id is None else max_demo_id + 1

# Write function for adding the NHANES demographic groups
def addDemographics():
    """Insert the demographic groups missing from `dbo.Demographic`.

    Five categories are processed in order: drinking, smoking, education
    level, income bracket and race/ethnicity. The first two use fixed labels;
    the other three take the distinct values of the matching column in the
    NHANES survey CSV. Labels already present in the table (compared as-is,
    capitalized and lower-cased) are not inserted again. New rows take their
    `demoID` from the notebook-level counter `j`, which is advanced for each
    row.

    Returns:
        dict mapping every `demo_group` in `dbo.Demographic` to its `demoID`,
        read back after the inserts.
    """
    # Useful variables
    schema = ['demoID', 'demo_group', 'category']
    table_demo = "dbo.Demographic"

    # Helper function: build one table row and advance the global ID counter
    def createCategory(group, category):
        global j
        demoID = j
        j += 1
        return [demoID, group, category]

    # Helper function: keep only the groups that are not in the table yet
    def dedup(starting_list):
        # Re-read the table on every call so rows saved for an earlier
        # category are taken into account.
        used = set(
            readInTable(table_demo).select(['demo_group']).toPandas()['demo_group'].to_list()
        )
        # Build a new list rather than removing from the list being iterated:
        # removing in place skips the element after each removal, which let
        # already-stored groups through and re-inserted them.
        # Null labels are skipped; they cannot name a demographic group.
        return [
            demo for demo in starting_list
            if demo is not None
            and demo not in used
            and demo.capitalize() not in used
            and demo.lower() not in used
        ]

    # Helper function: save the not-yet-stored groups under one category
    def addNewDemos(candidates, category):
        new_demos = dedup(candidates)
        # Only save if something is left after de-duplication
        if not new_demos:
            return
        rows = [createCategory(demo, category) for demo in new_demos]
        saveToTable(spark.createDataFrame(data=rows, schema=schema), table_demo)

    # Get the NHANES survey columns that hold demographic labels
    df_nhaines = readInFile('/mnt/jacklynn/nhaines/U.S. NHANES Survey Data.csv')
    df_nhaines = df_nhaines.select(['AdultEducationLevel', 'AnnualHouseholdIncome', 'Ethnicity'])

    # Helper function: distinct values of one survey column as a Python list
    def distinctValues(column):
        return df_nhaines.select(column).distinct().toPandas()[column].to_list()

    # Fixed-label categories
    addNewDemos(['12+ alc. drinks/yr', '<12 alc. drinks/yr'], 'drinking (NHAINES)')
    addNewDemos(['Smoked 100+ cigs.', 'Smoked <100 cigs.'], 'smoking (NHAINES)')

    # Categories whose labels come from the survey data
    addNewDemos(distinctValues('AdultEducationLevel'), 'education level')
    addNewDemos(distinctValues('AnnualHouseholdIncome'), 'income bracket')
    addNewDemos(distinctValues('Ethnicity'), 'race/ethnicity')

    # Return dictionary with demo_group -> demoID conversions
    return formDictConverter(table_demo, 'demo_group', 'demoID')

In [None]:
# Insert any missing demographic groups and get the demo_group -> demoID lookup
demosToID = addDemographics()

# Register capitalized aliases ('Female', 'Male') for the lower-case entries
for sex_label in ('female', 'male'):
    demosToID[sex_label.capitalize()] = demosToID[sex_label]

## Part 3: Populate the `NHAINESStat` Table

In [None]:
# Helper function: rename several columns at once
def renameCols(df, prev_cols, new_cols):
    """Return `df` with each column in `prev_cols` renamed to the entry at
    the same position in `new_cols`."""
    renamed = df
    for position, old_name in enumerate(prev_cols):
        renamed = renamed.withColumnRenamed(old_name, new_cols[position])
    return renamed

# Helper function: drop columns
def dropCols(df, drop_cols):
    """Return `df` without the columns named in `drop_cols`.

    Each name is first dropped through a Column expression; if that attempt
    raises, the drop is retried with the plain column name.
    """
    for this_cols in drop_cols:
        try:
            df = df.drop(col(this_cols))
        except Exception:
            # Fall back to dropping by name. Catching Exception instead of a
            # bare `except:` keeps KeyboardInterrupt/SystemExit propagating.
            df = df.drop(this_cols)
    return df

# Helper function: add constant-valued columns
def addCols(df, colNames, addValues):
    """Return `df` with one literal column per name in `colNames`, each
    filled with the value at the same position in `addValues`."""
    extended = df
    for position, name in enumerate(colNames):
        extended = extended.withColumn(name, lit(addValues[position]))
    return extended

# Helper function: convert demo to demoID
def catToID(df, merge_val, cat_cols, dictionary):
    """Replace the values in `cat_cols` using the mapping in `dictionary`.

    The category columns plus the `merge_val` columns are collected to
    pandas, each category column is rewritten with `dictionary`, and the
    result is joined back onto `df` (with the original category columns
    dropped) on `merge_val`.

    Args:
        df: Spark DataFrame holding the columns to convert.
        merge_val: List of column names used as the join key.
        cat_cols: List of column names whose values are replaced.
        dictionary: Mapping of original value -> replacement value.

    Returns:
        Spark DataFrame with the converted category columns.
    """
    df_replace = df.select(cat_cols + merge_val).toPandas()
    for cat in cat_cols:
        df_replace = df_replace.replace({cat: dictionary})
        try:
            df = df.drop(col(cat))
        except Exception:
            # Fall back to dropping by name. Catching Exception instead of a
            # bare `except:` keeps KeyboardInterrupt/SystemExit propagating.
            df = df.drop(cat)
    df_replace = spark.createDataFrame(df_replace)
    df = df.join(df_replace, on=merge_val)
    return df

In [None]:
# Load the NHANES survey data
df_nhaines = readInFile('/mnt/jacklynn/nhaines/U.S. NHANES Survey Data.csv')

# Source CSV header -> NHAINESStat column name
# NOTE(review): 'NumMealsNotAtHomePerMonth' is stored as 'mealsAtHome', which
# reads as the opposite of the source column; confirm the intended name.
rename_map = {
    'SurveyID': 'nhainesID',
    'SystolicBP': 'sbp',
    'DiastolicBP': 'dbp',
    '12drinksInaYear': 'drinkerID',
    'BeenDiagnostedHypertenisve': 'hypertensive',
    'MultipleHypertensionDiagnosis': 'multihypertensive',
    'DiagnosedDiabetic': 'diabetes',
    'DiagnosedPrediabetic': 'prediabetes',
    'DiagnosedAtRiskDiabetes': 'diabetesRisk',
    'NumMealsNotAtHomePerMonth': 'mealsAtHome',
    'FamilyMonthlyPovertLevel': 'familyMonthlyPoverty',
    'Smoked100cigs': 'smokerID',
    'Height': 'height',
    'Weight': 'weight',
    'Gender': 'sexID',
    'Age(yrs)': 'age',
    'Ethnicity': 'ethnicityID',
    'AdultEducationLevel': 'educationID',
    'AnnualHouseholdIncome': 'incomeID',
    'BMI': 'bmi',
}
orig_cols = list(rename_map.keys())
new_cols = list(rename_map.values())
df_nhaines = renameCols(df_nhaines, orig_cols, new_cols)

# Remove the columns that are not stored in the table
drop_cols = ['_c0', 'DrinksInLastYear', 'CurrentSmoker']
df_nhaines = dropCols(df_nhaines, drop_cols)

# Map the drinker flag (True/False) to the matching demoID
drinkerDict = {
    True: demosToID['12+ alc. drinks/yr'],
    False: demosToID['<12 alc. drinks/yr'],
}
df_nhaines = catToID(df_nhaines, ['nhainesID'], ['drinkerID'], drinkerDict)

# Map the smoker flag (True/False) to the matching demoID
smokerDict = {
    True: demosToID['Smoked 100+ cigs.'],
    False: demosToID['Smoked <100 cigs.'],
}
df_nhaines = catToID(df_nhaines, ['nhainesID'], ['smokerID'], smokerDict)

# Map the sex, income, education and ethnicity labels to their demoIDs
df_nhaines = catToID(df_nhaines, ['nhainesID'], ['sexID', 'incomeID', 'educationID', 'ethnicityID'], demosToID)

# Write the result to the NHAINESStat table, replacing its current contents
saveToTable(df_nhaines, 'dbo.NHAINESStat', change='overwrite')