# `DiabetesPop` Populator

## Part 1: Set-Up

In [None]:
# Imports
import uuid
from confluent_kafka.admin import AdminClient, NewTopic
from pyspark.sql.functions import col

# Get config
from config import user
from config import password

In [None]:
# Mount the ADLS Gen2 container on DBFS, authenticating as a service principal via OAuth.
storageAccount = "gen10datafund2205"
storageContainer = "group5container"
# The client secret must never live in the notebook source: read it from a
# Databricks secret scope at run time. The value that used to be hard-coded
# here has been exposed and should be rotated.
# NOTE(review): the scope and key names below are placeholders - point them at
# the secret scope that actually holds this service principal's secret.
clientSecret = dbutils.secrets.get(scope="group5-secrets", key="adls-client-secret")
clientid = "2ca50102-5717-4373-b796-39d06568588d"
mount_point = "/mnt/jacklynn/chinese"

configs = {"fs.azure.account.auth.type": "OAuth",
       "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
       "fs.azure.account.oauth2.client.id": clientid,
       "fs.azure.account.oauth2.client.secret": clientSecret,
       "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
       "fs.azure.createRemoteFileSystemDuringInitialization": "true"}

# Unmount only when the mount point is already in use. Checking the mount
# table (instead of wrapping unmount in a bare `except: pass`) means a real
# failure - permissions, cluster problems - is reported rather than hidden.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

# Mount the storage container at the mount point
dbutils.fs.mount(
    source = "abfss://"+storageContainer+"@"+storageAccount+".dfs.core.windows.net/",
    mount_point = mount_point,
    extra_configs = configs)

In [None]:
%fs 
ls /mnt/jacklynn/chinese

path,name,size,modificationTime
dbfs:/mnt/jacklynn/chinese/CGM_Data.csv,CGM_Data.csv,35978185,1659468631000
dbfs:/mnt/jacklynn/chinese/Diabetes Prevalence in the US by State and Demographic.csv,Diabetes Prevalence in the US by State and Demographic.csv,180068,1659497875000
dbfs:/mnt/jacklynn/chinese/Education by state.csv,Education by state.csv,3516,1659576860000
dbfs:/mnt/jacklynn/chinese/ExerciseData_2013_150min.csv,ExerciseData_2013_150min.csv,1544,1659645073000
dbfs:/mnt/jacklynn/chinese/Food Insecurity.csv,Food Insecurity.csv,6779,1659533925000
dbfs:/mnt/jacklynn/chinese/Income Brackets by State.csv,Income Brackets by State.csv,4675,1659578726000
dbfs:/mnt/jacklynn/chinese/U.S. NHANES Survey Data.csv,U.S. NHANES Survey Data.csv,1005266,1659710519000
dbfs:/mnt/jacklynn/chinese/chinese-diabetes-clean.csv,chinese-diabetes-clean.csv,33367142,1659541878000


In [None]:
# Load the cleaned diabetes CSV from the mounted container.
# The first line is treated as a header and column types are inferred.
csv_path = '/mnt/jacklynn/chinese/chinese-diabetes-clean.csv'
reader = (
    spark.read
    .option('inferSchema', 'True')
    .option('delimiter', ',')
    .option('header', 'True')
)
df = reader.csv(csv_path)

# Preview the loaded rows
display(df)

_c0,age,sex,height,weight,bmi,sbp,dbp,fpg,chol,tg,hdlc,ldl,alt,ast,bun,ccr,fpg_final,diabetes,smoker,drinker,fam_hist
0,43,female,166.4,53.5,19.3,96.0,57.0,4.99,5.13,0.78,1.3724964084811768,2.7631269860458687,10.0,23.99254234510529,3.08,50.3,4.97,no,never smoker,never drinker,family history of diabetes
1,34,male,169.0,57.0,20.0,124.0,69.0,3.51,4.61,1.75,1.09,3.13,29.1,23.99254234510529,6.13,83.7,5.5,no,no info,no info,no family history of diabetes
2,32,female,157.0,51.0,20.7,98.0,68.0,4.25,4.73,0.47,1.3724964084811768,2.7631269860458687,6.9,19.5,4.45,42.8,4.9,no,no info,no info,no family history of diabetes
3,59,male,165.0,63.0,23.1,136.0,73.0,5.7,4.5,0.75,1.53,2.8,15.0,23.99254234510529,5.76,62.6,5.5,no,never smoker,never drinker,no family history of diabetes
4,30,female,163.5,48.5,18.1,107.0,76.0,4.42,3.48,0.52,1.55,1.43,12.0,19.9,2.51,48.9,4.82,no,no info,no info,no family history of diabetes
5,41,female,157.0,57.9,23.5,119.0,83.0,4.53,3.7,1.39,1.3724964084811768,2.7631269860458687,34.0,23.0,3.85,53.0,5.0,no,no info,no info,no family history of diabetes
6,51,female,170.0,61.8,21.4,123.0,66.0,4.6,4.77,1.29,1.3724964084811768,2.7631269860458687,12.1,13.2,3.1,41.2,4.8,no,no info,no info,no family history of diabetes
7,36,male,166.0,66.7,24.2,111.0,67.0,4.69,4.75,1.02,1.2,3.09,24.6,23.99254234510529,5.95,90.7,5.32,no,no info,no info,no family history of diabetes
8,31,male,167.0,61.5,22.1,130.0,61.0,4.7,3.93,0.76,1.26,2.21,46.0,23.99254234510529,8.3,75.5,4.8,no,no info,no info,no family history of diabetes
9,31,male,161.5,58.5,22.4,104.0,74.0,5.78,4.48,0.78,1.3724964084811768,2.7631269860458687,23.6,23.99254234510529,6.82,96.5,5.1,no,never smoker,never drinker,no family history of diabetes


## Part 2: Populate `Demographic` Table

In [None]:
# Create function that quickly formats the category data
def createDataCategories(df_i, cols, cats, data):
    """Append one demographic record per distinct value of each listed column.

    For every column in ``cols`` the distinct values are collected from
    ``df_i`` and one dict with the keys ``demoID``, ``demo_group`` and
    ``category`` is appended to ``data``. IDs count up from 0 across all
    columns, in the order the columns are listed.

    Args:
        df_i: DataFrame to read from; must support
            ``select(name).distinct().toPandas()``.
        cols: Names of the columns whose distinct values become groups.
        cats: Category label for each entry of ``cols`` (same position).
        data: List the records are appended to (mutated in place).

    Raises:
        IndexError: If ``cats`` has fewer entries than ``cols``.
    """
    # Fail before touching `data` so a bad call cannot leave it half-filled.
    if len(cats) < len(cols):
        raise IndexError("cats must supply a label for every column in cols")

    next_id = 0
    for column, category in zip(cols, cats):
        groups = df_i.select(column).distinct().toPandas()[column].to_list()
        # distinct() gives no ordering guarantee, so sort the groups to keep
        # the generated IDs identical from one run to the next. Missing
        # values (None) are placed last so they cannot break the comparison.
        for group in sorted(groups, key=lambda value: (value is None, str(value))):
            data.append({
                'demoID': next_id,
                'demo_group': group,
                'category': category,
            })
            next_id += 1

# Build the rows of the demographic table, then load them into a Spark DataFrame
source_columns = ['sex', 'smoker', 'drinker', 'fam_hist']
category_labels = ['sex', 'smoker', 'drinker', 'family history']
data = list()
createDataCategories(df, source_columns, category_labels, data)
df_cat = spark.createDataFrame(data)

# Preview the generated demographic rows
display(df_cat)

category,demoID,demo_group
sex,0,female
sex,1,male
smoker,2,former smoker
smoker,3,never smoker
smoker,4,current smoker
smoker,5,no info
drinker,6,current drinker
drinker,7,never drinker
drinker,8,no info
drinker,9,former drinker


In [None]:
# Save demographic data to Demographic SQL table
database = "group5database"
table_cat = "dbo.Demographic"
# `user` and `password` come from the `config` import at the top of the
# notebook. They are deliberately not redefined here so that no credential is
# stored in the notebook source; the password that used to be hard-coded in
# this cell has been exposed and should be rotated.
server = "gen10-data-fundamentals-22-05-sql-server.database.windows.net"
port = "1433"
jdbc_url = f"jdbc:sqlserver://{server}:{port};databaseName={database};"

# mode("overwrite") replaces the existing contents of the target table.
(
    df_cat.write.format('jdbc')
    .option("url", jdbc_url)
    .mode("overwrite")
    .option("dbtable", table_cat)
    .option("user", user)
    .option("password", password)
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .save()
)

## Part 3: Populate `DiabetesPop` Table

In [None]:
# Map each source column to the category label it was given in the Demographic table
column_to_category = {
    'sex': 'sex',
    'smoker': 'smoker',
    'drinker': 'drinker',
    'fam_hist': 'family history',
}

# Create one group -> demoID lookup per category. The lookup has to be keyed by
# category because the same group label can exist under several categories
# ("no info" is both a smoker and a drinker group, with different demoIDs); a
# single flat dictionary would send both columns to the same Demographic row.
demo_rows = df_cat.select(['category', 'demo_group', 'demoID']).distinct().toPandas()
convertDemoToID = {
    category: dict(zip(rows['demo_group'], rows['demoID']))
    for category, rows in demo_rows.groupby('category')
}
print(convertDemoToID)

# Format the DataFrame columns that reference the Demographic table:
# swap each group label for the demoID of its own category.
df_replace = df.select(['_c0', *column_to_category]).toPandas()
for column, category in column_to_category.items():
    df_replace = df_replace.replace({column: convertDemoToID[category]})
    # Drop the label column from the Spark side; the ID version is joined back below.
    df = df.drop(col(column))
df_replace = spark.createDataFrame(df_replace)
df = df.join(df_replace, on=['_c0'])
df = df.withColumnRenamed('_c0', 'personID')

In [None]:
# Write the populated DataFrame to the DiabetesPop SQL table,
# replacing whatever the table currently holds.
table = "dbo.DiabetesPop"
jdbc_options = {
    "url": f"jdbc:sqlserver://{server}:1433;databaseName={database};",
    "dbtable": table,
    "user": user,
    "password": password,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
}
df.write.format('jdbc').mode("overwrite").options(**jdbc_options).save()