# Exploratory Data Analysis

Youth Risk Behavior Survey (YRBS) data has been converted from fixed-width format to comma-separated values (CSV). The dataset was split into two files by state name: A-M and N-Z. In this notebook, we merge the two files and perform some basic cleanup and initial exploration.


In [None]:
import sys
import time
import pandas as pd
import numpy as np
from pathlib import Path
from sqlalchemy import inspect, create_engine
import hvplot.pandas

# Raise pandas' display limits (rows, columns, console width) for this session.
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Set the file paths and read data

Download the converted CSV files from here: https://drive.google.com/file/d/11KfEMD5GrSDqIwB6jURa45W0fX2yezXF/view?usp=sharing

Then extract the contents of the zip file into the `data` directory. You should end up with four CSV files next to the downloaded archive:

        <project root>
        |-- LICENSE
        |-- README.md
        '-- data
            |-- SADC_a_m_Q.csv
            |-- SADC_a_m_QN.csv
            |-- SADC_n_z_Q.csv
            |-- SADC_n_z_QN.csv
            '-- cdc_yrbss_state_data.zip
 


In [None]:
# Directory that holds the extracted YRBS CSV files; change this one value if
# the data lives somewhere else.
DATA_DIR = Path("data")

# State-level survey responses, split into two files by state name (A-M, N-Z).
state_a_m_datafile = DATA_DIR / "SADC_a_m_Q.csv"
state_n_z_datafile = DATA_DIR / "SADC_n_z_Q.csv"

In [None]:
# Load both halves of the dataset and report how long the reads took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
state_a_m_df = pd.read_csv(state_a_m_datafile)
state_n_z_df = pd.read_csv(state_n_z_datafile)
current_time = time.perf_counter()
elapsed_time = current_time - start_time  # seconds
print(f"duration: {elapsed_time}")

In [None]:
# Report the dimensions of each half, then show its first and last row.
am_shape, nz_shape = state_a_m_df.shape, state_n_z_df.shape
print(f"A-M Shape: {am_shape}")
print(f"N-Z Shape: {nz_shape}\n\n")

print("Heads:")
for half_df in (state_a_m_df, state_n_z_df):
    display(half_df.head(1))

print("Tails:")
for half_df in (state_a_m_df, state_n_z_df):
    display(half_df.tail(1))

## Transform

### Start by concatenating the DataFrames

In [None]:
# Stack the A-M and N-Z halves into a single frame.
# ignore_index=True renumbers the rows 0..n-1. Without it each half keeps the
# 0-based labels read_csv gave it, so the combined index would contain
# duplicate labels and a label lookup such as .loc[0] would return two rows.
state_df = pd.concat([state_a_m_df, state_n_z_df], ignore_index=True)
# print(f"state_df Shape: {state_df.shape}\n\n")
# print("Head:")
# display(state_df.head(3))
# print("Tail:")
# display(state_df.tail(3))

Trim to the first 27 columns (this step is currently commented out, so all columns are kept)

In [None]:
# cols = ["sitecode", "sitename", "sitetype", "sitetypenum", "year", "survyear", "weight", "stratum", "PSU", "record",
# "age", "sex", "grade", "race4", "race7", "stheight", "stweight", "bmi", "bmipct", "qnobese", "qnowt", "q66", 
# "q65", "sexid", "sexid2", "sexpart", "sexpart2"]

# state_df = state_df[cols].copy()

In [None]:
# state_df.info()

## Fix year values

In [None]:
# Cast the survey year to an integer dtype and write it back to the same column.
year_as_int = state_df["year"].astype('int')
state_df["year"] = year_as_int

In [None]:
# display(state_df["year"].value_counts(3))

In [None]:
# Drop the legacy record id column.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
state_df = state_df.drop(columns=["record"], errors="ignore")

## Address any NaN values



In [None]:
# Replace every missing value in the frame with 0.
# (Uncomment to inspect the null mask first: state_df.isna())
state_df = state_df.fillna(value=0)

In [None]:
# print(f"Shape: {state_df.shape}\n\n")
# print("Head:")
# display(state_df.head(3))
# print("Tail:")
# display(state_df.tail(3))

In [None]:
# Columns listed here are cast to int so their values carry no decimal part.
# Any null still present in them is filled with 0 before the cast.
cols_int = ["sitetypenum", "survyear", "stratum", "PSU", "age", "sex", "grade", "race4", "race7", "qnobese", "qnowt", "q66", "q65", "sexid", "sexid2", "sexpart", "sexpart2", 
            "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23", "q24", "q25", "q26", "q27", "q28", "q29", 
            "q30", "q31", "q32", "q33", "q34", "q35", "q36", "q37", "q38", "q39", "q40", "q41", "q42", "q43", "q44", "q45", "q46", "q47", "q48", "q49", 
            "q50", "q51", "q52", "q53", "q54", "q55", "q56", "q57", "q58", "q59", "q60", "q61", "q62", "q63", "q64", "q67", "q68", "q69", 
            "q70", "q71", "q72", "q73", "q74", "q75", "q76", "q77", "q78", "q79", "q80", "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", 
            "qbikehelmet", "qdrivemarijuana", "qcelldriving", "qpropertydamage", "qbullyweight", "qbullygender", "qbullygay", "qchokeself", "qcigschool", 
            "qchewtobschool", "qalcoholschool", "qtypealcohol2", "qhowmarijuana", "qmarijuanaschool", "qcurrentopioid", "qcurrentcocaine", "qcurrentheroin", "qcurrentmeth", 
            "qhallucdrug", "qprescription30d", "qgenderexp", "qtaughtHIV", "qtaughtsexed", "qtaughtstd", "qtaughtcondom", "qtaughtbc", "qdietpop", "qcoffeetea", "qsportsdrink", 
            "qenergydrink", "qsugardrink", "qwater", "qfastfood", "qfoodallergy", "qwenthungry", "qmusclestrength", "qsunscreenuse", "qindoortanning", "qsunburn", "qconcentrating", 
            "qcurrentasthma", "qwheresleep", "qspeakenglish", "qtransgender"]

# Fill and cast all listed columns in one vectorized step instead of looping
# column by column. A name in cols_int that is absent from state_df raises
# KeyError on the selection, as the per-column version did.
state_df[cols_int] = state_df[cols_int].fillna(0).astype('int')
# display(state_df.info())

In [None]:
# Show the dtype of every column in state_df (rendered as the cell's output).
state_df.dtypes

In [None]:
# Work on an independent (deep) copy of state_df under the name `df`,
# and preview its first three rows.
df = state_df.copy(deep=True)
display(df.head(n=3))

## Save to SQL Database

In [None]:
# Write the frame to a local SQLite file, replacing any existing STATE table.
connection_string = 'sqlite:///data/cdc_yrbs_state_data.db'
engine = create_engine(connection_string)
df.to_sql('STATE', engine, index=False, if_exists='replace')

# Build the inspector only after the write so it reflects the table just created.
insp = inspect(engine)
print(insp.get_table_names())

# Check the round trip: peek at a few rows, then count them all.
display(pd.read_sql_query("SELECT * FROM STATE LIMIT 3;", con=engine))
# COUNT(*) is the standard SQL row count; the argument-less COUNT() form is
# not portable SQL. The alias gives the result column a readable name.
display(pd.read_sql_query("SELECT COUNT(*) AS row_count FROM STATE;", con=engine))

## Read Sample Data into Pandas

In [None]:
# Read the whole STATE table back into a DataFrame and show its dimensions.
select_all_query = "SELECT * FROM STATE;"
sql_data_df = pd.read_sql_query(sql=select_all_query, con=engine)
sql_data_df.shape

In [None]:
# Confirm what came back from the database: overall shape, then the first and
# last three rows.
rows_and_cols = sql_data_df.shape
print(f"Imported from SQL Dataframe Shape: {rows_and_cols}\n\n")
for heading, preview_rows in (("Head:", sql_data_df.head(3)), ("Tail:", sql_data_df.tail(3))):
    print(heading)
    display(preview_rows)

In [None]:
# Code -> label tables for the demographic columns, built once instead of on
# every call to use_dict().
_VALUE_LABELS = {
    "age": {
        1: "12 years old or younger",
        2: "13 years old",
        3: "14 years old",
        4: "15 years old",
        5: "16 years old",
        6: "17 years old",
        7: "18 years old or older",
    },
    "sex": {
        1: "Female",
        2: "Male",
        3: "Other",
    },
    "grade": {
        1: "9th grade",
        2: "10th grade",
        3: "11th grade",
        4: "12th grade",
        5: "Ungraded or other grade",
    },
    "race4": {
        1: "White",
        2: "Black or African American",
        3: "Hispanic/Latino",
        4: "All Other Races",
    },
    "race7": {
        1: "American Indian/Alaska Native",
        2: "Asian",
        3: "Black or African American",
        4: "Hispanic/Latino",
        5: "Native Hawaiian/Other Pacific Islander",
        6: "White",
        7: "Multiple Races (Non-Hispanic)",
    },
    "q66": {
        1: "Heterosexual (straight)",
        2: "Gay or lesbian",
        3: "Bisexual",
        4: "Not sure",
    },
    "q65": {
        1: "I have never had sexual contact",
        2: "Females",
        3: "Males",
        4: "Females and males",
    },
}
# The race7 table was previously reachable only through the misspelled key
# "grarace7de"; keep that spelling as an alias so existing calls still resolve.
_VALUE_LABELS["grarace7de"] = _VALUE_LABELS["race7"]


def use_dict(column_in, value_in):
    """
    Translates a coded survey value into understandable output.

    Args:
        column_in: Name of the column the value comes from. Supported names
            are "age", "sex", "grade", "race4", "race7", "q66" and "q65".
        value_in: The integer code stored in that column.

    Returns:
        The text label for the code, or None when either the column or the
        code has no entry in the lookup tables.
    """
    # An unknown column falls back to an empty table, so the result is None
    # rather than an UnboundLocalError.
    return _VALUE_LABELS.get(column_in, {}).get(value_in)

print(use_dict("age", 6))

In [None]:



# race4_dict = {
#     1: "White",
#     2: "Black or African American",
#     3: "Hispanic/Latino",
#     4: "All Other Races"
# }

# race7_dict = {
#     1: "American Indian/Alaska Native",
#     2: "Asian",
#     3: "Black or African American",
#     4: "Hispanic/Latino",
#     5: "Native Hawaiian/Other Pacific Islander",
#     6: "White",
#     7: "Multiple Races (Non-Hispanic)"
# }

# q66_dict = {
#     1: "Heterosexual (straight)",
#     2: "Gay or lesbian",
#     3: "Bisexual",
#     4: "Not sure"
# }

# q65_dict = {
#     1: "I have never had sexual contact",
#     2: "Females",
#     3: "Males",
#     4: "Females and males"
# }



In [None]:
# # Rewrite columns data from survey question mapping dictionaries
# df = state_df.copy()
# df=df.replace({"age": age_dict})
# df=df.replace({"sex": sex_dict})
# df=df.replace({"grade": grade_dict})
# df=df.replace({"race4": race4_dict})
# df=df.replace({"race7": race7_dict})
# df=df.replace({"q66": q66_dict})
# df=df.replace({"q65": q65_dict})
# df

In [None]:
# Count how many rows hold each distinct value of the "age" column.
df["age"].value_counts()

In [None]:
# Count how many rows hold each distinct value of the "race4" column.
df["race4"].value_counts()

In [None]:
# Print pandas' built-in summary of df (DataFrame.info).
df.info()