# Exploratory analysis

We begin by loading the required libraries.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In the following cell, we define which cohort we would like to examine, chosen by selecting an education category, gender, income bracket, and age range.

In [None]:
# Cohort selection. Each value is compared with `==` against the matching
# column of the dataset, so it must equal the category label exactly.
(cohort_ed, cohort_gender, cohort_income, cohort_age) = (
    'Some College',
    'Male',
    '$70,000 - $74,999',
    '65-74 years old',
)

We now load the subset of the entire dataset restricted to people living in California aged 65 and over. Note that while this subset was created by a teammate in Excel, one could of course also subset the original dataset using pandas to get the same result.

In [None]:
# Read the pre-filtered CSV into a DataFrame (the path is relative to the
# notebook's working directory).
df_std = pd.read_csv('Data/elderly-ppl-CA.csv')

Rather than consider all possible predictors and STDs, we chose to focus on the following key variables and prevalences:

In [None]:
# Keep only the demographic fields, the observation date, and the STD columns.
df_key = df_std.loc[
    :,
    ['age', 'gender', 'state', 'income', 'education', 'date']
    + ['chlamydia', 'gential_warts', 'gonorrhea', 'herpes', 'hpv',
       'other_std', 'parasitic', 'syphilis', 'trich'],
]

Unfortunately, as the following cell shows, one of the challenges with this dataset is the large number of [NaNs](https://en.wikipedia.org/wiki/NaN?oldformat=true) present:

In [None]:
# Total number of missing values in df_key: per-column NaN counts, summed
# again across columns.
print(df_key.isna().sum().sum())

Consequently, in the following set of cells we tally, for each value of a given categorical variable, the total number of NaNs among the rows with that value, and we identify the value with the fewest NaNs. These are stored in variables such as `min_income_bracket` for the case of income.

In [None]:
# Total NaN count over all rows (and all columns) belonging to each income
# bracket, plus the bracket with the fewest NaNs.
#
# Rows whose income is itself missing are left out: groupby drops NaN keys,
# whereas comparing a column to NaN with `==` matches no rows at all, which
# would report a spurious "nan: 0" entry and make it the winning bracket.
# No running minimum named `min` is kept, so the built-in is not shadowed and
# no magic sentinel value is needed.
nans_per_row = df_key.isna().sum(axis=1)
# sort=False keeps brackets in order of first appearance, so a tie for the
# minimum resolves to the bracket encountered first.
income_nans = nans_per_row.groupby(df_key['income'], sort=False).sum()

income_dict = income_nans.to_dict()
# idxmin() raises on an empty Series; fall back to None when there are no brackets.
min_income_bracket = income_nans.idxmin() if len(income_nans) else None

# Report brackets from fewest to most NaNs.
for key, value in sorted(income_dict.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

In [None]:
# Total NaN count over all rows (and all columns) belonging to each education
# level, plus the level with the fewest NaNs.
#
# Rows whose education is itself missing are left out: groupby drops NaN keys,
# whereas comparing a column to NaN with `==` matches no rows at all, which
# would report a spurious "nan: 0" entry and make it the winning level.
# No running minimum named `min` is kept, so the built-in is not shadowed and
# no magic sentinel value is needed.
nans_per_row = df_key.isna().sum(axis=1)
# sort=False keeps levels in order of first appearance, so a tie for the
# minimum resolves to the level encountered first.
education_nans = nans_per_row.groupby(df_key['education'], sort=False).sum()

education_dict = education_nans.to_dict()
# idxmin() raises on an empty Series; fall back to None when there are no levels.
min_education_bracket = education_nans.idxmin() if len(education_nans) else None

# Report levels from fewest to most NaNs.
for key, value in sorted(education_dict.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

In [None]:
# Total NaN count over all rows (and all columns) belonging to each gender,
# plus the gender with the fewest NaNs.
#
# Rows whose gender is itself missing are left out: groupby drops NaN keys,
# whereas comparing a column to NaN with `==` matches no rows at all, which
# would report a spurious "nan: 0" entry and make it the winning category.
# No running minimum named `min` is kept, so the built-in is not shadowed and
# no magic sentinel value is needed.
nans_per_row = df_key.isna().sum(axis=1)
# sort=False keeps categories in order of first appearance, so a tie for the
# minimum resolves to the category encountered first.
gender_nans = nans_per_row.groupby(df_key['gender'], sort=False).sum()

gender_dict = gender_nans.to_dict()
# idxmin() raises on an empty Series; fall back to None when there are no categories.
min_gender_bracket = gender_nans.idxmin() if len(gender_nans) else None

# Report categories from fewest to most NaNs.
for key, value in sorted(gender_dict.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

In [None]:
# Total NaN count over all rows (and all columns) belonging to each age
# bracket, plus the bracket with the fewest NaNs.
#
# Rows whose age is itself missing are left out: groupby drops NaN keys,
# whereas comparing a column to NaN with `==` matches no rows at all, which
# would report a spurious "nan: 0" entry and make it the winning bracket.
# No running minimum named `min` is kept, so the built-in is not shadowed and
# no magic sentinel value is needed.
nans_per_row = df_key.isna().sum(axis=1)
# sort=False keeps brackets in order of first appearance, so a tie for the
# minimum resolves to the bracket encountered first.
age_nans = nans_per_row.groupby(df_key['age'], sort=False).sum()

age_dict = age_nans.to_dict()
# idxmin() raises on an empty Series; fall back to None when there are no brackets.
min_age_bracket = age_nans.idxmin() if len(age_nans) else None

# Report brackets from fewest to most NaNs.
for key, value in sorted(age_dict.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

Some observations we made from the above set of cells are as follows: 
- The majority of people in this subpopulation have a graduate education 
- The split among male and female is approximately even 
- There are approximately as many 65-74 year olds as there are 75+ year olds

One nice way to visualize the data is to plot the rate of each STD as a function of time for our desired cohort (given by the reduced dataframe `df_red`).

In [None]:
# Build the cohort frame: rows matching all four cohort selectors, with the
# selector columns (and 'state') removed, 'date' parsed to datetimes, and rows
# ordered chronologically.
df_red = (
    df_key
    .loc[lambda d: d['gender'].eq(cohort_gender)
         & d['age'].eq(cohort_age)
         & d['education'].eq(cohort_ed)
         & d['income'].eq(cohort_income)]
    .drop(columns=['age', 'gender', 'state', 'education', 'income'])
    .assign(date=lambda d: pd.to_datetime(d['date']))
    .sort_values(by='date')
)

Displaying the resulting cohort: 

In [None]:
# Preview the first five rows of the date-sorted cohort frame.
df_red.head(5)

In [None]:
# Small-multiples figure: one panel per STD column of the selected cohort.
# Every panel draws all series in faint grey for context and highlights its
# own series in colour. The figure is saved as 'stds_most_sampled_<gender>.png'.

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever name this installation provides.
# The style is applied before the figure is created so it affects the figure too.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)

# create a color palette
palette = plt.get_cmap('Set1')

# Extract each column's finite-valued (date, value) series once, instead of
# re-filtering every column inside every panel.
std_series = {}
for column in df_red.drop('date', axis=1):
    plot_df = df_red[np.isfinite(df_red[column])]
    std_series[column] = (plot_df['date'], plot_df[column])

# Three panels per row; the row count grows with the number of columns
# (ceiling division) so extra columns do not overflow a fixed 3x3 grid.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(std_series) // n_grid_cols))

fig = plt.figure(figsize=(10, 10))

for num, (column, (x, y)) in enumerate(std_series.items(), start=1):
    # Find the right spot on the plot
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, num)

    # Background: every series, drawn discreetly in grey
    for other_x, other_y in std_series.values():
        ax.plot(other_x, other_y, marker='', color='grey', linewidth=0.6, alpha=0.3)

    # Wrap the colour index: an integer index past the end of the colormap is
    # clipped to the last colour, which made the last two panels share a colour.
    line_color = palette(num % palette.N)

    # Foreground: this panel's own series
    ax.plot(x, y, marker='', color=line_color, linewidth=2.4, alpha=0.9, label=column)

    # Panel title in the same colour as the highlighted line
    ax.set_title(column, loc='left', fontsize=12, fontweight=0, color=line_color)

# Keep panel titles and tick labels from overlapping neighbouring panels.
fig.tight_layout()
fig.savefig(('stds_most_sampled_' + cohort_gender + '.png'))

In [None]:
# plt.text(0.06, 0.5, 'Note', ha='center', va='center', rotation='vertical')

In [None]:
# Number of missing values in each column of df_key.
print(df_key.isna().sum())

In [None]:
# Rebuild df_red for the herpes-only view: rows matching the chosen gender,
# age and income (education is NOT filtered, so levels can be compared),
# reduced to the 'education', 'date' and 'herpes' columns. Rows are sorted by
# date first and only then restricted to finite herpes values.
df_red = (
    df_key
    .loc[lambda d: d['gender'].eq(cohort_gender)
         & d['age'].eq(cohort_age)
         & d['income'].eq(cohort_income)]
    .drop(columns=['age', 'gender', 'state', 'income', 'chlamydia',
                   'gential_warts', 'gonorrhea', 'hpv', 'other_std',
                   'parasitic', 'syphilis', 'trich'])
    .assign(date=lambda d: pd.to_datetime(d['date']))
    .sort_values(by='date')
    .loc[lambda d: np.isfinite(d['herpes'])]
)
df_red.head()

In [None]:
# Herpes prevalence over time, one line per education level, for the cohort
# held in df_red. The figure is saved as 'herpes_vs_time_<gender>.png'.
fig, ax = plt.subplots(figsize=(10, 10))

# Group by the scalar column name rather than a one-element list: with a
# list, pandas >= 2.0 yields 1-tuple keys, so legend labels would render as
# "('Some College',)" instead of "Some College".
for education_level, grp in df_red.groupby('education'):
    grp.plot(ax=ax, kind='line', x='date', y='herpes', label=education_level, fontsize=16)

ax.legend(loc='best', fontsize=12)
ax.set_xlabel('Date', fontsize=16)
ax.set_ylabel('Prevalance (Herpes)', fontsize=16)
ax.set_title(cohort_gender, fontsize=16)
fig.savefig(('herpes_vs_time_' + cohort_gender + '.png'))
plt.show()

In [None]:
# Bar chart of how many rows of df_std carry each distinct value of
# `entertainment_movies_spend`.
df_std['entertainment_movies_spend'].value_counts().plot.bar()