In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [54]:
# Load the raw table; the path is relative to the notebook's working directory.
DATA_PATH = 'RegionalInterestByConditionOverTime.csv'
df = pd.read_csv(DATA_PATH)

In [55]:
# Preview the first five rows to check the wide 'YEAR+condition' column layout.
df.head()

Unnamed: 0,dma,geoCode,2004+cancer,2004+cardiovascular,2004+stroke,2004+depression,2004+rehab,2004+vaccine,2004+diarrhea,2004+obesity,...,2016+diabetes,2017+cancer,2017+cardiovascular,2017+stroke,2017+depression,2017+rehab,2017+vaccine,2017+diarrhea,2017+obesity,2017+diabetes
0,Portland-Auburn ME,500,44,6,17,39,21,31,14,29,...,81,70,37,83,64,56,76,66,47,80
1,New York NY,501,47,6,13,38,16,33,12,27,...,77,70,34,53,56,53,79,56,52,78
2,Binghamton NY,502,48,3,16,50,12,37,24,31,...,74,68,24,71,69,44,77,78,61,72
3,Macon GA,503,44,14,14,37,19,49,14,29,...,78,53,38,62,46,60,47,53,41,66
4,Philadelphia PA,504,52,7,16,41,23,36,14,30,...,80,75,35,61,62,75,84,69,56,78


In [56]:
# Plain Python list of the frame's column labels, in frame order.
col_name = df.columns.tolist()

In [57]:
# Display the column labels collected above.
col_name

['dma',
 'geoCode',
 '2004+cancer',
 '2004+cardiovascular',
 '2004+stroke',
 '2004+depression',
 '2004+rehab',
 '2004+vaccine',
 '2004+diarrhea',
 '2004+obesity',
 '2004+diabetes',
 '2005+cancer',
 '2005+cardiovascular',
 '2005+stroke',
 '2005+depression',
 '2005+rehab',
 '2005+vaccine',
 '2005+diarrhea',
 '2005+obesity',
 '2005+diabetes',
 '2006+cancer',
 '2006+cardiovascular',
 '2006+stroke',
 '2006+depression',
 '2006+rehab',
 '2006+vaccine',
 '2006+diarrhea',
 '2006+obesity',
 '2006+diabetes',
 '2007+cancer',
 '2007+cardiovascular',
 '2007+stroke',
 '2007+depression',
 '2007+rehab',
 '2007+vaccine',
 '2007+diarrhea',
 '2007+obesity',
 '2007+diabetes',
 '2008+cancer',
 '2008+cardiovascular',
 '2008+stroke',
 '2008+depression',
 '2008+rehab',
 '2008+vaccine',
 '2008+diarrhea',
 '2008+obesity',
 '2008+diabetes',
 '2009+cancer',
 '2009+cardiovascular',
 '2009+stroke',
 '2009+depression',
 '2009+rehab',
 '2009+vaccine',
 '2009+diarrhea',
 '2009+obesity',
 '2009+diabetes',
 '2010+cancer'

In [58]:
# Condition names are the part after '+' in each 'YEAR+condition' column.
# Only columns that contain '+' are parsed, so identifier columns such as
# 'dma' and 'geoCode' are excluded by name instead of by their position.
# dict.fromkeys removes duplicates while keeping first-seen order.
diseases = list(dict.fromkeys(
    col.split('+')[-1] for col in col_name if '+' in col
))

diseases

['cancer',
 'cardiovascular',
 'stroke',
 'depression',
 'rehab',
 'vaccine',
 'diarrhea',
 'obesity',
 'diabetes']

In [59]:
# Year labels are the part before '+' in each 'YEAR+condition' column.
# Only columns that contain '+' are parsed, so identifier columns such as
# 'dma' and 'geoCode' are excluded by name instead of by their position.
# dict.fromkeys removes duplicates while keeping first-seen order.
years = list(dict.fromkeys(
    col.split('+')[0] for col in col_name if '+' in col
))

years

['2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017']

## Let Us First Clean Up the Data
#### Wanted columns: Country (`dma`), Country Code (`geoCode`), Year, Disease, Number of Searches

In [60]:
# Distinct values of the 'dma' and 'geoCode' columns, in order of first appearance.
country, country_code = (df[field].unique() for field in ('dma', 'geoCode'))

In [61]:
# Number of distinct values found in the 'dma' column.
len(country)

210

In [62]:
# TODO: decide how to reshape the raw data into the wanted columns (Country, Country Code, Year, Disease, Number of Searches).

## Searches per Disease per Year

In [90]:
# Dataframe format =
# Column 1 = Year
# Column 2 = Disease
# Column 3 = Count

# Every value column is named 'YEAR+condition'. The year label, the condition
# label and the summed count are all derived from the same column, so the
# three lists stay aligned whatever order the columns appear in.
value_columns = [col for col in df.columns if '+' in col]
column_parts = [col.split('+') for col in value_columns]

# one entry per value column: the part before '+'
year = [parts[0] for parts in column_parts]

# one entry per value column: the part after '+'
disease = [parts[-1] for parts in column_parts]

# one entry per value column: that column summed over every row of df
count = [df[col].sum() for col in value_columns]

In [146]:
# Build the frame column-by-column instead of transposing a list of rows:
# each column keeps its own dtype (the transposed version made every column
# object-typed) and lists of unequal length raise instead of being padded.
disease_df = pd.DataFrame(
    {
        'Year': year,
        'Disease': disease,
        'Total Number of Searches': count,
    },
    columns=['Year', 'Disease', 'Total Number of Searches'],
)

In [147]:
def to_int(str):
    """Convert ``str`` to an ``int`` with the built-in ``int`` constructor.

    The parameter name shadows the built-in ``str`` type inside this
    function; it is kept so that existing keyword callers keep working.
    """
    as_integer = int(str)
    return as_integer

In [149]:
# Cast both numeric columns in one vectorised step each rather than calling a
# Python function per element; 'int64' is spelled out so the resulting dtype
# does not depend on the platform's default integer width.
disease_df['Year'] = disease_df['Year'].astype('int64')
disease_df['Total Number of Searches'] = disease_df['Total Number of Searches'].astype('int64')

In [150]:
# Preview the first five rows of the Year / Disease / total frame.
disease_df.head()

Unnamed: 0,Year,Disease,Total Number of Searches
0,2004,cancer,9220
1,2004,cardiovascular,1561
2,2004,stroke,3705
3,2004,depression,9581
4,2004,rehab,3967


## Let us do some plotting!

In [151]:
import os

import plotly.graph_objs as go
import plotly.plotly as py
import plotly.tools  # binds the name `plotly`, which set_credentials_file is called through
from plotly.offline import init_notebook_mode

# NOTE(review): plotly 4+ no longer ships `plotly.plotly` (it moved to the
# separate chart-studio package) - confirm the installed plotly is 3.x.

# NOTE(review): the API key that used to be hard-coded in this cell is visible
# to everyone who has seen the notebook and should be regenerated.
# Credentials are read from the environment instead; a missing variable raises
# KeyError naming the variable that has to be set.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

# For Notebooks
init_notebook_mode(connected=True)

In [154]:
def convert(d):
    """Map a disease name to a number.

    The number is the position of the first entry equal to ``d`` in the
    module-level ``disease`` list. ``None`` is returned when no entry matches.
    """
    try:
        position = disease.index(d)
    except ValueError:
        # no entry of `disease` equals d
        return None
    return int(position)

## First we separate the dataframes

In [156]:
# Display the condition names held in `diseases`.
diseases

['cancer',
 'cardiovascular',
 'stroke',
 'depression',
 'rehab',
 'vaccine',
 'diarrhea',
 'obesity',
 'diabetes']

In [158]:
# One sub-frame per condition: the rows of disease_df whose 'Disease' value
# equals the label in the same position of the tuple below.
(
    cancer_df,
    cardiovascular_df,
    stroke_df,
    depression_df,
    rehab_df,
    vaccine_df,
    diarrhea_df,
    obesity_df,
    diabetes_df,
) = (
    disease_df.loc[disease_df['Disease'] == label]
    for label in (
        'cancer',
        'cardiovascular',
        'stroke',
        'depression',
        'rehab',
        'vaccine',
        'diarrhea',
        'obesity',
        'diabetes',
    )
)

## Now let us plot the data!

In [161]:
def _make_trace(frame, label):
    """Build one lines+markers trace of yearly totals for a single condition.

    frame: a frame with 'Year' and 'Total Number of Searches' columns.
    label: legend name shown for the trace.
    Returns a ``go.Scatter`` with 'Year' on x and the totals on y.
    """
    return go.Scatter(
        x=frame['Year'],
        y=frame['Total Number of Searches'],
        mode='lines+markers',
        name=label,
    )


# Pair every per-condition frame with the legend label used for its line.
frames_and_labels = [
    (cancer_df, 'Cancer'),
    (cardiovascular_df, 'Cardiovascular'),
    (stroke_df, 'Stroke'),
    (depression_df, 'Depression'),
    (rehab_df, 'Rehab'),
    (vaccine_df, 'Vaccine'),
    (diarrhea_df, 'Diarrhea'),
    (obesity_df, 'Obesity'),
    (diabetes_df, 'Diabetes'),
]

# Create traces
data = [_make_trace(frame, label) for frame, label in frames_and_labels]

# Keep each trace reachable under the variable name it had before.
(cancer, cardiovascular, stroke, depression, rehab,
 vaccine, diarrhea, obesity, diabetes) = data

# Title and axis labels so the figure can be read on its own.
layout = go.Layout(
    title='Total Number of Searches per Disease by Year',
    xaxis=dict(title='Year'),
    yaxis=dict(title='Total Number of Searches'),
)

py.iplot(go.Figure(data=data, layout=layout))