## Dataset Information:
### Health Searches Dataset: USA (2004 - 2017)
### Suicide Dataset: WORLD (1987 - 2016)

## Importing the required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf

plotly.tools.set_credentials_file(username='ieching22', api_key='28VPsP5GenfxwZS7j3D5')

## Getting and cleaning our dataframes

In [2]:
health_df = pd.read_csv('Health-Searches-US-Country/RegionalInterestByConditionOverTime.csv')
suicide_df = pd.read_csv('suicide-rates-overview-1985-to-2016/master.csv')

In [3]:
# cleaning up column names for suicide df
suicide_df.columns = ['country', 'year', 'sex', 'age', 'suicides_no', 'population', 'suicides/100k pop', 'country-year', 'HDI for year', 'gdp_for_year', 'gdp_per_capita', 'generation']

## Lets find the mutual years the data includes

In [4]:
suicide_df['year'].unique()

array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016],
      dtype=int64)

In [5]:
health_df.columns

Index(['dma', 'geoCode', '2004+cancer', '2004+cardiovascular', '2004+stroke',
       '2004+depression', '2004+rehab', '2004+vaccine', '2004+diarrhea',
       '2004+obesity',
       ...
       '2016+diabetes', '2017+cancer', '2017+cardiovascular', '2017+stroke',
       '2017+depression', '2017+rehab', '2017+vaccine', '2017+diarrhea',
       '2017+obesity', '2017+diabetes'],
      dtype='object', length=128)

The data for suicide rates range from years 1987 to 2016; The data for health searches range from years 2004 to 2017
#### Mutual years: 2004 - 2016

### However, there seems to be weird behavior in 2016 for the suicide rates - likely because insufficient data is collected. Let us stop at 2015

# Is there a correlation between Health Searches and Suicide Rates?

## Lets start by analyzing the progression of health searches

In [6]:
health_df.head()

Unnamed: 0,dma,geoCode,2004+cancer,2004+cardiovascular,2004+stroke,2004+depression,2004+rehab,2004+vaccine,2004+diarrhea,2004+obesity,...,2016+diabetes,2017+cancer,2017+cardiovascular,2017+stroke,2017+depression,2017+rehab,2017+vaccine,2017+diarrhea,2017+obesity,2017+diabetes
0,Portland-Auburn ME,500,44,6,17,39,21,31,14,29,...,81,70,37,83,64,56,76,66,47,80
1,New York NY,501,47,6,13,38,16,33,12,27,...,77,70,34,53,56,53,79,56,52,78
2,Binghamton NY,502,48,3,16,50,12,37,24,31,...,74,68,24,71,69,44,77,78,61,72
3,Macon GA,503,44,14,14,37,19,49,14,29,...,78,53,38,62,46,60,47,53,41,66
4,Philadelphia PA,504,52,7,16,41,23,36,14,30,...,80,75,35,61,62,75,84,69,56,78


### Searches per Disease per Year (we want to look at 2004 - 2015)

In [7]:
col_name = list(health_df.columns)

diseases = []
for col in col_name:
    name = col.split('+')
    if name[-1] not in diseases:
        diseases.append(name[-1])
diseases = diseases[2:]

years = []
for col in col_name:
    name = col.split('+')
    if name[0] not in years:
        years.append(name[0])
years = years[2:-2]

In [8]:
diseases

['cancer',
 'cardiovascular',
 'stroke',
 'depression',
 'rehab',
 'vaccine',
 'diarrhea',
 'obesity',
 'diabetes']

In [9]:
years

['2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015']

In [10]:
# Dataframe format
# Column 1 = Year
# Column 2 = Disease
# Column 3 = Count

# each year must appear len(disease) number of times in the list to match the disease length
year = [value for value in years for _ in range(len(diseases))]

# diseases must be repeated for len(year) number of times in the list to match the years
disease = diseases * len(years)

count = []

for col in health_df.columns[2:]:
    count.append(health_df[col].sum())

In [11]:
disease_df = pd.DataFrame([year, disease, count]).transpose()
disease_df.columns = ['Year', 'Disease', 'Total Number of Searches']

In [12]:
def to_int(str):
    return int(str)

In [13]:
disease_df.head()

Unnamed: 0,Year,Disease,Total Number of Searches
0,2004,cancer,9220
1,2004,cardiovascular,1561
2,2004,stroke,3705
3,2004,depression,9581
4,2004,rehab,3967


In [14]:
disease_df.dropna(inplace=True)

In [15]:
disease_df['Year'] = disease_df['Year'].apply(to_int)

In [16]:
disease_df.head()

Unnamed: 0,Year,Disease,Total Number of Searches
0,2004,cancer,9220
1,2004,cardiovascular,1561
2,2004,stroke,3705
3,2004,depression,9581
4,2004,rehab,3967


In [17]:
cancer_df = disease_df[disease_df['Disease'] == 'cancer']
cardiovascular_df = disease_df[disease_df['Disease'] == 'cardiovascular']
stroke_df = disease_df[disease_df['Disease'] == 'stroke']
depression_df = disease_df[disease_df['Disease'] == 'depression']
rehab_df = disease_df[disease_df['Disease'] == 'rehab']
vaccine_df = disease_df[disease_df['Disease'] == 'vaccine']
diarrhea_df = disease_df[disease_df['Disease'] == 'diarrhea']
obesity_df = disease_df[disease_df['Disease'] == 'obesity']
diabetes_df = disease_df[disease_df['Disease'] == 'diabetes']

In [18]:
# Create traces
cancer = go.Scatter(
    x = cancer_df['Year'],
    y = cancer_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Cancer'
)
cardiovascular = go.Scatter(
    x = cardiovascular_df['Year'],
    y = cardiovascular_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Cardiovascular'
)
stroke = go.Scatter(
    x = stroke_df['Year'],
    y = stroke_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Stroke'
)
depression = go.Scatter(
    x = depression_df['Year'],
    y = depression_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Depression'
)
rehab = go.Scatter(
    x = rehab_df['Year'],
    y = rehab_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Rehab'
)
vaccine = go.Scatter(
    x = vaccine_df['Year'],
    y = vaccine_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Vaccine'
)
diarrhea = go.Scatter(
    x = diarrhea_df['Year'],
    y = diarrhea_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Diarrhea'
)
obesity = go.Scatter(
    x = obesity_df['Year'],
    y = obesity_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Obesity'
)
diabetes = go.Scatter(
    x = diabetes_df['Year'],
    y = diabetes_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Diabetes'
)
data = [cancer, cardiovascular, stroke, depression, rehab, vaccine, diarrhea, obesity, diabetes]

# plotting the data
py.iplot(data)


Consider using IPython.display.IFrame instead



## Now lets look at the progression of suicide rates

In [35]:
# Let us look at only the United States of America
suicide_df = suicide_df[suicide_df["country"] == 'United States']

In [36]:
suicide_df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year,gdp_per_capita,generation
26848,United States,1985,male,75+ years,2177,4064000,53.57,United States1985,0.841,4346734000000,19693,G.I. Generation
26849,United States,1985,male,55-74 years,5302,17971000,29.5,United States1985,0.841,4346734000000,19693,G.I. Generation
26850,United States,1985,male,25-34 years,5134,20986000,24.46,United States1985,0.841,4346734000000,19693,Boomers
26851,United States,1985,male,35-54 years,6053,26589000,22.77,United States1985,0.841,4346734000000,19693,Silent
26852,United States,1985,male,15-24 years,4267,19962000,21.38,United States1985,0.841,4346734000000,19693,Generation X


### Total Suicides per Year (we want to look at 2004 - 2016)

In [37]:
years = suicide_df['year'].unique()
male_suicides = []
female_suicides = []
total_suicides = []

for year in years:
    suicides = suicide_df[suicide_df['year'] == year]['suicides_no'].sum()
    total_suicides.append(suicides)
    
    male = suicide_df[(suicide_df['year'] == year) & (suicide_df['sex'] == 'male')]['suicides_no'].sum()
    male_suicides.append(male)
    
    female = suicide_df[(suicide_df['year'] == year) & (suicide_df['sex'] == 'female')]['suicides_no'].sum()
    female_suicides.append(female)

In [38]:
year_df = pd.DataFrame([years, male_suicides, female_suicides, total_suicides]).transpose()
year_df.columns = ['Year', 'Male Suicides', 'Female Suicides', 'Total Suicides']

In [39]:
year_df.sort_values(by='Year', inplace=True)

In [40]:
year_df = year_df.iloc[19:-1]

In [41]:
year_df

Unnamed: 0,Year,Male Suicides,Female Suicides,Total Suicides
19,2004,25555,6873,32428
20,2005,25900,6729,32629
21,2006,26300,6992,33292
22,2007,27267,7329,34596
23,2008,28447,7583,36030
24,2009,29079,7821,36900
25,2010,30275,8087,38362
26,2011,30996,8512,39508
27,2012,31777,8819,40596
28,2013,32049,9094,41143


In [42]:
total = go.Scatter(
    x = year_df['Year'],
    y = year_df['Total Suicides'],
    mode = 'lines+markers',
    name = 'Total Suicides'
)

male = go.Scatter(
    x = year_df['Year'],
    y = year_df['Male Suicides'],
    mode = 'lines+markers',
    name = 'Male Suicides'
)

female = go.Scatter(
    x = year_df['Year'],
    y = year_df['Female Suicides'],
    mode = 'lines+markers',
    name = 'Female Suicides'
)

data = [total, male, female]

# plotting the data

py.iplot(data)

## Is there a relation between searches and total suicides?

### Let us first normalize the data to between 0 and 1

In [43]:
# We first need to further clean the dataframe
depression_df.drop(columns=['Disease'], inplace=True)
rehab_df.drop(columns=['Disease'], inplace=True)
obesity_df.drop(columns=['Disease'], inplace=True)

KeyError: "['Disease'] not found in axis"

In [44]:
from sklearn import preprocessing

final_years = list(range(2004, 2016))

# normalizing the suicide dataframe
x = year_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
year_df = pd.DataFrame(x_scaled)

# normalizing the health dataframe
# depression
x = depression_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
depression_df = pd.DataFrame(x_scaled)

# rehab
x = rehab_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
rehab_df = pd.DataFrame(x_scaled)

# obesity
x = obesity_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
obesity_df = pd.DataFrame(x_scaled)


Data with input dtype int64 was converted to float64 by MinMaxScaler.



In [45]:
year_df.columns = ['Year', 'Male Suicides', 'Female Suicides', 'Total Suicides']
depression_df.columns = ['Year', 'Total Number of Searches']
rehab_df.columns = ['Year', 'Total Number of Searches']
obesity_df.columns = ['Year', 'Total Number of Searches']

In [46]:
total = go.Scatter(
    x = final_years,
    y = year_df['Total Suicides'],
    mode = 'lines+markers',
    name = 'Total Suicides'
)

depression = go.Scatter(
    x = final_years,
    y = depression_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Depression'
)

rehab = go.Scatter(
    x = final_years,
    y = rehab_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Rehab'
)

obesity = go.Scatter(
    x = final_years,
    y = obesity_df['Total Number of Searches'],
    mode = 'lines+markers',
    name = 'Obesity'
)

data = [total, depression, rehab, obesity]

py.iplot(data)


Consider using IPython.display.IFrame instead



# There seems to be a positive correlation between searches for depression and total suicides!

## Let us do some additional plotting