In [2]:
import pandas as pd
import numpy as np
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360)
import matplotlib.pyplot as plt
from scipy.stats.stats import pearsonr
import warnings
warnings.filterwarnings('ignore')
import time

In [3]:
def build_google_trends_df(kword, states, years=[2011,2012,2013,2014,2015,2016], output=True):
    if output:
        print("calculating indexes for", kword)
    yearly_google_trends_df = pd.DataFrame(index=states, columns=years, dtype=np.float)
    for year in years:
        values = []
        try:
            pytrends.build_payload([kword], cat=0, timeframe=str(year)+'-01-01'+ ' ' + str(year) + '-12-31', geo='US', gprop='')
            trends_per_region_df = pytrends.interest_by_region()
            for state in states:
                if state not in trends_per_region_df.index:
                    values.append(0)
                else: 
                    values.append(int(trends_per_region_df.loc[state].values))
            yearly_google_trends_df[year] = values
        except: 
            raise Exception('Not relevant keyword in GoogleTrends database')
    return yearly_google_trends_df

In [4]:
def overall_correlation_df(gt_filename, google_trends_keywords, years):
    df_gt = pd.read_csv(gt_filename, sep=";",usecols=["LocationDesc","Data_Value"])
    df_gt.dropna(subset=['LocationDesc'], inplace=True)
    df_gt.drop(df_gt.tail(2).index, inplace=True)
    states = df_gt["LocationDesc"]
    bad_keywords = []
    year_average_corr = []
    for kw in google_trends_keywords:
        try:
            df = build_google_trends_df(kw, states=states, years=years, output=True )
            df["ground_truth_value"] = df_gt["Data_Value"].values
            corr_per_year= []
            for year in years:
                corr = pearsonr(df[year], df["ground_truth_value"])[0]
                if np.isnan(corr):
                    raise Exception('Pearson correlation resulted in NaN')
                corr_per_year.append(corr) # pos0: corr; pos1: p_value
            year_average_corr.append(np.mean(corr_per_year))
            time.sleep(1)
        except:
            bad_keywords.append(kw)
    good_keywords = [kw for kw in google_trends_keywords if kw not in bad_keywords] # remove the bad keywords, ie. with no results in the period
    corr_df = pd.DataFrame(index=good_keywords,data=year_average_corr, columns=["correlation"])
    return corr_df

In [None]:
wls = pd.read_csv("data/kidney_disease_related_terms.csv",sep=".", names=["related_terms"])
kw_list = wls["related_terms"].str.strip()
years = [2011,2012,2013,2014,2015,2016]

folder = 'data/BRFSS/'
#files_sufix = ['2011_AA','2012_AA','2013_AA','2014_AA','2015_AA','2016_AA','2011','2012','2013','2014','2015','2016']
#files_sufix = ['2011_AA']
files_sufix = ['2012_AA','2013_AA','2014_AA','2015_AA','2016_AA','2011','2012','2013','2014','2015','2016']
overall_corr_dict = {}
for sufix in files_sufix:
    df = overall_correlation_df(gt_filename=folder+sufix+'.csv',
                                google_trends_keywords=kw_list,
                                years = years)
    df.to_csv(folder + 'overall_correlation_' + sufix + '.csv',sep=";", index=True, header=True, index_label='keyword')
    overall_corr_dict[sufix] = df
    


calculating indexes for liver disease
calculating indexes for diabetes
calculating indexes for renal failure
calculating indexes for hypertension
calculating indexes for disease
calculating indexes for kidney
calculating indexes for cardiovascular disease
calculating indexes for cirrhosis
calculating indexes for nephritis
calculating indexes for cardiomyopathy
calculating indexes for emphysema
calculating indexes for cystic fibrosis
calculating indexes for glaucoma
calculating indexes for sickle cell anemia
calculating indexes for diabetes mellitus
calculating indexes for coronary artery disease
calculating indexes for juvenile diabetes
calculating indexes for congestive heart failure
calculating indexes for genetic disorder
calculating indexes for anemia
calculating indexes for pancreatitis


In [1]:
overall_corr_dict

NameError: name 'overall_corr_dict' is not defined

In [5]:
most_correlated = corr_df.loc[overall_corr_dict["correlation"] > 0.05].values

NameError: name 'corr_df' is not defined

In [None]:
plt.figure(figsize=(13,7))
plt.plot(corr_df, label ="Age Adjusted prevalence")

plt.legend(fontsize="small")
plt.xlabel("query term")
plt.ylabel("Cor value ")
plt.title("Correlation")
plt.grid()
#plt.xticks(queries)

plt.show()

In [None]:
year = 2011
kword = "renal"
pytrends.build_payload([kword], cat=0, timeframe=str(year)+'-01-01'+ ' ' + str(year) + '-12-31', geo='US', gprop='')
trends_per_region_df = pytrends.interest_by_region()
trends_per_region_df