In [3]:
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pytrends.request import TrendReq
# `scipy.stats.stats` is a deprecated private namespace; `pearsonr` is part of
# the public `scipy.stats` module.
from scipy.stats import pearsonr

# NOTE(review): this silences *every* warning, including the deprecation
# warnings that would point at outdated pandas/numpy/scipy usage.
warnings.filterwarnings('ignore')

# Shared Google Trends session used by build_google_trends_df.
pytrends = TrendReq(hl='en-US', tz=360)

# State names taken from one BRFSS ground-truth file; rows without a location
# are dropped, then the last two remaining rows are discarded
# (presumably Puerto Rico and Guam, as noted in build_regression_df - confirm).
df_gt = pd.read_csv('data/BRFSS/2012_AA.csv', sep=";", usecols=["LocationDesc"])
df_gt = df_gt.dropna(subset=['LocationDesc'])
df_gt = df_gt.drop(df_gt.tail(2).index)
states = df_gt["LocationDesc"]

## Helper functions: Google Trends lookup and correlation

In [4]:
def build_google_trends_df(kword, states, years=None, output=True, client=None):
    """Build a state x year table of Google Trends interest for one keyword.

    For every year a single request covering 1 January to 31 December of that
    year is issued with ``geo='US'`` and the per-region interest is read back.

    Parameters
    ----------
    kword : str
        Search term to look up.
    states : iterable of str
        Region names used as the row index of the result. A state missing
        from the Google Trends response gets the value 0 for that year.
    years : list of int, optional
        Years to query. Defaults to 2011-2016.
    output : bool, optional
        When true, print a progress line naming the keyword.
    client : object, optional
        Object exposing ``build_payload`` and ``interest_by_region`` (the
        pytrends ``TrendReq`` interface). Defaults to the notebook-level
        ``pytrends`` session.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``states`` with one column per year.

    Raises
    ------
    Exception
        With the message 'Not relevant keyword in GoogleTrends database' when
        the lookup for any year fails; the original error is attached as
        ``__cause__``.
    """
    if years is None:
        # A fresh list per call avoids sharing one mutable default between calls.
        years = [2011, 2012, 2013, 2014, 2015, 2016]
    if client is None:
        client = pytrends
    if output:
        print("calculating indexes for", kword)
    # `np.float` was removed from NumPy; the builtin `float` is the same dtype.
    yearly_google_trends_df = pd.DataFrame(index=states, columns=years, dtype=float)
    for year in years:
        try:
            client.build_payload([kword], cat=0,
                                 timeframe=str(year) + '-01-01' + ' ' + str(year) + '-12-31',
                                 geo='US', gprop='')
            trends_per_region_df = client.interest_by_region()
            known_regions = trends_per_region_df.index
            values = [
                int(trends_per_region_df.loc[state, kword]) if state in known_regions else 0
                for state in states
            ]
            yearly_google_trends_df[year] = values
        except Exception as err:
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # through; chaining keeps the underlying failure visible.
            raise Exception('Not relevant keyword in GoogleTrends database') from err
    return yearly_google_trends_df

In [5]:
def overall_correlation_df(gt_filename, google_trends_keywords, years):
    """Correlate Google Trends interest with ground-truth values, per keyword.

    The ground-truth CSV (';'-separated, columns ``LocationDesc`` and
    ``Data_Value``) is loaded, rows without a location are dropped and the
    last two remaining rows are discarded. For each keyword the Pearson
    correlation between the per-state Google Trends values and ``Data_Value``
    is computed for every year, and the yearly coefficients are averaged.

    Parameters
    ----------
    gt_filename : str
        Path of the ground-truth CSV file.
    google_trends_keywords : iterable of str
        Keywords to evaluate.
    years : list of int
        Years for which Google Trends data is requested.

    Returns
    -------
    pandas.DataFrame
        Indexed by the keywords that could be evaluated, with a single
        ``correlation`` column holding the mean yearly Pearson coefficient.
        Keywords whose lookup fails or whose correlation is NaN for any year
        are left out.
    """
    df_gt = pd.read_csv(gt_filename, sep=";", usecols=["LocationDesc", "Data_Value"])
    df_gt = df_gt.dropna(subset=['LocationDesc'])
    df_gt = df_gt.drop(df_gt.tail(2).index)
    states = df_gt["LocationDesc"]
    ground_truth = df_gt["Data_Value"].values

    # Keywords and their scores are collected together so the two lists can
    # never get out of step (e.g. when a keyword appears twice in the input).
    good_keywords = []
    year_average_corr = []
    for kw in google_trends_keywords:
        try:
            df = build_google_trends_df(kw, states=states, years=years, output=True)
            corr_per_year = []
            for year in years:
                # pearsonr returns (coefficient, p_value); only the coefficient is used.
                corr = pearsonr(df[year], ground_truth)[0]
                if np.isnan(corr):
                    raise Exception('Pearson correlation resulted in NaN')
                corr_per_year.append(corr)
        except Exception as err:
            # Best effort: a keyword that cannot be evaluated is skipped, but
            # the reason is reported and Ctrl-C still interrupts the run.
            print("skipping keyword", kw, "-", err)
            continue
        good_keywords.append(kw)
        year_average_corr.append(np.mean(corr_per_year))

    corr_df = pd.DataFrame(index=good_keywords, data=year_average_corr, columns=["correlation"])
    return corr_df

In [6]:
# Load the list of search terms related to kidney disease.
# NOTE(review): sep="." is unusual - presumably chosen so each line is read as
# a single field; confirm against the file contents.
wls = pd.read_csv(
    "data/kidney_disease_related_terms.csv",
    sep=".",
    names=["related_terms"],
)
kw_list = wls["related_terms"].str.strip()
years = [2011, 2012, 2013, 2014, 2015, 2016]

folder = 'data/BRFSS/'
#files_sufix = ['2011_AA','2012_AA','2013_AA','2014_AA','2015_AA','2016_AA','2011','2012','2013','2014','2015','2016']
files_sufix = ['2011']

# For every selected ground-truth file: compute the keyword correlations,
# save them as a ';'-separated CSV and keep the frame in memory by suffix.
overall_corr_dict = {}
for sufix in files_sufix:
    df = overall_correlation_df(
        gt_filename=folder + sufix + '.csv',
        google_trends_keywords=kw_list,
        years=years,
    )
    df.to_csv(
        'data/correlation/overall_correlation_' + sufix + '.csv',
        sep=";",
        index=True,
        header=True,
        index_label='keyword',
    )
    overall_corr_dict[sufix] = df

calculating indexes for liver disease
calculating indexes for diabetes
calculating indexes for renal failure
calculating indexes for hypertension
calculating indexes for disease
calculating indexes for kidney
calculating indexes for cardiovascular disease
calculating indexes for cirrhosis
calculating indexes for nephritis
calculating indexes for cardiomyopathy
calculating indexes for emphysema
calculating indexes for cystic fibrosis
calculating indexes for glaucoma
calculating indexes for sickle cell anemia
calculating indexes for diabetes mellitus
calculating indexes for coronary artery disease
calculating indexes for juvenile diabetes
calculating indexes for congestive heart failure
calculating indexes for genetic disorder
calculating indexes for anemia
calculating indexes for pancreatitis
calculating indexes for atherosclerosis
calculating indexes for hypothyroidism
calculating indexes for diabetic
calculating indexes for Kawasaki Disease
calculating indexes for degenerative disorde

In [29]:
# From a dataframe |keyword|correlation|, get a subset of this dataframe according to the threshold, which can be a float or an int.
# If it is a float, return the keywords with correlation greater than or equal to the threshold. If it is an int, return the first
# threshold terms. If neither a float nor an int is passed, raise an AttributeError.
def get_most_correlated_terms_df(df, threshold=0.1):
    """Select the most correlated keywords from a correlation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a ``correlation`` column. It is not modified.
    threshold : float or int, optional
        A float keeps the rows whose correlation is >= ``threshold``; an int
        keeps the first ``threshold`` rows after sorting. NumPy float and
        integer scalars are accepted as well.

    Returns
    -------
    pandas.DataFrame
        The selected rows, ordered by descending correlation.

    Raises
    ------
    AttributeError
        If ``threshold`` is neither a float nor an int.
    """
    # Sort into a new frame so the caller's DataFrame keeps its row order.
    ranked = df.sort_values(by="correlation", ascending=False)
    if isinstance(threshold, (float, np.floating)):
        return ranked.loc[ranked["correlation"] >= threshold]
    elif isinstance(threshold, (int, np.integer)):
        return ranked.head(threshold)
    raise AttributeError("Exception in get_most_correlated_terms method. 'threshold' must be float or int.")

In [49]:
# Read the yearly 'overall_correlation_<year><suffix>.csv' files, keep the most correlated keywords of each year
# (same threshold rules as get_most_correlated_terms_df) and merge them into one table with one row per keyword.
def get_compound_most_correlated_terms_df(location='data/correlation/', file_type="AA", 
                                          years=None, threshold=0.1):
    """Merge the most correlated keywords of several yearly correlation files.

    Parameters
    ----------
    location : str, optional
        Folder prefix of the correlation CSV files (';'-separated).
    file_type : str, optional
        ``'AA'`` selects the files named ``overall_correlation_<year>_AA.csv``;
        any other value selects ``overall_correlation_<year>.csv``.
    years : list of int, optional
        Years whose files are read. Defaults to 2011-2016.
    threshold : float or int, optional
        Passed to ``get_most_correlated_terms_df`` for every yearly file.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by descending correlation with each keyword appearing
        once, keeping its highest correlation across the years.
    """
    if years is None:
        # A fresh list per call avoids sharing one mutable default between calls.
        years = [2011, 2012, 2013, 2014, 2015, 2016]

    sufix = ""
    if file_type == 'AA':
        sufix = "_" + file_type

    yearly_frames = []
    for year in years:
        df = pd.read_csv(location + 'overall_correlation_' + str(year) + sufix + '.csv', sep=';')
        yearly_frames.append(get_most_correlated_terms_df(df, threshold=threshold))

    if not yearly_frames:
        return pd.DataFrame(columns=["keyword", "correlation"])

    # DataFrame.append no longer exists in current pandas; concatenate once.
    most_correlated_df = pd.concat(yearly_frames)
    # Sort the *combined* frame before de-duplicating so keep="first" retains
    # each keyword's best correlation (stable sort: ties keep the earlier year).
    most_correlated_df = most_correlated_df.sort_values(by="correlation", ascending=False, kind="mergesort")
    most_correlated_df = most_correlated_df.drop_duplicates(subset="keyword", keep="first")
    return most_correlated_df

In [32]:
def get_google_trends_most_correlated_terms_df_old(gt_filename, threshold, years=None):
    """Build a long (State, Year, <keyword>...) table for one correlation file.

    The correlation CSV is read, the most correlated keywords are selected
    with ``get_most_correlated_terms_df`` and, for each of them, the Google
    Trends values of the notebook-level ``states`` are fetched and reshaped to
    one row per (state, year).

    Parameters
    ----------
    gt_filename : str
        Path of a ';'-separated correlation CSV with a ``keyword`` column.
    threshold : float or int
        Selection threshold, see ``get_most_correlated_terms_df``.
    years : list of int, optional
        Years to fetch. Defaults to 2011-2016.

    Returns
    -------
    pandas.DataFrame
        Columns ``State``, ``Year`` and one column per selected keyword,
        ordered by year and then state. Empty (columns only) when no keyword
        is selected.
    """
    if years is None:
        years = [2011, 2012, 2013, 2014, 2015, 2016]

    df = pd.read_csv(gt_filename, sep=';')
    most_correlated_keywords = (get_most_correlated_terms_df(df, threshold))["keyword"].values

    print("With the threshold applied it was found " + str(len(most_correlated_keywords)) + " keywords.")

    output_df = pd.DataFrame(columns=["State", "Year"] + list(most_correlated_keywords))
    first_iteration = True
    for keyword in most_correlated_keywords:
        # Honour the requested years instead of a hard-coded list.
        wide_df = build_google_trends_df(keyword, states, years=years, output=True)
        # stack() turns the year columns into rows; reset_index() flattens the
        # (state, year) index into columns without a temp-file round trip.
        long_df = wide_df.stack().reset_index()
        long_df.columns = ["State", "Year", keyword]
        long_df = long_df.sort_values(['Year', 'State'])

        # The first keyword provides the State and Year columns; later ones
        # only add their value column (aligned on the shared row index).
        if first_iteration:
            output_df = long_df
            first_iteration = False
        else:
            output_df[keyword] = long_df[keyword]
    return output_df

In [33]:
def get_google_trends_most_correlated_terms_df(file_type="AA", threshold=0.1, years=None):
    """Build a long (State, Year, <keyword>...) table of Google Trends values.

    The keywords are the most correlated terms across all yearly correlation
    files of the given type (see ``get_compound_most_correlated_terms_df``).
    For each keyword the Google Trends values of the notebook-level ``states``
    are fetched and reshaped to one row per (state, year).

    Parameters
    ----------
    file_type : str, optional
        Ground-truth type of the correlation files to read (e.g. ``'AA'``).
    threshold : float or int, optional
        Selection threshold applied to every yearly correlation file.
    years : list of int, optional
        Years to read and fetch. Defaults to 2011-2016.

    Returns
    -------
    pandas.DataFrame
        Columns ``State``, ``Year`` and one column per selected keyword,
        ordered by year and then state. Empty (columns only) when no keyword
        is selected.
    """
    if years is None:
        years = [2011, 2012, 2013, 2014, 2015, 2016]

    # Pass the caller's arguments through; they were previously ignored in
    # favour of hard-coded values.
    most_correlated_keywords = (get_compound_most_correlated_terms_df(location='data/correlation/',
                                                                      file_type=file_type,
                                                                      years=years,
                                                                      threshold=threshold))["keyword"].values

    print("With the threshold applied it was found " + str(len(most_correlated_keywords)) + " keywords.")

    output_df = pd.DataFrame(columns=["State", "Year"] + list(most_correlated_keywords))
    first_iteration = True
    for keyword in most_correlated_keywords:
        wide_df = build_google_trends_df(keyword, states, years=years, output=True)
        # stack() turns the year columns into rows; reset_index() flattens the
        # (state, year) index into columns without a temp-file round trip.
        long_df = wide_df.stack().reset_index()
        long_df.columns = ["State", "Year", keyword]
        long_df = long_df.sort_values(['Year', 'State'])

        # The first keyword provides the State and Year columns; later ones
        # only add their value column (aligned on the shared row index).
        if first_iteration:
            output_df = long_df
            first_iteration = False
        else:
            output_df[keyword] = long_df[keyword]
    return output_df

# for ex2

In [None]:
def build_regression_df(folder = 'data/BRFSS/', threshold=0.1, years = None):
    """Load the ground-truth values and the Google Trends table for regression.

    NOTE(review): this function is unfinished. It loads its inputs but does
    not combine or return them yet, and ``threshold`` is unused.

    Parameters
    ----------
    folder : str, optional
        Folder prefix of the ';'-separated BRFSS ground-truth files.
    threshold : float, optional
        Currently unused.
    years : list of int, optional
        Years whose ground-truth files are read. Defaults to 2011-2016.
    """
    if years is None:
        years = [2011, 2012, 2013, 2014, 2015, 2016]

    # Age adjusted or crude prevalence.
    # NOTE(review): elsewhere in this notebook the crude-prevalence files are
    # referred to without any suffix (e.g. '2011'); confirm that 'CP' matches
    # the actual file names.
    gt_type = ['_AA','CP']
    for type_ in gt_type:
        yearly_frames = []
        for year in years:
            # Use the current type suffix (not the whole list) in the file name.
            gt_filename = folder + str(year) + type_ + '.csv'
            df_gt = pd.read_csv(gt_filename, sep=";", usecols=["Data_Value"])
            # remove the results for Puerto Rico and Guam
            yearly_frames.append(df_gt.drop(df_gt.tail(2).index))
        if yearly_frames:
            df_gt_output = pd.concat(yearly_frames)
        else:
            df_gt_output = pd.DataFrame(columns=["Data_Value"])

    output = pd.read_csv('data/output/output_complete.csv', sep=';')
    # TODO: join `df_gt_output` with `output` and return the regression frame.
    
    
        
# Compute and save the keyword correlations for the selected ground-truth
# files; `kw_list` and `years` come from the keyword-loading cell.
folder = 'data/BRFSS/'
#files_sufix = ['2011_AA','2012_AA','2013_AA','2014_AA','2015_AA','2016_AA','2011','2012','2013','2014','2015','2016']
files_sufix = ['2016_AA']
overall_corr_dict = {}
for sufix in files_sufix:
    df = overall_correlation_df(gt_filename=folder+sufix+'.csv',
                                google_trends_keywords=kw_list,
                                years = years)
    df.to_csv('data/correlation/overall_correlation_' + sufix + '.csv',sep=";", index=True, header=True, index_label='keyword')
    overall_corr_dict[sufix] = df

# NOTE(review): a pasted copy of the opening lines of overall_correlation_df's
# body used to follow here. It was mis-indented and ended in an unterminated
# `try:`, so the cell could not be parsed; it has been removed.

In [None]:
# Build the (State, Year, keyword...) table for the age-adjusted ground truth.
output = get_google_trends_most_correlated_terms_df(file_type="AA", threshold=0.1, years = [2011,2012,2013,2014,2015,2016])
# Write with ';' as separator, like the other CSV files of this notebook and
# as build_regression_df expects when reading this file back. The row index
# carries no information, so it is not written.
output.to_csv("data/output/output_complete.csv", sep=";", index=False)

In [None]:
# Plot the correlation value of each query term.
# NOTE(review): `corr_df` is not assigned at notebook level in the visible
# cells (it only exists as a local inside overall_correlation_df), so this
# cell raises NameError on a fresh kernel. It presumably should be one of the
# frames stored in `overall_corr_dict` - confirm which one.
plt.figure(figsize=(13,7))
plt.plot(corr_df, label ="Age Adjusted prevalence")

plt.legend(fontsize="small")
plt.xlabel("query term")
plt.ylabel("Cor value ")
plt.title("Correlation")
plt.grid()
#plt.xticks(queries)

plt.show()

In [50]:
# Spot check: per-region Google Trends interest for a single keyword and year.
kword = "renal"
year = 2011
pytrends.build_payload(
    [kword],
    cat=0,
    timeframe=str(year) + '-01-01' + ' ' + str(year) + '-12-31',
    geo='US',
    gprop='',
)
trends_per_region_df = pytrends.interest_by_region()
trends_per_region_df

Unnamed: 0_level_0,renal
geoName,Unnamed: 1_level_1
Alabama,73
Alaska,54
Arizona,69
Arkansas,71
California,47
Colorado,53
Connecticut,65
Delaware,63
District of Columbia,52
Florida,64
