# Gather Google Trends data
## Author: John Hennigan
### This notebook can be used to gather data from the Google Trends API using the pytrends library
### <b>IMPORTANT NOTES</b>:
 #### The sleep call is needed when extracting data with pytrends because Google throttles the number of API calls you can make over a given period of time. The sleep keeps you within that limit so you do not end up with incomplete data.

 #### This notebook aggregates the data by year and state; from each group the mean, median, min, and max are extracted. To change the period over which the data is gathered, you can add values to the `timeframes` list. Google will only return so much data per request, so if the results you are getting are fewer than anticipated, try breaking the time frame into sections.

 #### Google limits the number of search terms you can query at one time to 5; this is why multiple calls are made to the API using different term sets.

 #### The final code block is used to show some example charts and how the data can be interacted with

In [13]:
#initialize pytrends and import statements
import statistics
import pandas as pd
from pytrends.request import TrendReq
from time import sleep
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
# Shared pytrends client; gatherDataForKeyWord below issues every request through it.
# NOTE(review): hl is presumably the host language and tz the timezone offset
# in minutes (240 -> UTC-4) — confirm against the pytrends documentation.
pytrends = TrendReq(hl='en-US', tz=240)

In [14]:
#Create the method to extract the keyword data for the given timeframe

def gatherDataForKeyWord(wordlist, stateCode, timeperiod):
    """Fetch the interest-over-time data for a set of search terms.

    Args:
        wordlist: Search terms passed to ``pytrends.build_payload``.
        stateCode: Value passed as the ``geo`` argument (e.g. ``'US-AL'``).
        timeperiod: Value passed as the ``timeframe`` argument.

    Returns:
        Whatever ``pytrends.interest_over_time()`` returns for that payload.
    """
    pytrends.build_payload(
        wordlist,
        cat=0,
        timeframe=timeperiod,
        geo=stateCode,
        gprop='',
    )
    # Pause between building the payload and fetching the result; per the
    # notebook's notes this keeps the calls within Google's throttling limit.
    sleep(1.5)
    trendFrame = pytrends.interest_over_time()
    return trendFrame



#Method to convert the data from pytrends df to project structure
def convertPytrends(df, keywordCols, wordDf):
    """Fold the selected pytrends columns into a year -> values mapping.

    Args:
        df: Frame whose columns are read with ``df[col].to_dict()``; the
            resulting keys are handed to ``extractYearValues``, which reads
            their ``.year`` attribute.
        keywordCols: Names of the columns of ``df`` to read.
        wordDf: Dict mapping year -> list of values. It is updated in place,
            so values from several calls accumulate in the same dict.

    Returns:
        The same ``wordDf`` object that was passed in (previously the
        function returned nothing; existing callers that ignore the return
        value are unaffected).
    """
    for col in keywordCols:
        valuesByTimestamp = df[col].to_dict()
        extractYearValues(valuesByTimestamp, wordDf)
    return wordDf


#Method to group the values of a pytrends column by year (the min, max, mean and median are computed from these groups in getTrendData)
def extractYearValues(colAsDict, wordDF):
    """Append each value of ``colAsDict`` to the list stored under its year.

    Args:
        colAsDict: Mapping whose keys expose a ``.year`` attribute.
        wordDF: Dict mapping year -> list of values; mutated in place.

    Returns:
        ``wordDF`` itself, after the values have been appended.
    """
    for timestamp, value in colAsDict.items():
        # Start a new list the first time a year is seen, then append.
        wordDF.setdefault(timestamp.year, []).append(value)
    return wordDF

def getTrendData(yearDict):
    """Summarise the values collected for each year.

    Args:
        yearDict: Mapping of year -> non-empty sequence of numeric values.

    Returns:
        A dict mapping each year to
        ``{'min': ..., 'max': ..., 'mean': ..., 'median': ...}``.
    """
    def summarize(values):
        return {
            'min': min(values),
            'max': max(values),
            'mean': statistics.mean(values),
            'median': statistics.median(values),
        }

    return {year: summarize(values) for year, values in yearDict.items()}

In [None]:
colKeyWordsDiagnoses = ['depression', 'anxiety', 'ADHD', 'bipolar', 'PTSD']
colKeyWordsPhrases = ['psychiatrists near me', 'psychologist near me', 'therapist near me', 
               'mental hospital']
# Each keyword list is sent as its own request (the notebook's notes state
# that Google allows at most 5 search terms per call).
keywordSets = [colKeyWordsDiagnoses, colKeyWordsPhrases]
# Add further ranges (e.g. '2016-01-01 2020-12-31', '2021-01-01 today') to
# split the request into sections. Ranges should not overlap, otherwise the
# overlapping years receive their values twice.
# NOTE(review): Google Trends presumably scales the values of each request
# independently, so values gathered from different ranges may not share a
# scale — confirm before comparing statistics across ranges.
timeframes = ['2013-01-01 2024-12-31']
resultDict = {}
stateCodes = [ 'US-AL', 'US-AK', 'US-AZ', 'US-AR', 'US-CA', 'US-CO', 'US-CT', 'US-DC', 'US-DE', 
              'US-FL', 'US-GA', 'US-HI', 'US-ID', 'US-IL', 'US-IN', 'US-IA', 'US-KS', 'US-KY', 
              'US-LA', 'US-ME', 'US-MD', 'US-MA', 'US-MI', 'US-MN', 'US-MS', 'US-MO', 'US-MT', 
              'US-NE', 'US-NV', 'US-NH', 'US-NJ', 'US-NM', 'US-NY', 'US-NC', 'US-ND', 'US-OH', 
               'US-OK', 'US-OR', 'US-PA', 'US-RI', 'US-SC', 'US-SD', 'US-TN', 'US-TX', 'US-UT', 
               'US-VT', 'US-VA', 'US-WA', 'US-WV', 'US-WI', 'US-WY']
for state in stateCodes:
    # One year -> values accumulator per search term, shared by every
    # timeframe. Previously the accumulator was reset and the state's entry
    # overwritten for each timeframe, so only the last timeframe survived.
    yearValuesByWord = {word: {} for keywords in keywordSets for word in keywords}
    for timeframe in timeframes:
        print(f"State : {state}, Time : {timeframe}")
        for keywords in keywordSets:
            pyTrendsDf = gatherDataForKeyWord(keywords, state, timeframe)
            for word in keywords:
                # convertPytrends appends into the dict it is given.
                convertPytrends(pyTrendsDf, [word], yearValuesByWord[word])

    # Reduce the collected values to min / max / mean / median per year.
    stateDict = {word: getTrendData(yearValues)
                 for word, yearValues in yearValuesByWord.items()}
    resultDict.update({state : stateDict})
print(resultDict)

In [17]:
# Flatten {state: {condition: {year: stats}}} into one record per
# (state, year, condition); the record key is "<state>_<year>".
rows = []
for state, conditions in resultDict.items():
    for condition, years in conditions.items():
        for year, stats in years.items():
            rows.append({'key': f'{state}_{year}', 'condition': condition, **stats})

# Long-format frame: one row per key/condition pair
df = pd.DataFrame(rows)

# Wide format: one row per key, one column per (statistic, condition) pair
df_pivot = pd.pivot_table(
    df,
    index='key',
    columns='condition',
    values=['min', 'max', 'mean', 'median'],
)

# Collapse the two-level column labels into "<statistic>_<condition>"
flat_names = []
for col in df_pivot.columns.values:
    parts = [str(level) for level in col]
    flat_names.append('_'.join(parts).strip())
df_pivot.columns = flat_names

# Turn the 'key' index back into a regular column
df_pivot = df_pivot.reset_index()

# Write the result to disk
df_pivot.to_csv("../data/pytrends_data.csv")


In [None]:


# Load the data
df = pd.read_csv("../data/pytrends_data.csv")

# Columns holding the yearly mean for each search term; each one is plotted
# in its own figure further down.
mean_columns = [
    'mean_ADHD',
    'mean_PTSD',
    'mean_anxiety',
    'mean_bipolar',
    'mean_depression',
    'mean_mental hospital',
    'mean_psychiatrists near me',
    'mean_psychologist near me',
    'mean_therapist near me',
]

# Function to extract state and year from the key
def extractStateYear(row):
    """Split ``row['key']`` ("<state>_<year>") into ``state`` and ``year``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'key'``
            entry of the form ``"<state>_<year>"``.

    Returns:
        The same ``row`` with ``'state'`` (str) and ``'year'`` (int) set.

    Raises:
        ValueError: If the key has no underscore or the part after the last
            underscore is not an integer.
    """
    # Split on the last underscore only: the year is always the final
    # component, so a state label that itself contains "_" still unpacks
    # into exactly two parts.
    state, year = row['key'].rsplit('_', 1)
    row['state'] = state
    row['year'] = int(year)
    return row

# Apply the function to extract state and year
df = df.apply(extractStateYear, axis=1)

# Distinct states in order of first appearance; every state keeps the same
# colour and marker across all figures.
states = df['state'].unique()

# Gradient colour palette from seaborn, one colour per state
palette = sns.color_palette("coolwarm", len(states))
color_dict = dict(zip(states, palette))

# Marker shapes, reused cyclically once the list runs out
shapes = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']
shape_cycle = cycle(shapes)
shape_dict = dict(zip(states, shape_cycle))

# Stand-alone figure that only carries the legend (empty series per state)
plt.figure(figsize=(12, 3))
for state in states:
    plt.plot(
        [],
        [],
        color=color_dict[state],
        marker=shape_dict[state],
        linestyle='None',
        label=state,
    )
plt.legend(
    ncol=6,
    fontsize='small',
    title="State Color and Shape Key",
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
)
plt.axis('off')
plt.show()

# One figure per mean column, with one line per state
for mean_column in mean_columns:
    plt.figure(figsize=(10, 6))

    for state in states:
        in_state = df['state'] == state
        state_data = df[in_state]
        plt.plot(
            state_data['year'],
            state_data[mean_column],
            label=state,
            color=color_dict[state],
            marker=shape_dict[state],
            linestyle='-',
        )

    plt.xlabel('Year')
    plt.ylabel(mean_column)
    plt.title(f'Change in {mean_column} Over Years for Each State')
    plt.grid(True)
    plt.show()
