In [1]:
from pytrends.request import TrendReq
import json
import pandas as pd  
import numpy as np 
import joypy
import matplotlib.pyplot as plt
from  matplotlib import cm

%cd ..

/mnt/c/Users/jacda/OneDrive - ITU/Documents/ITU/Data_in_the_wild/EV_Project


Since the unofficial Google Trends API, pytrends only supported city based queries for the US we had to redefine the TrendReq class to include cities from other countries as well.

In [141]:

class MyTrendReq(TrendReq):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
                           inc_geo_code=False):
        """Request data from Google's Interest by Region section and return a dataframe"""

        # make the request
        region_payload = dict()

        if self.geo == '': 
            self.interest_by_region_widget['request']['resolution'] = resolution 
        elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']: #DMA only exists for US
            self.interest_by_region_widget['request']['resolution'] = resolution 
        elif len(self.geo) == 2 and resolution in ['CITY', 'REGION']: #If not US
            self.interest_by_region_widget['request']['resolution'] = resolution        

        self.interest_by_region_widget['request'][
            'includeLowSearchVolumeGeos'] = inc_low_vol

        # convert to string as requests will mangle
        region_payload['req'] = json.dumps(
            self.interest_by_region_widget['request'])
        region_payload['token'] = self.interest_by_region_widget['token']
        region_payload['tz'] = self.tz

        # parse returned json
        req_json = self._get_data(
            url=TrendReq.INTEREST_BY_REGION_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=region_payload,
        )
        df = pd.DataFrame(req_json['default']['geoMapData'])
        if (df.empty):
            return df

        # rename the column with the search keyword
        df = df[['geoName', 'value']].set_index(
            ['geoName']).sort_index()
        # split list columns into seperate ones, remove brackets and split on comma
        result_df = df['value'].apply(lambda x: pd.Series(
            str(x).replace('[', '').replace(']', '').split(',')))
        if inc_geo_code:
            result_df['geoCode'] = df['geoCode']

        # rename each column with its search term
        for idx, kw in enumerate(self.kw_list):
            result_df[kw] = result_df[idx].astype('int')
            del result_df[idx]

        return result_df



We implemented two functions to generate the the timeframes for the queries, one for months and one for weeks. Since weeks are a bit asynchronous with how they fall within different years the weeks implementation will include a few dates from before or after the year specified.

In [142]:
import datetime
import time

def getDateRangeFromWeek(p_year,p_week):

    firstdayofweek = datetime.datetime.strptime(f'{p_year}-W{int(p_week )- 1}-1', "%Y-W%W-%w").date()
    lastdayofweek = firstdayofweek + datetime.timedelta(days=6.9)
    return firstdayofweek, lastdayofweek

def getDateRangeFromMonth(p_year,p_month):

    firstdayofweek = datetime.datetime.strptime(f'{int(p_year)}-{int(p_month)}-15', "%Y-%m-%d").date()
    if int(p_month) == 12:
        lastdayofweek = datetime.datetime.strptime(f'{int(p_year)+1}-{1}-15', "%Y-%m-%d").date()
    else:
        lastdayofweek = datetime.datetime.strptime(f'{int(p_year)}-{int(p_month)+1}-15', "%Y-%m-%d").date()
    return firstdayofweek, lastdayofweek



Initiating class and testing that the implementation works

We choose sports as our reference term given our analysis carried out in the file 'Stable_trends_timeseries.ipynb'

In [144]:
pytrends = MyTrendReq(hl='en-US', tz=-60, timeout=(15,30), retries=3, backoff_factor=0.2)
keywords = ['/m/06ntj', '/m/03nlf2w'] #specify category ID listed below

# news /m/05jhg
# EV '/m/03nlf2w'
#movies /m/02vxn	
#weather /m/0866r
#sport /m/06ntj
#music /m/04rlf
#weather forecst /m/0jp7j

pytrends.build_payload(kw_list = keywords, geo="DK-84", timeframe= '2017-01-01T00 2017-01-08T00', gprop='')
df = pytrends.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=False)


In [145]:
# Function to append queries to dataframe loop through list of years and weeks
# Catch exceptions from the pytrends API when it fails print the exception
trends = pd.DataFrame()
DK_geos = {'Nordjylland':"DK-81", 'Hovedstaden':"DK-84", 'Midtjylland':"DK-82", 'Sjælland':"DK-85", 'Syddanmark':"DK-83"}
def get_trends(years, trends, interval = [i+1 for i in range(52)]):    
    pl_df = pd.DataFrame()
    for reg, key in DK_geos.items():
        pytrends.build_payload(kw_list = keywords, geo= key, timeframe= '2017-01-01T00 2017-01-08T00',)
        pl = pytrends.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=False)
        pl.index.name = 'City'
        pl.reset_index(inplace=True)
        pl['Region'] = reg
        pl_df = pl_df.append(pl)
    pl_df = pl_df[['City', 'Region']]
    if isinstance(years, int):
        years = [years]
    for year in years:    
        for week in interval:
            print(week)
            reg_df = pd.DataFrame()
            for reg, key in DK_geos.items():
                try:
                    firstdate, lastdate =  getDateRangeFromWeek(str(year),str(week))
                    inter = str(firstdate)+'T00 ' + str(lastdate)+'T00'
                    pytrends.build_payload(kw_list = keywords, geo= key, timeframe= inter)
                    df = pytrends.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=False)
                    df.index.name = 'City'
                    df.reset_index(inplace=True)
                    df['Region'] = reg
                    df['included'] = 1
                    reg_df = reg_df.append(df)
                except Exception as e:
                    print(repr(e))
                    df = pl_df[pl_df['Region']==reg]
                    df['included'] = 1
                    reg_df = reg_df.append(df)
                    continue
            df = pd.merge(pl_df, reg_df, how = 'left')
            df['interval'] = inter
            df['week'] = week
            df['year'] = year
            trends = trends.append(df) 
        time.sleep(60)
    return trends


The following block of code collects the Google Trends data. The time it takes to execute varies between the choice of keyterms. We first collect the data for all years between 2017 and 2020. Second we specify 2021 until week 45.

The API will sometimes throw the error 500, which seems to be a server-side error from Google. At least other users have encountered the same issue in which queries break.

If the API returns error code 429 it means that Google is stopping you from querying. Using a VPN solves this problem efficiently.

In [None]:
car_trends = get_trends([2017,2018,2019,2020], trends)
car_trends = get_trends([2021], car_trends, interval=[i+1 for i in range(48)]) #set to current week

In [149]:
car_trends = car_trends.rename(columns={"/m/03nlf2w": "Elbil", "/m/06ntj": "Sports"})
car_trends.to_csv('trends/sports_trends_f.csv')

In [2]:
trends_df = pd.read_csv('trends/sports_trends_f.csv')

names = ['City', 'Region', 'Sports', 'Elbil', 'included',
       'interval', 'week', 'year']

Vallensbæk_ind = trends_df[trends_df['City'] == 'Vallensbæk']['Unnamed: 0.1'].unique()
print(Vallensbæk_ind)

trends_df['City'][trends_df['Unnamed: 0.1'].isin(Vallensbæk_ind[[0,2]])] = 'Vallensbæk Strand' #Vallensbæk Strand was called Vallensbæk
trends_df = trends_df.drop(trends_df[trends_df['Unnamed: 0.1'].isin(Vallensbæk_ind[[2,3]])].index) #It also contained duplicates

trends_df = trends_df.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis = 1)

trends_df.to_csv('Datasets/src/trends_data.csv')


[90 91 92 93]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trends_df['City'][trends_df['Unnamed: 0.1'].isin(Vallensbæk_ind[[0,2]])] = 'Vallensbæk Strand' #Vallensbæk Strand was called Vallensbæk
