https://towardsdatascience.com/using-google-trends-at-scale-1c8b902b6bfa

In [10]:
!echo $CONDA_DEFAULT_ENV

base


In [11]:
!echo $CONDA_PREFIX


/home/jak/miniconda3/envs/ml4t_gpu


In [12]:
from calendar import monthrange
from datetime import timedelta, datetime, date
from functools import partial
from random import randrange, randint
from time import sleep

import pandas as pd
from pytrends.exceptions import ResponseError
from pytrends.request import TrendReq
import matplotlib.pyplot as plt


def get_timeframe_daily_resolution(start: date, stop: date) -> str:
    """Given two dates, returns a string representing the interval between the
    dates. This is string is used to retrieve data for a specific time frame
    from Google Trends.
    """
    return f"{start.strftime('%Y-%m-%d')} {stop.strftime('%Y-%m-%d')}"

def get_timeframe_hourly_resolution(start: datetime, stop: datetime) -> str:
    """Given two datetimes, returns a string representing the interval between the
    dates. This is string is used to retrieve data for a specific time frame
    from Google Trends.
    """
    return f"{start.strftime('%Y-%m-%dT%H')} {stop.strftime('%Y-%m-%dT%H')}"



def _fetch_data(pytrends, build_payload, timeframe: str) -> pd.DataFrame:
    """Attempts to fecth data and retries in case of a ResponseError."""
    attempts, fetched = 0, False
    if randint(0, 10) == 0:
        print("sleep 60 s")
        sleep(60)

    while not fetched:
        try:
            build_payload(timeframe=timeframe)
        except Exception:
            wait_time = 300 * attempts  # Start with 0 due to timeouts
            print(f'Trying again in {wait_time / 60:.0f} minutes.')
            sleep(wait_time)
            attempts += 1
        else:
            result = pytrends.interest_over_time()
            if result.empty:
                print("dataframe empty, sleep 5 min")
                attempts += 1
            else:
                fetched = True
    return pytrends.interest_over_time()


def get_hourly_data(search_term: str,
                    start_year: int = 2020,
                    start_month: int = 1,
                    start_day: int = 1,
                    stop_year: int = 2021,
                    stop_month: int = 12,
                    stop_day: int = 31,
                    geo: str = '',  # for worldwide aggregation
                    tz: int = 0,  # for utc!
                    cat: int = 0,
                    verbose: bool = True,
                    clean: bool = True,
                    wait_time: float = 5.0,
                    shift_hourly: bool = False) -> pd.DataFrame:
    """Given a search term, fetches daily search volume data from Google Trends and
    returns results in a pandas DataFrame.
    Details: Due to the way Google Trends scales and returns data, special
    care needs to be taken to make the daily data comparable over different
    months. To do that, we download daily data on a month by month basis,
    and also monthly data. The monthly data is downloaded in one go, so that
    the monthly values are comparable amongst themselves and can be used to
    scale the daily data. In a given month, the daily data is scaled so that
    the month by month average of daily values is equal to the values at the
    monthly frequency. That is, the daily data is scaled by multiplying the
    daily values by the ratio of the monthly series value to the monthly
    average of the daily data.

    Args:
        search_term (str): search_term to fetch daily data for.
        start_year (int): First year to fetch data for. Starts at the beginning
            of this year (1st of January).
        start_month (int): First month of the first year
        start_day (int): First day of the first year
        stop_year (int): Last year to fetch data for (inclusive).
        stop_month (int): Last month of the last year
        stop_day (int): Last day
        geo (str): Geographical area code. Default at 'US'.
        tz (int): Time zone, minutes offset off GMT (240 for US EST).
        cat (int): Category, default 0 for no category. Use trends.google.com and
        check the header for more information.
        verbose (bool): If True, then prints the word and current time frame
            we are fecthing the data for.
        clean (bool): If True, clean up the dataframe, else leave information for
            diagnostics
        wait_time (float): Scaling factor for how much to wait between data
            requests. If 0, then a new request is sent at about every 0.5
            second. The default of 5 seconds implies in a new request being
            sent at about every 3 seconds (random).
        shift_hourly (bool): shifts for 3 days which is useful for validation (see tests)
    Returns:
        complete (pd.DataFrame): Contains 4 columns.
            The column named after the word argument contains the daily search
            volume already scaled and comparable through time.
            The column f'{word}_{geo}_unscaled' is the original daily data
            fetched month by month, and it is not comparable across different
            months (but is comparable within a month).
            The column f'{word}_{geo}_monthly' contains the original monthly
            data fetched at once. The values in this column have been
            backfilled so that there are no NaN present.
            The column 'scale' contains the scale used to obtain the scaled
            daily data.
    """

    # Set up start and stop dates
    start_date = datetime(start_year, start_month, start_day)
    # stop_date cannot be later than today's date
    stop_date = min([datetime(stop_year, stop_month, stop_day), datetime.today()])

    # Start pytrends for US region
    pytrends = TrendReq(tz=tz)
    # Initialize build_payload with the search_term we need data for
    build_payload = partial(pytrends.build_payload, kw_list=[search_term], cat=cat, geo=geo, gprop='')


    daily = get_daily_gtrends(build_payload, geo, pytrends, search_term, start_date, stop_date, verbose, wait_time)
    monthly = get_monthly_gtrends(build_payload, pytrends, search_term, start_date, stop_date, verbose)

    daily['interest_daily_monthly_mean'] = daily['interest_daily_raw'].resample('M').mean()
    daily['interest_daily_monthly_mean'].bfill(inplace=True)  # Fill in backward because 'monthly' resampling is the
    # other way round
    monthly_daily_interest = daily.join(monthly)


    # fill NaN values
    monthly_daily_interest['interest_monthly'].ffill(inplace=True)
    # compute month_by_day_scale
    monthly_daily_interest['month_day_scale'] = monthly_daily_interest['interest_monthly'] / monthly_daily_interest['interest_daily_monthly_mean']
    monthly_daily_interest['interest_daily'] = monthly_daily_interest['interest_daily_raw'] * monthly_daily_interest.month_day_scale

    # hourly = get_hourly_gtrends(geo, pytrends, search_term, start_date, stop_date, verbose, wait_time)
    # Compute month by month averages of daily data
    """
    if shift_hourly:
        hourly = get_hourly_gtrends(build_payload, geo, pytrends, search_term, start_date-timedelta(days=3), stop_date, verbose, wait_time)
    else:
        hourly = get_hourly_gtrends(build_payload, geo, pytrends, search_term, start_date, stop_date, verbose, wait_time)

    hourly['interest_hourly_7D_mean'] = hourly['interest_hourly_raw'].resample('7D').mean()
    hourly['interest_hourly_7D_mean'].ffill(inplace=True)  # Fill in forward
    interest = hourly.join(monthly_daily_interest)

    # Compute 7D mean for hourly usage
    interest['interest_daily_7D_mean'] = interest['interest_daily'].resample('7D').mean()
    interest['interest_daily_7D_mean'].ffill(inplace=True)  # fill NaN values
    # Scale hourly data by 7-day weights so the data is comparable

    interest['daily_hourly_scaling'] = interest['interest_daily_7D_mean'] / interest['interest_hourly_7D_mean']
    interest['interest_hourly'] = interest['interest_hourly_raw'] * interest.daily_hourly_scaling

    interest['interest_monthly'].ffill(inplace=True)
    interest['interest_daily'].ffill(inplace=True)
    print('hourly mean {}'.format(interest['interest_hourly'].mean()))


    """

    if clean:
        cols_to_keep = ['interest_monthly',
                        'interest_daily',
                        #'interest_hourly'
                        ]
        interest = monthly_daily_interest.loc[:, cols_to_keep]

    print('monthly mean {}'.format(interest['interest_monthly'].mean()))
    print('daily mean {}'.format(interest['interest_daily'].mean()))

    interest['search_term'] = search_term
    interest['google_category'] = cat
    interest.set_index(['search_term', 'google_category'], inplace=True, append=True)
    # interest.stack(['search_term', 'google_category'])
    return interest


def get_hourly_gtrends(build_payload, geo, pytrends, search_term, start_date, stop_date, verbose, wait_time):
    # Get daily data, month by month
    results = []
    attempts = 0
    # If a timeout or too many requests error occur we need to adjust wait time
    current = start_date
    while current < stop_date:
        end_date = current + timedelta(days=7)

        timeframe = get_timeframe_hourly_resolution(current, end_date)
        if verbose:
            print(f'{search_term}/{geo}:{timeframe}')
        result = _fetch_data(pytrends, build_payload, timeframe)

        result.rename(columns={search_term: 'interest_hourly_raw',
                               'isPartial': 'hourly_isPartial'}, inplace=True)
        result = result['interest_hourly_raw'].apply(lambda x: max(x, 0.1))

        if verbose:
            print(result)
        results.append(result)
        current = end_date + timedelta(hours=1)

        # Don't go too fast or Google will send 429s
        sleep(randrange(10, round(10 * wait_time)) / 10)

    # Concatenate daily data into a single dataframe
    daily = pd.concat(results)
    return daily

"""
def get_hourly_gtrends_old(geo, pytrends, search_term, start_date, stop_date, verbose, wait_time):
    # Get hourly data, week by week
    results = []
    # If a timeout or too many requests error occur we need to adjust wait time
    current = start_date
    while current < stop_date:
        end_date = current + timedelta(days=7) - timedelta(hours=1)
        print(f'{search_term} {geo} : {current} to {end_date}')
        result = pytrends.get_historical_interest([search_term], year_start=current.year,
                                                  month_start=current.month,
                                                  day_start=current.day,
                                                  hour_start=current.hour,
                                                  year_end=end_date.year,
                                                  month_end=end_date.month,
                                                  day_end=end_date.day,
                                                  hour_end=end_date.hour,
                                                  cat=0, geo='', gprop='', sleep=0)
        result.rename(columns={search_term: 'interest_hourly_raw',
                               'isPartial': 'hourly_isPartial'}, inplace=True)

        results.append(result)
        if verbose:
            print(result)
        current = current + timedelta(days=7)
        # Don't go too fast or Google will send 429s
        sleep(randrange(10, round(10 * wait_time)) / 10)
    # Concatenate daily data into a single dataframe
    hourly = pd.concat(results)
    return hourly
"""

def get_daily_gtrends(build_payload, geo, pytrends, search_term, start_date, stop_date, verbose, wait_time):
    # Get daily data, month by month
    results = []
    # If a timeout or too many requests error occur we need to adjust wait time
    current = start_date
    while current < stop_date:
        lastDateOfMonth = datetime(current.year, current.month,
                                   monthrange(current.year, current.month)[1])
        timeframe = get_timeframe_daily_resolution(current, lastDateOfMonth)
        print(f'{search_term}/{geo}:{timeframe}')
        result = _fetch_data(pytrends, build_payload, timeframe)

        result.rename(columns={search_term: 'interest_daily_raw',
                               'isPartial': 'daily_isPartial'}, inplace=True)
        # result = result['interest_daily_raw'].apply(lambda x: max(x, 0.1))

        if verbose:
            print(result)
        if not result.empty:
            results.append(result)
            current = lastDateOfMonth + timedelta(days=1)

        # Don't go too fast or Google will send 429s
        sleep(randrange(10, round(10 * wait_time)) / 10)
    # Concatenate daily data into a single dataframe
    daily = pd.concat(results)
    return daily


def get_monthly_gtrends(build_payload, pytrends, search_term, start_date, stop_date, verbose):
    # Obtain monthly data for all months in years [2004, stop_year]
    monthly = _fetch_data(pytrends, build_payload,
                          get_timeframe_daily_resolution(datetime(2004, 1, 1), stop_date))[start_date:stop_date]
    monthly.rename(columns={search_term: 'interest_monthly',
                            'isPartial': 'monthly_isPartial'}, inplace=True)
    monthly = monthly['interest_monthly'].apply(lambda x: max(x, 0.1))
    if verbose:
        print(monthly)
    return monthly

In [None]:
# Get the coins and coin names and save google trend data for the words and categories 

In [13]:
DATA_STORE = '../data/crypto.h5'

with pd.HDFStore(DATA_STORE) as store:
    # todo insert market
    market = store['coingecko/top100/market']
    # cats = store['coingecko/top100/cats']
    # col_list = ['name', 'id']
    # market_cut = market.loc[:, col_list]
    prices = store['crypto/caggle/prices']

In [14]:
# get the list of symbols from prices data
prices_symbols = prices.index.get_level_values('symbol').unique()
prices_symbols, len(prices_symbols)


(Index(['btc', 'ltc', 'eth', 'etc', 'xmr', 'xrp', 'miota', 'eos', 'neo', 'trx',
        'dai', 'mtn', 'xlm', 'mkr', 'man', 'vet', 'xtz', 'bsv', 'usdt', 'usdc',
        'btt', 'atom', 'wbtc', 'okb', 'algo', 'ftt', 'doge', 'ada', 'dot',
        'ksm', 'uni', 'fil', 'sol', 'aave', 'avax', 'bch', 'link', 'luna'],
       dtype='object', name='symbol'),
 38)

In [15]:
def symbol_to_store_path(symbol):
    return 'crypto/gtrends/' + symbol + '_gtrend'

print(symbol_to_store_path('btc'))


crypto/gtrends/btc_gtrend


In [16]:
market


Unnamed: 0_level_0,id,market_cap,name,genesis_date,market_cap_rank,hashing_algorithm,coingecko_rank,coingecko_score,developer_score,community_score,liquidity_score,public_interest_score
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
btc,bitcoin,943219001348,Bitcoin,2009-01-03,1,SHA-256,2,79.223,98.887,65.771,100.158,0.0
eth,ethereum,444995246879,Ethereum,2015-07-30,2,Ethash,3,77.163,97.176,62.612,98.968,0.0
bnb,binancecoin,92441015166,Binance Coin,2017-07-08,3,,5,67.775,73.253,66.032,83.296,0.0
xrp,ripple,60573262757,XRP,,4,,9,65.306,71.120,54.187,87.663,0.0
usdt,tether,58147824102,Tether,,5,,151,41.650,0.000,10.676,107.153,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
rvn,ravencoin,1186666191,Ravencoin,2018-01-03,96,x16r,39,54.848,68.040,46.428,61.649,0.0
ar,arweave,1184818634,Arweave,,96,,672,25.543,0.000,9.027,42.365,0.0
tusd,true-usd,1177644257,TrueUSD,2018-03-05,98,,131,43.754,62.725,8.702,55.802,0.0
pax,paxos-standard,1172924271,Paxos Standard,,99,,174,39.457,47.971,8.271,52.646,0.0


In [17]:
ranked_market = market.sort_values(by='market_cap_rank')

with pd.HDFStore(DATA_STORE) as store:
    store_key = 'crypto/gtrends/google_trends_df'
    if store_key in store:
        google_trends_df = store[store_key]

        print(f"found {store_key}")
    else:
        google_trends_df = pd.DataFrame()
        print(f"{store_key} not found. ")



# Printing Name and AvgBill. In this case, "x" is a series with index of column names
for index, contents in ranked_market.iterrows():
    symbol = contents.name
    symbol_name = contents['name']
    print("symbol {}\nname {}".format(symbol, symbol_name))
    search_terms = [contents.name, contents['name']]
    category = 7 # finance
    if symbol in prices_symbols:
        for search_term in search_terms:
            if not google_trends_df.empty:
                print("google trends is not empty")
                if symbol in google_trends_df.index.get_level_values('symbol'):
                    print("found {} in gtrends_df".format(symbol))
                    if search_term in google_trends_df.index.get_level_values('search_term'):
                            print("found {} in gtrends_df".format(search_term))
                            if category in google_trends_df.index.get_level_values('google_category'):
                                print("found category {} in gtrends_df".format(category))
                                print("break")
                                break
            print("get data from google")
            # if not break due to already aquired data:
            google_trend = get_hourly_data(search_term, start_year=2016, stop_year=2021,
                                           cat=category, wait_time=10,)
            google_trend['symbol'] = symbol
            google_trend.set_index('symbol', inplace=True, append=True)

            print("resulting google_trend: \n{}".format(google_trend))

            google_trends_df = pd.concat([google_trends_df, google_trend])
            print("google_trends_df: \n{}".format(google_trends_df))


            with pd.HDFStore(DATA_STORE) as store:
                store.put(store_key, google_trends_df)

found crypto/gtrends/google_trends_df
symbol btc
name Bitcoin
google trends is not empty
found btc in gtrends_df
found btc in gtrends_df
found category 7 in gtrends_df
break
symbol eth
name Ethereum
google trends is not empty
found eth in gtrends_df
found eth in gtrends_df
found category 7 in gtrends_df
break
symbol bnb
name Binance Coin
symbol xrp
name XRP
google trends is not empty
get data from google
xrp/:2016-01-01 2016-01-31
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empty, sleep 5 min
dataframe empt

KeyboardInterrupt: 

# Tests for making a library

In [None]:
"""

google_trend = get_hourly_data('AMG Mercedes', start_year=2021, stop_year=2021, stop_month=2, stop_day=28,
                               # cat=7,
                               clean=False)
fig1, ax1 = plt.subplots(1, 1, figsize=(24, 16))

google_trend.ffill(inplace=True)

google_trend.plot(ax=ax1)
fig1.savefig('monthly_daily_interest.svg')


In [None]:
google_trend_shifted = get_hourly_data('AMG Mercedes', start_year=2021, stop_year=2021, stop_month=2, stop_day=28,
                               clean=False, shift_hourly=True)

google_trend_shifted

make sure trends are aligned by testing for the difference of a differently sampled daily period 
(already visually confirmed)



In [None]:
cols_keep = ['interest_daily', 'interest_hourly']
joined_gtrends = google_trend[cols_keep].join(google_trend_shifted[cols_keep]*1.01, rsuffix='_shifted')

fig, ax = plt.subplots(1,1, figsize=(32,20))
joined_gtrends.plot(ax=ax)
fig.savefig('shifted_validation.svg')

In [None]:
fig, ax1 = plt.subplots(1, 1, figsize=(24,16))

google_trend.plot(ax=ax1)
fig.savefig('monthly_daily_interest.svg')

In [None]:
google_trend

In [None]:
google_trend_shifted.tail(100)