<a href="https://colab.research.google.com/github/kzumreen/FoodTrendsPrediction/blob/main/pytrends_eda_notebook_info.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTrends EDA

## Setup

In [1]:
#!pip install pytrends
#!pip install statsmodels
#!pip install plotly pycountry wordcloud
#!pip install pycountry pycountry_convert plotly

# Cell 1 — imports & palette
import pandas as pd
import numpy as np
from pytrends.request import TrendReq
import pycountry
import pycountry_convert as pc
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import plotly.express as px
import seaborn as sns
from datetime import datetime

plt.rcParams['figure.figsize'] = (14,5)
sns.set(style="whitegrid")

# Your color palette
colors = {
    'dubai-chocolate':'#3A1F04',
    'feta-pasta':'#B90E0A',
    'matcha':'#32612D',
    'airfryer':'#D2691E'
}
def trend_color(tid): 
    return colors.get(tid, '#1f77b4')

# normalize Trend_ID helper (same rules you specified)
import re
def normalize_trend_id(name: str) -> str:
    if pd.isna(name): return ''
    s = str(name).lower().strip()
    s = re.sub(r'[_\s]+','-', s)
    s = re.sub(r'[^a-z0-9\-]','', s)
    s = re.sub(r'-{2,}','-', s)
    return s.strip('-')

In [2]:
# Cell 2 — config
pytrends = TrendReq(hl='en-US', tz=360)  # adjust tz if you want

# Replace with your actual list of trend names (display names) if different
TREND_LIST = [
    "airfryer",
    "dubai chocolate",
    "feta pasta",
    "matcha"
]

# Timeframe: change as needed. Use 'today 5-y' or explicit "YYYY-MM-DD YYYY-MM-DD"
TIMEFRAME = "2019-01-01 2025-10-01"  

print("Trends to fetch:", TREND_LIST)
print("Timeframe:", TIMEFRAME)


Trends to fetch: ['airfryer', 'dubai chocolate', 'feta pasta', 'matcha']
Timeframe: 2019-01-01 2025-10-01


In [5]:
# Cell 3 — fetch interest_over_time (daily)
rows = []
for tn in TREND_LIST:
    try:
        pytrends.build_payload([tn], timeframe=TIMEFRAME, geo='')
        df_ot = pytrends.interest_over_time()
        if df_ot.empty:
            print(f"No time-series for {tn}")
            continue
        # df_ot indexed by Timestamp; column name is the original term
        for dt, r in df_ot.iterrows():
            val = r.get(tn, np.nan)
            rows.append({
                "Trend Name": tn,
                "Trend_ID": normalize_trend_id(tn),
                "Date": dt.date(),   # store as date for now
                "Region": "WORLD",
                "Interest": float(val) if not pd.isna(val) else np.nan
            })
    except Exception as e:
        print("Error fetching time-series for", tn, e)

df_time = pd.DataFrame(rows)
print("Time-series rows fetched:", len(df_time))
df_time.head()


  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)


Time-series rows fetched: 328


  df = df.fillna(False)


Unnamed: 0,Trend Name,Trend_ID,Date,Region,Interest
0,airfryer,airfryer,2019-01-01,WORLD,19.0
1,airfryer,airfryer,2019-02-01,WORLD,15.0
2,airfryer,airfryer,2019-03-01,WORLD,14.0
3,airfryer,airfryer,2019-04-01,WORLD,12.0
4,airfryer,airfryer,2019-05-01,WORLD,13.0


In [9]:
# Cell 4 — choose the countries you want region-level maps for
# Use ISO2 codes (pytrends expects geo as ISO2 for country-level region fetch)
# Common codes: US, IN, AE, KR, GB, AU, CA, JP, DE, FR
CUSTOM_COUNTRIES = {
    'US': 'United States',
    'IND': 'India',
    'UAE': 'United Arab Emirates',
    'SK': 'South Korea',
    'UK': 'United Kingdom',
    # add more if you like, e.g. 'AU':'Australia', 'CA':'Canada'
}

# How many top subregions to show in bar charts / heatmaps
TOP_N_REGIONS = 6

# US state name -> USPS abbreviation mapping (for choropleth)
US_STATE_ABBREV = {
 'Alabama':'AL','Alaska':'AK','Arizona':'AZ','Arkansas':'AR','California':'CA','Colorado':'CO','Connecticut':'CT',
 'Delaware':'DE','District of Columbia':'DC','Florida':'FL','Georgia':'GA','Hawaii':'HI','Idaho':'ID','Illinois':'IL',
 'Indiana':'IN','Iowa':'IA','Kansas':'KS','Kentucky':'KY','Louisiana':'LA','Maine':'ME','Maryland':'MD','Massachusetts':'MA',
 'Michigan':'MI','Minnesota':'MN','Mississippi':'MS','Missouri':'MO','Montana':'MT','Nebraska':'NE','Nevada':'NV','New Hampshire':'NH',
 'New Jersey':'NJ','New Mexico':'NM','New York':'NY','North Carolina':'NC','North Dakota':'ND','Ohio':'OH','Oklahoma':'OK',
 'Oregon':'OR','Pennsylvania':'PA','Rhode Island':'RI','South Carolina':'SC','South Dakota':'SD','Tennessee':'TN','Texas':'TX',
 'Utah':'UT','Vermont':'VT','Virginia':'VA','Washington':'WA','West Virginia':'WV','Wisconsin':'WI','Wyoming':'WY'
}

print("Custom countries selected:", CUSTOM_COUNTRIES)

Custom countries selected: {'US': 'United States', 'IND': 'India', 'UAE': 'United Arab Emirates', 'SK': 'South Korea', 'UK': 'United Kingdom'}


In [15]:
# Robust fetch for subregions (fixed TrendReq init)
import time
import random
from pytrends.request import TrendReq

# Correct TrendReq initialization:
# - timeout passed as timeout= (not inside requests_args)
# - headers inside requests_args only
pytrends = TrendReq(
    hl='en-US',
    tz=360,
    timeout=(10, 25),  # tuple allowed: (connect timeout, read timeout)
    requests_args={
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/115.0 Safari/537.36'
        }
    }
)

# parameters
PRIMARY_TIMEFRAME = TIMEFRAME  # from your earlier cell, e.g. "2019-01-01 2025-10-01"
FALLBACK_TIMEFRAME = 'today 12-m'   # usually accepted for region queries
MAX_RETRIES = 4
SLEEP_BETWEEN_CALLS = (1.0, 2.5)  # random sleep range in seconds

region_rows = []
failures = []

def safe_interest_by_region(trend_name, geo_iso2, timeframe):
    """Call interest_by_region with retries; return DataFrame or raise exception."""
    last_exc = None
    for attempt in range(1, MAX_RETRIES+1):
        try:
            pytrends.build_payload([trend_name], timeframe=timeframe, geo=geo_iso2)
            df_sub = pytrends.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=False)
            return df_sub
        except Exception as e:
            last_exc = e
            err_msg = str(e)
            print(f"   attempt {attempt} failed for {trend_name} in {geo_iso2}: {err_msg}")
            # exponential backoff + jitter
            sleep_time = min((2 ** (attempt-1)) + random.uniform(0.5, 1.5), 10)
            time.sleep(sleep_time)
    # all retries exhausted
    raise last_exc

print("Starting robust subregion fetch (fixed TrendReq)...")

for iso2, country_name in CUSTOM_COUNTRIES.items():
    print(f"\nFetching subregions for country: {country_name} ({iso2})")
    for tn in TREND_LIST:
        success = False
        # Try primary timeframe first, then fallback
        for timeframe_try in (PRIMARY_TIMEFRAME, FALLBACK_TIMEFRAME):
            try:
                df_sub = safe_interest_by_region(tn, iso2, timeframe_try)
                if df_sub is None or df_sub.empty:
                    # Received empty result; treat as failure for this timeframe
                    print(f"  empty response for {tn} in {country_name} with timeframe {timeframe_try}")
                    continue
                # success: collect rows
                for subregion_name, row in df_sub.iterrows():
                    val = row.get(tn, np.nan)
                    region_rows.append({
                        "Trend Name": tn,
                        "Trend_ID": normalize_trend_id(tn),
                        "Country_ISO2": iso2,
                        "Country_Name": country_name,
                        "Subregion": subregion_name,
                        "Date": pd.NaT,
                        "Interest": float(val) if not pd.isna(val) else np.nan,
                        "Timeframe_Used": timeframe_try
                    })
                print(f"  fetched {len(df_sub)} subregions for {tn} (timeframe={timeframe_try})")
                success = True
                # polite pause between trend calls
                time.sleep(random.uniform(*SLEEP_BETWEEN_CALLS))
                break
            except Exception as e:
                msg = str(e)
                print(f"  FAILED {tn} in {country_name} with timeframe {timeframe_try}: {msg}")
                # try next timeframe (fallback) or record failure later
                continue

        if not success:
            failures.append((iso2, country_name, tn))
            # small extra pause before next trend to reduce chance of blocks
            time.sleep(random.uniform(*SLEEP_BETWEEN_CALLS))

# Build DataFrame
df_subregions = pd.DataFrame(region_rows)
print("\nTotal subregion rows fetched:", len(df_subregions))
print("Total failures (country, trend) pairs:", len(failures))
if failures:
    print("Failures (sample up to 20):", failures[:20])

# Save results
out_csv = "/mnt/data/df_subregions_robust.csv"
df_subregions.to_csv(out_csv, index=False)
print("Saved df_subregions to", out_csv)


Starting robust subregion fetch (fixed TrendReq)...

Fetching subregions for country: United States (US)
  fetched 51 subregions for airfryer (timeframe=2019-01-01 2025-10-01)
  fetched 51 subregions for dubai chocolate (timeframe=2019-01-01 2025-10-01)
  fetched 51 subregions for feta pasta (timeframe=2019-01-01 2025-10-01)
  fetched 51 subregions for matcha (timeframe=2019-01-01 2025-10-01)

Fetching subregions for country: India (IND)
   attempt 1 failed for airfryer in IND: The request failed: Google returned a response with code 400
   attempt 2 failed for airfryer in IND: The request failed: Google returned a response with code 400
   attempt 3 failed for airfryer in IND: The request failed: Google returned a response with code 400
   attempt 4 failed for airfryer in IND: The request failed: Google returned a response with code 400
  FAILED airfryer in India with timeframe 2019-01-01 2025-10-01: The request failed: Google returned a response with code 400
   attempt 1 failed for 

OSError: Cannot save file into a non-existent directory: '\mnt\data'