## Google Trends Related Queries Parser, Merger, and Cleaner (B Component)

In [1]:
# ============================================================
# Google Trends Related Queries Parser, Merger, and Cleaner (B Component)
# ------------------------------------------------------------
# This script parses, cleans, and merges multiple Google Trends
# "Related Queries" text exports (B1 and B2 sub-components) and produces
# a single merged dataset, prepared for tagging and use in model training,
# containing:
#   - Query text
#   - Date
#   - Keyword (search term)
#
# Notes:
# - Final text is normalized using the hazm Normalizer
# ============================================================
import csv
import os
from datetime import datetime

import hazm
import pandas as pd

# ============================================================
# Configuration
# ============================================================
# Directory containing the related-query export files downloaded from
# Google Trends. Only files whose names start with "relatedQueries" are read.
INPUT_DIR = "sample-related-query"

# Path of the final merged and cleaned CSV file written at the end of the run.
output_file = "parsed-merged_cleaned_queries.csv"

# ============================================================
# A function for cleaning and standardizing text (B2 sub-component)
# ============================================================
# Single shared Hazm normalizer, created once and reused for every call.
normalizer = hazm.Normalizer()


def normalize_text(text: str) -> str:
    """Trim surrounding whitespace from *text*, then apply the Hazm normalizer.

    Args:
        text: Raw text to clean.

    Returns:
        The whitespace-trimmed text as returned by ``normalizer.normalize``.
    """
    trimmed = text.strip()
    return normalizer.normalize(trimmed)

# ============================================================
# Storage for merged results
# ============================================================
merged_rows = []

# ============================================================
# Helpers for parsing a single export file (B1 sub-component)
# ============================================================
# Lines that open a query section inside an export file.
SECTION_MARKERS = ("TOP", "RISING")


def _parse_meta_line(meta_line):
    """Return ``(keyword, start_date)`` extracted from the metadata line.

    The expected shape is ``"KeyWord: (1/1/20 - 7/29/24, Worldwide)"``.
    The keyword is the text before the first colon with double quotes
    removed; the start date is the first date of the range, reformatted as
    ``YYYY-MM-DD``.

    Raises:
        ValueError: If the line has no "(" group or the start date does not
            match ``%m/%d/%y``.
    """
    keyword = meta_line.split(":")[0].replace('"', '').strip()

    # Use the LAST "(" so a parenthesis inside the keyword itself cannot be
    # mistaken for the start of the date range.
    _, paren, date_range = meta_line.rpartition("(")
    if not paren:
        raise ValueError(f"no date range found in {meta_line!r}")

    start_text = date_range.split("-")[0].strip()
    start_date = datetime.strptime(start_text, "%m/%d/%y")
    return keyword, start_date.strftime("%Y-%m-%d")


def _first_csv_field(line):
    """Return the first CSV field of *line*.

    Uses the ``csv`` module so that a quoted query containing a comma
    (``"foo, bar",100``) is returned whole and without its quotes, instead
    of being cut at the first comma.
    """
    try:
        row = next(csv.reader([line]), [])
    except csv.Error:
        # Fall back to the plain split for lines the csv parser rejects.
        return line.split(",")[0]
    return row[0] if row else ""


# ============================================================
# Iterate over all relatedQueries files
# ============================================================
# sorted() makes the processing order (and therefore the row order of the
# merged dataset) reproducible; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(INPUT_DIR)):

    if not filename.startswith("relatedQueries"):
        continue

    file_path = os.path.join(INPUT_DIR, filename)

    with open(file_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    # Skip empty or invalid files
    if len(lines) < 5:
        print(f"Skipped empty file: {filename}")
        continue

    # --------------------------------------------------------
    # Parse, clean, and standardize the date and extract keywords from line 2 (B1 and B2 sub-component)
    # Example:
    # "KeyWord: (1/1/20 - 7/29/24, Worldwide)"
    # --------------------------------------------------------
    try:
        keyword, scrape_date = _parse_meta_line(lines[1])
    except ValueError as exc:
        # One malformed file should not abort the whole batch; report it and
        # move on, like the empty-file case above.
        print(f"Skipped file with unparsable metadata: {filename} ({exc})")
        continue

    # --------------------------------------------------------
    # Parse query lines (B1 sub-component)
    # --------------------------------------------------------
    current_section = None

    for line in lines:

        # Ignore empty lines
        if not line:
            continue

        # Section markers switch the current section and carry no data
        if line in SECTION_MARKERS:
            current_section = line
            continue

        # Ignore metadata lines
        if ":" in line and "(" in line:
            continue

        # Ignore non-data lines before the first section marker
        if current_section not in SECTION_MARKERS:
            continue

        # ----------------------------------------------------
        # cleaning and standardization QueryText (B2 sub-component)
        # ----------------------------------------------------
        query_text = normalize_text(_first_csv_field(line))

        if not query_text:
            continue

        merged_rows.append({
            "Query": query_text,
            "Date": scrape_date,
            "Keyword": keyword
        })

    print(f"Parsed: {filename}")

# ============================================================
# Build final merged DataFrame (B1 sub-component)
# ============================================================
# Fix the column order explicitly so the CSV layout does not depend on the
# key order of the row dictionaries.
merged_columns = ["Query", "Date", "Keyword"]
df = pd.DataFrame(merged_rows, columns=merged_columns)

total_queries = len(df)
print("\nParsing completed.")
print(f"Total extracted queries: {total_queries}")

# ============================================================
# Save merged dataset (B1 sub-component)
# ============================================================
# "utf-8-sig" prepends a byte-order mark to the file; presumably chosen so
# spreadsheet tools detect the encoding of the Persian text -- confirm.
df.to_csv(output_file, index=False, encoding="utf-8-sig")

print(f"Merged dataset saved to: {output_file}")


Parsed: relatedQueries (11).csv
Parsed: relatedQueries (12).csv
Parsed: relatedQueries (13).csv
Parsed: relatedQueries (14).csv
Parsed: relatedQueries (15).csv

Parsing completed.
Total extracted queries: 221
Merged dataset saved to: parsed-merged_cleaned_queries.csv
