In [1]:
#installing dependancies
!pip install -q pyspark findspark 

In [2]:
import findspark
# Runs before `import pyspark` below; findspark.init() is what makes the local
# Spark installation importable as a regular library (see the findspark README).
findspark.init()

import pyspark
from pyspark.sql import SparkSession # builder used in the next cell to create the Spark session

> **⚠️ Prerequisite Notice**  
> This notebook uses Apache Spark via PySpark.  
> Please ensure you have **Java (JDK 8 or 11)** installed and properly configured on your system. Make sure the `JAVA_HOME` environment variable is set correctly and that `java.exe` is accessible in your system PATH.
>  
> Without Java, the Spark session will fail to initialise.



In [3]:
# Build the notebook's SparkSession under the application name below
spark = (
    SparkSession.builder
    .appName("Search Trends Analysis")
    .getOrCreate()
)

In [4]:
#clone the GitHub repo
!git clone https://github.com/jademene/search-trends-vs-financial-markets.git

fatal: destination path 'search-trends-vs-financial-markets' already exists and is not an empty directory.


# Google Trends keywords SVI data

In [18]:
# Load the two Google Trends CSV files with Spark.
# header=True: the first row supplies the column names.
# inferSchema=True: Spark infers each column's type automatically.
df_kw = spark.read.csv(
    'search-trends-vs-financial-markets/Collected Data/carrefour_search_trends_keywords.csv',
    header=True,
    inferSchema=True,
)
df_agg = spark.read.csv(
    'search-trends-vs-financial-markets/Collected Data/carrefour_search_trends_aggregated.csv',
    header=True,
    inferSchema=True,
)

#### Google Trends keywords

In [6]:
# Preview the first 5 rows of the per-keyword data, then print its schema
df_kw.show(n=5)
df_kw.printSchema()

+----------+--------+-------+---------+-----------------------+-----------------+---------------+-------------------+----------------------------+------------------------------+---------------+--------------+-----------------+----------------+---------------+----+---------------------+-------------------+--------------------------+------------------+-------------------+---------------------+-------------+------------------+------------------+---------------+-------------------+------------------------+------------------------+---------------+--------------------+------+----------------+---------+------------+-------+----------------+----------------+---------------------+----+-------+----------------+--------------------+-------------------+----------------+-------------------+--------------+------------------------+
|      date|carefour|carfour|carrefour|carrefour autour de moi|carrefour near me|carrefour drive|carrefour livraison|carrefour livraison domicile|carrefour livraison Ã dom

Several columns contain only 0 values; let's drop them and keep a list of the dropped column names.

In [7]:
from pyspark.sql.functions import col, sum as _sum

In [8]:
# Every column except 'date' is a candidate for the all-zero check
columns_to_check = [name for name in df_kw.columns if name != 'date']

In [9]:
kept_cols = []
dropped_cols = []

# Total every candidate column. Names are wrapped in backticks inside col() so that
# dotted names such as `E.Leclerc` are treated as plain column names.
try:
    # Fast path: one aggregation job returns all the totals at once, instead of
    # launching a separate Spark job per column. agg() rejects an empty expression
    # list, hence the guard.
    if columns_to_check:
        totals_row = df_kw.agg(*[_sum(col(f"`{c}`")) for c in columns_to_check]).first()
    else:
        totals_row = ()
    col_totals = list(zip(columns_to_check, totals_row))
except Exception as batch_error:
    # Best-effort fallback: a single bad column must not abort the whole check, so
    # retry column by column and treat any column that still fails as having no data.
    print(f"Batched aggregation failed ({batch_error}); retrying one column at a time")
    col_totals = []
    for c in columns_to_check:
        try:
            col_totals.append((c, df_kw.select(_sum(col(f"`{c}`"))).collect()[0][0]))
        except Exception as e:
            print(f"Skipping column '{c}' due to error: {e}")
            col_totals.append((c, None))

# A total of 0, or None (only nulls, or a column that could not be summed),
# sends the column to the dropped list; everything else is kept.
for c, col_sum in col_totals:
    if col_sum == 0 or col_sum is None:
        dropped_cols.append(c)
    else:
        kept_cols.append(c)

In [10]:
# 'date' comes first, followed by the keyword columns that survived the check
final_cols = ['date', *kept_cols]

# Project the raw frame onto those columns; keyword names are backtick-quoted
# so that names containing dots are read as plain column names
df_kw_cleaned = df_kw.select(
    col('date'),
    *(col(f"`{name}`") for name in kept_cols),
)

In [19]:
# Report which keyword columns were removed (header line, then the list)
print("Dropped columns (all values were 0 or null):", dropped_cols, sep="\n")

Dropped columns (all values were 0 or null):
['carfour', 'carrefour near me', 'IntermarchÃ©', 'carrefour bourse']


✅ Dropped columns (all values were 0 or null):

`carfour`, `carrefour near me`, `IntermarchÃ©`, `carrefour bourse`.

In [12]:
# Preview the first 3 rows of the cleaned keyword data
df_kw_cleaned.show(n=3)

+----------+--------+---------+-----------------------+---------------+-------------------+----------------------------+------------------------------+---------------+--------------+-----------------+----------------+---------------+----+---------------------+-------------------+--------------------------+------------------+-------------------+---------------------+-------------+------------------+------------------+---------------+-------------------+------------------------+------------------------+---------------+--------------------+------+----------------+---------+-------+----------------+---------------------+----+-------+----------------+--------------------+-------------------+----------------+-------------------+--------------+------------------------+
|      date|carefour|carrefour|carrefour autour de moi|carrefour drive|carrefour livraison|carrefour livraison domicile|carrefour livraison Ã domicile|drive carrefour|carrefour city|carrefour express|carrefour market|carrefour 

#### Google Trends keywords aggregated

Based on search intent, keywords can be aggregated as follows:

> ⚠️ keywords marked as ~~keywords~~ are those previously identified as "to drop"

| Aggregate | Keywords | Justification |
|---|---|---|
| Brand | carrefour, carrefour autour de moi, ~~carrefour near me~~, ~~carfour~~, carefour | Serves as an anchor term to capture general brand interest and visibility. |
| Service and logistics | carrefour drive, drive carrefour, carrefour livraison, carrefour livraison domicile, carrefour livraison Ã domicile | Reflects consumer demand for fulfillment services such as click-and-collect and home delivery, indicating operational engagement. |
| Sub-brand | carrefour market, carrefour city, carrefour express, cora | Provides more granular insight into Carrefour’s diversified retail formats and regional presence. |
| Promo and engagement | carrefour promo, carrefour code promo drive, carrefour catalogue, catalogue carrefour, carrefour fidelite, bon d'achat carrefour | Captures interest in promotions, loyalty programs, and catalogues—key drivers of footfall and conversion in price-sensitive FMCG segments. |
| FMCG products | carrefour produits, carrefour alimentaire, carrefour epicerie, carrefour bio, pizza carrefour, foire aux vins carrefour, ongle carrefour, franck provost carrefour, parfumerie carrefour, aloe vera carrefour | Reflects consumer preferences for specific product categories; interest in organic and beauty items may indicate evolving lifestyle and sustainability trends. |
| Competitors | Auchan, Auchan catalogue, E.Leclerc, leclerc, ~~IntermarchÃ©~~, lidl, super u  | Rising interest in competing retailers may signal market share shifts or influence investor sentiment regarding Carrefour. |
| Finance | ~~carrefour bourse~~, carrefour recrutement, carrefour credit, carrefour assurance, action carrefour, carrefour banque, carrefour anti crise | Indicates public engagement with Carrefour’s financial operations, job market relevance, and economic resilience. |
| News  | fermeture carrefour, rappel produit carrefour, cora carrefour | Tracks external news-driven factors, including store closures and product recalls, which may impact consumer trust or financial outlook. |


In [None]:
# Preview the first 5 rows of the aggregated (per-category) data, then print its schema
df_agg.show(n=5)
df_agg.printSchema()

The aggregation of the SVI data in the `carrefour_search_trends_aggregated.csv` was initially performed at the data collection stage in the following manner: 
- one column for each aggregated category (as defined in the table above);
- each row represents a time period with weekly frequency;
- each value is the sum of the SVI values for all keywords belonging to that category at that point in time.

However, this initial aggregation method is methodologically unsatisfactory. In Google Trends, each keyword's SVI is scaled individually (where 0 represents the lowest relative search interest and 100 the peak relative interest within the selected time range). Summing across keywords therefore combines values that sit on different scales. As such, we risk overemphasising categories with more keywords and introducing bias if some keywords exhibit greater volatility than others.

To enhance the interpretability and methodological robustness of the analysis, alternative aggregation techniques should be considered:

| Aggregation Method   | Analysis   | 
|:---|:---|
| Simple mean | Assigns equal weight to all keywords, avoids keyword-count bias, and is easy to interpret. |
| Weighted mean | Offers higher accuracy if reliable weights (e.g., based on historical correlation or relevance) are available. |
| Z-score normalised mean | Standardises keyword volatility and expresses interest relative to each keyword’s historical mean. |
| Median | More robust to outliers and episodic spikes, especially useful with erratic or sparse search data. |
| Principal component aggregation (PCA) | Extracts the dominant shared pattern across keywords, ideal when a common driver is expected. |
| Maximum value (peak interest) | Highlights the most significant surge in attention per period, suitable for tracking event-driven spikes. |
| Frequency-based binary aggregation | Converts SVIs into binary indicators (e.g., 1 if above threshold), capturing the breadth of search interest per category. |

Using the mean as the aggregation method is likely to be the most appropriate option, as it mitigates the bias of differing keyword counts whilst also offering an intuitive measure of category-level search interest.

# Carrefour France stock data
This data was collected at both daily and weekly frequencies.

In [13]:
# Load the daily and weekly Carrefour stock CSV files with Spark
# (column names come from the header row; column types are inferred)
df_fin = spark.read.csv(
    'search-trends-vs-financial-markets/Collected Data/carrefour_stock_data.csv',
    header=True,
    inferSchema=True,
)
df_wfin = spark.read.csv(
    'search-trends-vs-financial-markets/Collected Data/carrefour_stock_weekly.csv',
    header=True,
    inferSchema=True,
)

#### Daily stocks

In [17]:
# Preview the first 5 rows of the daily stock data, then print its schema
df_fin.show(n=5)
df_fin.printSchema()

+-------------------+------------------+------------------+------------------+------------------+-------+---------+------------+
|               Date|              Open|              High|               Low|             Close| Volume|Dividends|Stock Splits|
+-------------------+------------------+------------------+------------------+------------------+-------+---------+------------+
|2022-01-03 00:00:00|14.412475049746167| 14.45706742372177|14.251939781689956|14.376801490783691|1539521|      0.0|         0.0|
|2022-01-04 00:00:00|14.452608499187328|14.639900207519531|14.381259681986961|14.639900207519531|1921444|      0.0|         0.0|
|2022-01-05 00:00:00|14.622063598058403|15.607569735983143|14.586388336424337|15.384604454040527|7593876|      0.0|         0.0|
|2022-01-06 00:00:00| 15.78594174663718|16.490511113047717|15.607568850763876|16.352272033691406|9738108|      0.0|         0.0|
|2022-01-07 00:00:00| 16.31213981324636|16.624289827380004|16.160523363057166| 16.37010955810547|

#### Weekly stocks

In [15]:
# Preview the first 5 rows of the weekly stock data, then print its schema
df_wfin.show(n=5)
df_wfin.printSchema()

+-------------------+------------------+------------------+------------------+------------------+--------+---------+------------+
|               Date|              Open|              High|               Low|             Close|  Volume|Dividends|Stock Splits|
+-------------------+------------------+------------------+------------------+------------------+--------+---------+------------+
|2022-01-03 00:00:00|  14.4124747784588|16.624289827380004| 14.25193951342436| 16.37010955810547|25331076|      0.0|         0.0|
|2022-01-10 00:00:00| 16.33889431434613| 16.45929576133611| 15.57189325993142| 15.75472640991211|18968388|      0.0|         0.0|
|2022-01-17 00:00:00|15.821616593224354|16.089175958816803|15.384604833955846|15.661081314086914|16451615|      0.0|         0.0|
|2022-01-24 00:00:00|15.696755275388963|16.098091738872853|15.215149477900288|  15.9375581741333|19400666|      0.0|         0.0|
|2022-01-31 00:00:00| 15.81269534575529|15.843910345110322|14.604224081896566|14.956509590

### The time issue. 
When looking at the data collected we notice a discrepancy in date alignment. To allow the comparison of data we will need all the dataframes to share a common timeframe with matching entries.