In [None]:
#installing dependencies
!pip install -q pyspark findspark 
#!pip install pandas
#!pip install seaborn

In [None]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession #initialise spark
from pyspark.sql.functions import col, sum as _sum

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import colormaps
import math

import seaborn as sns

> **⚠️ Prerequisite Notice**  
> This notebook uses Apache Spark via PySpark.  
> Please ensure you have **Java (JDK 8 or 11)** installed and properly configured on your system. Make sure the `JAVA_HOME` environment variable is set correctly and that `java.exe` is accessible in your system PATH.
>  
> Without Java, the Spark session will fail to initialise.



In [None]:
# Build the SparkSession used for every CSV read below
# (getOrCreate returns the existing session if one is already running)
spark = (
    SparkSession.builder
    .appName("Search Trends Analysis")
    .getOrCreate()
)

While the dataset used in this thesis is relatively small and could have been handled entirely within Python's Pandas framework, PySpark was initially selected for its scalability and potential for distributed data processing.

# Google Trends keywords SVI data

In [None]:
# Load both Google Trends exports with Spark.
# header=True reads column names from the first row; inferSchema=True asks Spark to infer column types.
df_kw = spark.read.csv(
    'search-trends-vs-financial-markets/Collected Data/carrefour_search_trends_keywords.csv',
    header=True,
    inferSchema=True,
)
df_agg = spark.read.csv(
    'search-trends-vs-financial-markets/Collected Data/carrefour_search_trends_aggregated.csv',
    header=True,
    inferSchema=True,
)

## Google Trends keywords

In [None]:
# Preview the keyword-level data: first 3 rows, then the inferred schema
df_kw.show(3)
df_kw.printSchema()

Several columns appear to have only zero values; let's drop them and keep a list of these.

In [None]:
# Every column except 'date' is a keyword series that needs the all-zero check
columns_to_check = [name for name in df_kw.columns if name != 'date']

In [None]:
# Split the keyword columns into those that carry data (kept_cols)
# and those whose values sum to 0 or null (dropped_cols)
kept_cols, dropped_cols = [], []

for keyword in columns_to_check:
    try:
        # Backticks let Spark resolve column names containing dots, e.g. `E.Leclerc`
        total = df_kw.select(_sum(col(f"`{keyword}`"))).collect()[0][0]
    except Exception as err:
        # Best effort: a column that cannot be summed is reported and treated as dropped
        print(f"Skipping column '{keyword}' due to error: {err}")
        dropped_cols.append(keyword)
    else:
        if total is None or total == 0:
            dropped_cols.append(keyword)
        else:
            kept_cols.append(keyword)

In [None]:
# Final column set: 'date' first, followed by the keywords that carry data
final_cols = ['date', *kept_cols]

# Project the Spark DataFrame onto those columns; keyword names are backtick-quoted, 'date' is not
df_kw_cleaned = df_kw.select(
    *(col('date') if name == 'date' else col(f"`{name}`") for name in final_cols)
)

In [None]:
# Report which keyword columns were removed (header line, then the list on its own line)
print("Dropped columns (all values were 0 or null):", dropped_cols, sep="\n")

✅ Dropped columns (all values were 0 or null):

`carfour`, `carrefour near me`, `IntermarchÃ©`, `carrefour bourse`.

In [None]:
# Preview the first 3 rows of the filtered keyword data
df_kw_cleaned.show(3)

### EDA

In [None]:
# Convert the cleaned Spark DataFrame to pandas; the rest of the EDA uses the pandas copy
df_kw_pd = df_kw_cleaned.toPandas()

In [None]:
# (rows, columns) of the pandas keyword frame
df_kw_pd.shape

#### 1. Date Handling & Time Index Setup

The `date` is currently in ISO 8601 standard, `yyyy-mm-dd`, making it directly compatible with pandas and the libraries we will use.

However, we will convert the date to a Datetime object to fully utilise the time series functionalities. Lastly, we will set it as the DataFrame index, which converts the DataFrame into a time series for slicing, plotting, and modelling.

In [None]:
# Parse 'date' and promote it to a chronologically sorted DatetimeIndex.
# The guard keeps the cell re-runnable: after the first run 'date' is the index,
# no longer a column, so the unguarded version raised KeyError on a second run.
if 'date' in df_kw_pd.columns:
    df_kw_pd = df_kw_pd.assign(date=pd.to_datetime(df_kw_pd['date'])).set_index('date')
df_kw_pd = df_kw_pd.sort_index()  # earliest to latest

#### 2. Time Series Grid of Keywords

In [None]:
# Grid dimensions: 4 charts per row, with as many rows as needed to fit every keyword
n_keywords = df_kw_pd.shape[1]
n_cols = 4
n_rows = math.ceil(n_keywords / n_cols)

In [None]:
# One small line chart per keyword, placed on the n_rows x n_cols grid
plt.figure(figsize=(n_cols * 4, n_rows * 3))

for position, keyword in enumerate(df_kw_pd.columns, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.plot(df_kw_pd.index, df_kw_pd[keyword], color='teal')
    plt.title(keyword, fontsize=10)
    plt.xticks(rotation=45)

# Compute the layout once, after every subplot exists, instead of on each loop iteration
plt.tight_layout()
plt.suptitle("SVI Trends by Keyword", fontsize=16, y=1.02)
plt.show()


As expected based on the literature review, we observe considerable variability in search popularity over time for most keywords, with episodic peaks in search interest. We can also see the sudden rise in popularity of some keywords over time and the decline of others.

#### 3. Distribution Plot of Interest Scores

In [None]:
# Horizontal boxplot of every keyword's SVI distribution
ax = df_kw_pd.plot.box(vert=False, figsize=(10, 12))
ax.set_title('SVI Distribution', fontweight='bold')
ax.tick_params(axis='x', which='both', labeltop=True)  # repeat the x tick labels along the top edge
ax.grid(axis='x', linestyle=':', linewidth=0.7)
ax.set_xticks(np.arange(0, 110, 10))  # ticks at 0, 10, ..., 100
plt.tight_layout()
plt.show()

#### 4. Statistical Summary

In [None]:
# Descriptive statistics, transposed so that each keyword is one row
summary = df_kw_pd.describe().T

In [None]:
# Extend the summary with dispersion and shape metrics, one value per keyword
summary = summary.assign(**{
    "range": summary["max"] - summary["min"],
    "iqr": summary["75%"] - summary["25%"],  # interquartile range
    "skew": df_kw_pd.skew(),
    "kurtosis": df_kw_pd.kurtosis(),
    "volatility (std/mean)": summary["std"] / summary["mean"],  # coefficient of variation
})

In [None]:
# Display the extended summary table
summary

##### Mean
A few keywords stand above the others with a mean above 60, indicating dominant and sustained search behaviour: 
"carrefour banque" ~73, "lidl" ~68, "carrefour" ~64, "catalogue carrefour"	~62, "carrefour catalogue"	~62, "drive carrefour"	~61, "carrefour drive"	~61

##### Standard Deviation
With a sigma above ~17, the following keywords showcase the highest volatility in search patterns: "catalogue carrefour", "carrefour catalogue", "cora", "foire aux vins carrefour".

##### Range (max-min)
Several FMCG-related and brand name keywords are characterised by wide fluctuations in attention (range above 50): "foire aux vins carrefour", "pizza carrefour", "carrefour", "cora carrefour", "cora", "carrefour market", "leclerc".

##### Skewness and Kurtosis
A few keywords have a negative skew, with only "Auchan catalogue" having a skewness above -1.0. Interestingly, "carrefour livraison Ã domicile", "rappel produit carrefour", "E.Leclerc" and "carrefour recrutement" have a significant positive skewness (all above +5.0), indicating a low search interest with occasional spikes in interest. These same keywords are also the ones with the highest kurtosis, suggesting strong event-driven behaviour. Based on the meaning of the keywords, we can see that this is possibly related to news or exceptional occasions ("rappel produit carrefour" and "E.Leclerc") or seasonal events ("carrefour recrutement").

##### Volatility (std/mean)
Based on the data, we can set the volatility thresholds as follows:
* below 0.20 Low: most of these keywords are brand equity
* 0.20 – 0.50 Medium: keywords here seem to be related to FMCG sales cycles
* 0.50 – 1.0 High: keywords in this group are possibly linked to events as they have high variance in consumer interest.
* above 1.0 Very High: these keywords might be helpful for short-term forecasting or anomaly analysis.

#### 5. Missing Values Analysis
Check how much missing and flat data there is with visuals.

In [None]:
# Count missing values per keyword and keep only the keywords that have any
missing = df_kw_pd.isna().sum().loc[lambda counts: counts > 0]

if missing.empty:
    print("No missing values found.")
else:
    print("Columns with missing values:")
    print(missing)


In [None]:
# Count exact-zero observations per keyword and keep only the keywords that have any
zeros = df_kw_pd.eq(0).sum().loc[lambda counts: counts > 0]

if zeros.empty:
    print("No columns with zeros values found.")
else:
    print("Number of zeros in columns:")
    print(zeros)


Keywords with many zero values might have episodic or accidental search interest; these will be monitored throughout the rest of the EDA.

#### 6. Keyword Correlation Matrix

In [None]:
# Pairwise correlation between keyword series, drawn as a lower-triangle heatmap
corr = df_kw_pd.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))  # True on and above the diagonal -> those cells are hidden

fig, ax = plt.subplots(figsize=(35, 20))
sns.heatmap(
    corr,
    mask=mask,
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    annot=True,
    ax=ax,
)
ax.set_title("Keywords Correlation Matrix")
fig.tight_layout()
plt.show()

In [None]:
# Flatten the correlation matrix into one row per keyword pair.
# Only the strictly-upper triangle (k=1) is kept, so each pair appears once and self-pairs are excluded.
corr_pairs = (
    corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    .stack()
    .reset_index()
    .set_axis(['Keyword1', 'Keyword2', 'Correlation'], axis=1)
)

def categorize_corr(value):
    """Map a correlation coefficient to a strength/sign label.

    Parameters
    ----------
    value : float
        Correlation coefficient, expected in [-1, 1].

    Returns
    -------
    str
        'High Positive'     for value >= 0.8
        'High Negative'     for value <= -0.8
        'Average Positive'  for 0.4 <= value < 0.8
        'Average Negative'  for -0.8 < value <= -0.4
        'Low Positive'      for 0.0 <= value < 0.4
        'Low Negative'      for -0.4 < value < 0.0
        'Undefined'         for NaN (no correlation could be computed)
    """
    # NaN fails every comparison below; label it explicitly rather than
    # letting it fall through to a misleading sign-based category.
    if math.isnan(value):
        return 'Undefined'
    if value >= 0.8:
        return 'High Positive'
    if value <= -0.8:
        return 'High Negative'
    if value >= 0.4:
        return 'Average Positive'
    if value <= -0.4:
        return 'Average Negative'
    if value >= 0.0:
        return 'Low Positive'
    # Only -0.4 < value < 0.0 remains at this point
    return 'Low Negative'

# Label every keyword pair with its correlation category
corr_pairs['Category'] = corr_pairs['Correlation'].map(categorize_corr)

##### HIGH CORRELATION PAIRS

In [None]:
# Keyword pairs labelled as highly correlated, in either direction
high_corr = corr_pairs.loc[
    lambda pairs: pairs['Category'].isin(['High Positive', 'High Negative'])
]
print(high_corr.to_string(index=False))

Highly correlated keywords are all positive indicating a strong linear positive correlation.

* Carrefour and its sub-brands/services show consistently high correlations, suggesting that interest in Carrefour as a brand is strongly tied to its different offerings.
* Consumers frequently search promotions or catalogues in conjunction with drive-related services.
* Strong correlation between Carrefour and competitors, suggesting that users often compare multiple grocery retailers in the same session or buying cycle.
* High correlation between localised formats, consistent with the literature on France's interest in urban and convenience-oriented store formats.

##### AVERAGE CORRELATION PAIRS

Reflect moderately aligned but differentiated consumer behaviours.

In [None]:
# Keyword pairs labelled as moderately correlated, in either direction
avg_corr = corr_pairs.loc[
    lambda pairs: pairs['Category'].isin(['Average Positive', 'Average Negative'])
]
print(avg_corr.to_string(index=False))

##### LOW CORRELATION PAIRS

In [None]:
# Keyword pairs labelled as weakly correlated, in either direction
low_corr = corr_pairs.loc[
    lambda pairs: pairs['Category'].isin(['Low Positive', 'Low Negative'])
]
print(low_corr.to_string(index=False))

**Positive Low Correlations**
* Product-specific keywords (aloe vera, pizza, ongle carrefour) often exhibit isolated behaviours, hinting at niche shopping intent or product-specific campaigns.

**Negative Correlations**
* These pairs indicate weak or diverging search behaviour, which may suggest: niche interest, separate consumer journeys, misalignment in search intent or timing.
* `carefour` (misspelt) has several negative correlations (`carrefour drive` (-0.35), `carrefour promo` (-0.20), `carrefour catalogue` (-0.36)), suggesting noise or irrelevant intent behind this keyword.

##### Conclusions
Based on the descriptive analysis, count of zeros in the columns, and correlation matrix, we can state the following for our analysis:

* `carefour`: Highly noisy and negatively correlated with most Carrefour terms, it will be removed as it may not be relevant. 
* `carrefour livraison à domicile`: Sparse and episodic search behaviour, will be aggregated with `carrefour livraison domicile`.
* `aloe vera carrefour`: Niche product search, best to aggregate it with other FMCG keywords.
* `E.Leclerc`: inconsistent sample, best to remove as there is already the `leclerc` keyword fulfilling the same search intent.
* `carrefour recrutement`: Episodic search; while it possibly follows a seasonal recruitment pattern, it has too little data to provide meaningful insight.
* `rappel produit carrefour`: Event-driven; behaves independently from regular consumer patterns.

In [None]:
# Remove the three keywords ruled out by the conclusions above
df_kw_pd = df_kw_pd.drop(
    ["carefour", "E.Leclerc", "carrefour recrutement"],
    axis="columns",
)

In [None]:
# (rows, columns) after removing the excluded keywords
df_kw_pd.shape

## Google Trends keywords aggregated

Based on search intent, keywords can be aggregated as follows:

> ⚠️ keywords marked as ~~keywords~~ are those dropped after the initial EDA

| Aggregate | Keywords | Justification |
|---|---|---|
| Brand | carrefour, carrefour autour de moi, ~~carrefour near me~~, ~~carfour~~, ~~carefour~~ | Serves as an anchor term to capture general brand interest and visibility. |
| Service and logistics | carrefour drive, drive carrefour, carrefour livraison, carrefour livraison domicile, carrefour livraison Ã domicile | Reflects consumer demand for fulfillment services such as click-and-collect and home delivery, indicating operational engagement. |
| Sub-brand | carrefour market, carrefour city, carrefour express, cora | Provides more granular insight into Carrefour’s diversified retail formats and regional presence. |
| Promo and engagement | carrefour promo, carrefour code promo drive, carrefour catalogue, catalogue carrefour, carrefour fidelite, bon d'achat carrefour | Captures interest in promotions, loyalty programs, and catalogues—key drivers of footfall and conversion in price-sensitive FMCG segments. |
| FMCG products | carrefour produits, carrefour alimentaire, carrefour epicerie, carrefour bio, pizza carrefour, foire aux vins carrefour, ongle carrefour, franck provost carrefour, parfumerie carrefour, aloe vera carrefour | Reflects consumer preferences for specific product categories; interest in organic and beauty items may indicate evolving lifestyle and sustainability trends. |
| Competitors | Auchan, Auchan catalogue, ~~E.Leclerc~~, leclerc, ~~IntermarchÃ©~~, lidl, super u  | Rising interest in competing retailers may signal market share shifts or influence investor sentiment regarding Carrefour. |
| Finance | ~~carrefour bourse~~, ~~carrefour recrutement~~, carrefour credit, carrefour assurance, action carrefour, carrefour banque, carrefour anti crise | Indicates public engagement with Carrefour’s financial operations, job market relevance, and economic resilience. |
| News  | fermeture carrefour, rappel produit carrefour, cora carrefour | Tracks external news-driven factors, including store closures and product recalls, which may impact consumer trust or financial outlook. |

In [None]:
# Preview the pre-aggregated CSV: first 5 rows, then the inferred schema
df_agg.show(5)
df_agg.printSchema()

In [None]:
# Convert the pre-aggregated Spark DataFrame to pandas.
# The trailing underscore distinguishes it from df_agg_pd, which is built later from df_kw_pd.
df_agg_pd_ = df_agg.toPandas()

In [None]:
# Parse 'date' (with dayfirst=True) and use it as the index.
# The guard keeps the cell re-runnable: once 'date' is the index it is no longer
# a column, so the unguarded version raised KeyError on a second run.
if 'date' in df_agg_pd_.columns:
    df_agg_pd_ = df_agg_pd_.assign(
        date=pd.to_datetime(df_agg_pd_['date'], dayfirst=True)
    ).set_index('date')

In [None]:
# One line per column of the pre-aggregated CSV, all on a shared axis
fig, ax = plt.subplots(figsize=(16, 8))

for keyword, series in df_agg_pd_.items():
    ax.plot(series.index, series, label=keyword, linewidth=2)

ax.set_title("Search Interest Over Time by Aggregated Keywords", fontsize=14)
ax.set_xlabel("Date")
ax.set_ylabel("Google SVI (0–100)")
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)  # legend placed outside the plot area
fig.tight_layout()
ax.grid(True)
plt.show()

The aggregation of the SVI data in the `carrefour_search_trends_aggregated.csv` was initially performed at the data collection stage in the following manner: 
- one column for each aggregated category (as defined in the table above);
- each row represents a time period with weekly frequency;
- each value is the sum of the SVI values for all keywords belonging to that category at that point in time.

However, this initial aggregation method is methodologically unsatisfactory. In Google Trends, each keyword's SVI is scaled individually (where 0 represents the lowest relative search interest and 100 the peak relative interest within the selected time range); therefore, summing across keywords combines values on different scales. As such, we risk overemphasising categories with more keywords and introducing bias if some keywords exhibit greater volatility than others.

To enhance the interpretability and methodological robustness of the analysis, alternative aggregation techniques should be considered:

| Aggregation Method   | Analysis   | 
|:---|:---|
| Simple mean | Assigns equal weight to all keywords, avoids keyword-count bias, and is easy to interpret. |
| Weighted mean | Offers higher accuracy if reliable weights (e.g., based on historical correlation or relevance) are available. |
| Z-score normalised mean | Standardises keyword volatility and expresses interest relative to each keyword’s historical mean. |
| Median | More robust to outliers and episodic spikes, especially useful with erratic or sparse search data. |
| Principal component aggregation (PCA) | Extracts the dominant shared pattern across keywords, ideal when a common driver is expected. |
| Maximum value (peak interest) | Highlights the most significant surge in attention per period, suitable for tracking event-driven spikes. |
| Frequency-based binary aggregation | Converts SVIs into binary indicators (e.g., 1 if above threshold), capturing the breadth of search interest per category. |

Using the mean as the aggregation method is likely the most appropriate option, as it mitigates the bias of differing keyword counts while also offering an intuitive measure of category-level search interest. However, to avoid weight bias, keywords with almost equal search intent will be aggregated separately first; these are:
* "carrefour drive", "drive carrefour"
* "carrefour livraison", "carrefour livraison domicile", "carrefour livraison Ã domicile"
* "carrefour promo", "carrefour code promo drive"
* "carrefour catalogue", "catalogue carrefour"
* "Auchan", "Auchan catalogue"

In [None]:
# Collapse near-duplicate keywords into one series each (row-wise mean of the members),
# so that a search intent covered by several keywords is not over-weighted later on
df_kw_pd = df_kw_pd.assign(
    c_drive=df_kw_pd[["carrefour drive", "drive carrefour"]].mean(axis=1),
    c_livraison=df_kw_pd[
        ["carrefour livraison", "carrefour livraison domicile", "carrefour livraison Ã domicile"]
    ].mean(axis=1),
    promo=df_kw_pd[["carrefour promo", "carrefour code promo drive"]].mean(axis=1),
    catalogue=df_kw_pd[["carrefour catalogue", "catalogue carrefour"]].mean(axis=1),
    auchan=df_kw_pd[["Auchan", "Auchan catalogue"]].mean(axis=1),
)

In [None]:
# Start the aggregated frame from df_kw_pd without the keywords that were just merged;
# df_kw_pd itself keeps them
df_agg_pd = df_kw_pd.drop(
    [
        "carrefour drive", "drive carrefour",
        "carrefour livraison", "carrefour livraison domicile", "carrefour livraison Ã domicile",
        "carrefour promo", "carrefour code promo drive",
        "carrefour catalogue", "catalogue carrefour",
        "Auchan", "Auchan catalogue",
    ],
    axis="columns",
)

In [None]:
# (rows, columns) after replacing the near-duplicate keywords with their merged series
df_agg_pd.shape

In [None]:
# Column names still present before the category-level aggregation
df_agg_pd.columns

In [None]:
# Category-level SVI: the row-wise mean of each category's member series, read from df_kw_pd.
# Note: df_agg_pd already contains a keyword-level "promo" column, so the "promo" entry
# below replaces it in place with the category mean.
df_agg_pd = df_agg_pd.assign(**{
    "brand": df_kw_pd[["carrefour", "carrefour autour de moi"]].mean(axis=1),
    "service": df_kw_pd[["c_drive", "c_livraison"]].mean(axis=1),
    "sub-brand": df_kw_pd[["carrefour market", "carrefour city", "carrefour express", "cora"]].mean(axis=1),
    "promo": df_kw_pd[["promo", "catalogue", "carrefour fidelite", "bon d'achat carrefour"]].mean(axis=1),
    "fmcg": df_kw_pd[[
        "carrefour produits", "carrefour alimentaire", "carrefour epicerie", "carrefour bio",
        "pizza carrefour", "foire aux vins carrefour", "ongle carrefour",
        "franck provost carrefour", "parfumerie carrefour", "aloe vera carrefour",
    ]].mean(axis=1),
    "competitors": df_kw_pd[["auchan", "leclerc", "lidl", "super u"]].mean(axis=1),
    "finance": df_kw_pd[[
        "carrefour credit", "carrefour assurance", "action carrefour",
        "carrefour banque", "carrefour anti crise",
    ]].mean(axis=1),
    "news": df_kw_pd[["fermeture carrefour", "rappel produit carrefour", "cora carrefour"]].mean(axis=1),
})

In [None]:
# Keep only the category-level series: drop every keyword-level column that has
# been folded into a category.
# "promo" is deliberately NOT in this list. By this point df_agg_pd["promo"] holds the
# "Promo and engagement" category mean (it overwrote the merged keyword column of the
# same name), so dropping it would remove that category from the final frame.
df_agg_final = df_agg_pd.drop(columns=[
    "carrefour", "carrefour autour de moi", "carrefour city",
    "carrefour express", "carrefour market", "cora", "bon d'achat carrefour",
    "carrefour fidelite", "carrefour alimentaire", "carrefour bio", "carrefour epicerie", "carrefour produits",
    "pizza carrefour", "aloe vera carrefour", "foire aux vins carrefour",
    "franck provost carrefour", "ongle carrefour", "parfumerie carrefour",
    "leclerc", "carrefour credit", "lidl", "super u", "action carrefour",
    "carrefour anti crise", "carrefour assurance", "carrefour banque",
    "fermeture carrefour", "cora carrefour", "rappel produit carrefour",
    "c_drive", "c_livraison", "catalogue", "auchan"
])


In [None]:
# (rows, columns) of the category-level frame
df_agg_final.shape

In [None]:
# Column names of the category-level frame
df_agg_final.columns

### EDA Aggregated DataFrame df_agg_final

##### 1. Statistical Summary

In [None]:
# Descriptive statistics, transposed so that each column of df_agg_final is one row
df_agg_final.describe().T

##### 2. Correlation Matrix 

In [None]:
# Pairwise correlation between the aggregated series, drawn as a lower-triangle heatmap.
# `corr` and `mask` are rebound here and no longer refer to the keyword-level matrix.
corr = df_agg_final.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))  # True on and above the diagonal -> those cells are hidden

fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(
    corr,
    mask=mask,
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    annot=True,
    ax=ax,
)
ax.set_title("Aggregated Keywords Correlation Matrix")
fig.tight_layout()
plt.show()

##### 3. Boxplots

In [None]:
# Horizontal boxplot of the aggregated SVI distributions
ax = df_agg_final.plot.box(vert=False, figsize=(6, 3), title='SVI Aggregated Distribution')
ax.tick_params(axis='x', which='both', labeltop=True)  # repeat the x tick labels along the top edge
ax.grid(axis='x', linestyle=':', linewidth=0.7)
ax.set_xticks(np.arange(0, 70, 5))  # ticks at 0, 5, ..., 65
plt.tight_layout()
plt.show()

##### 4. Time Series Trends

In [None]:
# One line per aggregated series, all on a shared axis
plt.figure(figsize=(12, 4))

# Iterate over the columns directly; the enumerate() index was never used
for keyword in df_agg_final.columns:
    plt.plot(df_agg_final.index, df_agg_final[keyword], label=keyword)

plt.title("Search Interest Over Time by Aggregated Keyword", fontsize=11)
plt.xlabel("Date")
plt.ylabel("Google SVI (0–100)")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=8)  # legend placed outside the plot area
plt.tight_layout()
plt.grid(True)
plt.show()

# Carrefour France stock data
This data was collected at daily and weekly frequencies.

In [None]:
# Load the daily and weekly stock exports with Spark (header row present, column types inferred)
df_fin = spark.read.csv(
    'search-trends-vs-financial-markets/Collected Data/carrefour_stock_data.csv',
    header=True,
    inferSchema=True,
)
df_wfin = spark.read.csv(
    'search-trends-vs-financial-markets/Collected Data/carrefour_stock_weekly.csv',
    header=True,
    inferSchema=True,
)

#### Daily stocks

In [None]:
# Preview the daily stock data: first 5 rows, then the inferred schema
df_fin.show(5)
df_fin.printSchema()

#### Weekly stocks

In [None]:
# Preview the weekly stock data: first 5 rows, then the inferred schema
df_wfin.show(5)
df_wfin.printSchema()

##### Convert PySpark DataFrame to Pandas to analyse 

In [None]:
# Convert the weekly stock data from a Spark DataFrame to pandas.
# The name df_wfin is reused for the pandas frame, so the conversion is skipped when the
# cell is re-run (a pandas DataFrame has no .toPandas()).
if not isinstance(df_wfin, pd.DataFrame):
    df_wfin = df_wfin.toPandas()

In [None]:
# (rows, columns) of the weekly stock frame
df_wfin.shape

### The time issue. 
Upon reviewing the collected datasets, a discrepancy in date labelling was identified between the weekly stock data from `yfinance` and the Google Trends keyword data. To facilitate data comparison and accurate time-series modelling, all datasets must share a standard, synchronised timeframe.

1. *Stock Market Data*

The weekly stock data from `yfinance` uses Monday as the label for each weekly observation. However, each row in the dataset represents the week ending Friday, but is indexed by the preceding Monday.

2. *Google Trends Data*

Google Trends aggregates search interest weekly, with each week's data point labelled by Sunday, the end of the search week.


This results in a misalignment between:
* Stock closing prices (on Friday, labelled as Monday),
* Search volume data (ending Sunday).

The solution to align both datasets:
* Google Trends dates will be shifted −2 days (from Sunday → Friday) to represent the end of the same week as the stock market.
* Stock data dates will be shifted +4 days (from Monday → Friday) to reflect the actual trading day.

This ensures that both data sources are indexed by the same Friday date, making them directly comparable for all subsequent analysis.

##### Adjust Google Trends dates (from Sunday to Friday)

In [None]:
# Move every Google Trends date label back by 2 days.
# NOTE: not idempotent - re-running this cell shifts the dates by a further 2 days.
df_agg_final = df_agg_final.set_axis(df_agg_final.index - pd.Timedelta(days=2), axis=0)

##### Adjust Stock Data dates (from Monday to Friday)

In [None]:
# Parse 'Date' and move every weekly stock label forward by 4 days
df_wfin = df_wfin.assign(Date=pd.to_datetime(df_wfin['Date']) + pd.Timedelta(days=4))

In [None]:
# Use the shifted 'Date' column as the index of the weekly stock frame
df_wfin = df_wfin.set_index('Date')

In [None]:
# First and last date of each dataset, to compare their coverage
for label, frame in (('Google Trends', df_agg_final), ('yFinance', df_wfin)):
    print(label)
    print(frame.index.min(), frame.index.max())

There is a date range difference between the two datasets; the `df_agg_final` starts 2 weeks earlier and finishes one week earlier than `df_wfin`.

In [None]:
# Trim both frames by position so that they cover the same weeks.
# NOTE: positional, so re-running this cell removes further rows.

# Google Trends: everything except the first two rows
df_agg_final = df_agg_final.tail(-2)

# Stock data: everything except the last row
df_wfin = df_wfin.head(-1)

In [None]:
# Re-check the first and last date of each dataset after trimming
for label, frame in (('Google Trends', df_agg_final), ('yFinance', df_wfin)):
    print(label)
    print(frame.index.min(), frame.index.max())

##### 1. Statistical Summary

In [None]:
# Column dtypes and non-null counts of the weekly stock frame
df_wfin.info()

In [None]:
# Descriptive statistics, transposed so that each stock column is one row
df_wfin.describe().T

##### 2. Close price over time

In [None]:
# Weekly closing price as a single line chart
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df_wfin.index, df_wfin['Close'], label='Close Price', color='violet')
ax.set_title("Weekly Close Price Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Price (€)")
ax.grid(True)
fig.tight_layout()
plt.show()

##### 3. Volume over time

In [None]:
# Weekly trading volume as a single line chart
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df_wfin.index, df_wfin['Volume'], label='Volume', color='orange')
ax.set_title("Weekly Trading Volume")
ax.set_xlabel("Date")
ax.set_ylabel("Volume")
ax.grid(True)
fig.tight_layout()
plt.show()

##### 4. Rolling mean

In [None]:
# 4-week rolling mean and standard deviation of the weekly close
df_wfin = df_wfin.assign(
    rolling_mean=df_wfin['Close'].rolling(window=4).mean(),
    rolling_std=df_wfin['Close'].rolling(window=4).std(),
)

# Overlay the rolling mean on the closing price
ax = df_wfin[['Close', 'rolling_mean']].plot(figsize=(10,4), title='Close Price with Rolling Mean (4 weeks)')
ax.grid(True)
plt.tight_layout()
plt.show()