In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
sns.set_style(style="whitegrid")

# The EV dataset is fetched through kagglehub; the call below returns the
# local directory that contains the downloaded files.

import kagglehub

# Download latest version
path = kagglehub.dataset_download("patricklford/global-ev-sales-2010-2024")

print(f"Path to dataset files: {path}")

# Step 1: Analyze and visualize the EV stock share from 2010 - 2023

## Initialize the pandas DataFrame

In [None]:
# Locate the IEA CSV inside the downloaded dataset directory and load it.
dataset_name = f"{path}/IEA Global EV Data 2024.csv"
pandas_frame = pd.read_csv(filepath_or_buffer=dataset_name)

In [None]:
# Preview the first 100 rows of the raw EV data.
pandas_frame.head(n=100)

## Filter the DataFrame for the relevant metric (EV stock share)

Filter the DataFrame for the EV stock share metric (cars only) and drop the columns that are not relevant for the further calculations.

In [None]:
# Build the EV table in one pass:
#   1. keep rows for the 'EV stock share' parameter of passenger cars,
#   2. keep only region / year / value,
#   3. collapse repeated (year, region) pairs to their last occurrence,
#   4. stop at 2023, the last year covered by the air pollution data.
electric_vehicle_frame = (
    pandas_frame
    .loc[
        lambda frame: (frame['parameter'] == 'EV stock share') & (frame['mode'] == 'Cars'),
        ['region', 'year', 'value'],
    ]
    .drop_duplicates(subset=['year', 'region'], keep='last')
    .loc[lambda frame: frame['year'] <= 2023]
)

In [None]:
# Preview the first 10 rows of the filtered EV stock share table.
electric_vehicle_frame.head(n=10)

## Visualize the EV stock share over the years

In [None]:
# Wide layout for plotting: one row per year, one column per region.
pivot_df = electric_vehicle_frame.pivot(index='year', columns='region', values='value')

fig, ax = plt.subplots(figsize=(15, 8))

# Draw one line per region.
for region_name, share_by_year in pivot_df.items():
    ax.plot(share_by_year.index, share_by_year, marker='o', label=region_name)

ax.set_title('EV Stock Share by Country (2010-2023)', fontsize=14)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('EV Stock Share', fontsize=12)
# Put the legend outside the axes so it does not cover the lines.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()

# Step 2: Analyze and visualize the Air-Quality-Data from 2010 - 2023

In [None]:
# Load the 2018-2023 air-quality JSON from the attached Kaggle input dataset.
dataset_name = "/kaggle/input/air-quality-2018-2024-iqair-ag/air-quality-2018-2023.json"
air_quality_frame = pd.read_json(path_or_buf=dataset_name)

In [None]:
# Preview the first 10 rows of the raw air-quality data.
air_quality_frame.head(n=10)

## Filter the DataFrame for the relevant columns (country, 2018, 2019, ...)

In [None]:
# Keep the country name plus the yearly averages for 2018-2023, and strip the
# 'avg' prefix so the remaining column names are plain year strings
# ('2018' ... '2023').
air_quality_frame = (
    air_quality_frame[['country', 'avg2018', 'avg2019', 'avg2020', 'avg2021', 'avg2022', 'avg2023']]
    .rename(columns=lambda name: name.replace('avg', ''))
)

# Regions for which EV stock share data is available
ev_countries = electric_vehicle_frame['region'].unique()

# Keep only the countries we have EV data about. The explicit .copy() makes the
# result an independent frame, so adding year columns to it later does not
# raise pandas' SettingWithCopyWarning.
air_quality_frame = air_quality_frame[air_quality_frame['country'].isin(ev_countries)].copy()

In [None]:
# Preview the first 10 rows of the filtered air-quality table.
air_quality_frame.head(n=10)

## Append datasets from 2010-2017 to the 2018-2023 dataset

In [None]:
# Second dataset processing (2010-2017)
#
# For every year 2010-2017 a WHO CSV is read and its value is attached to the
# matching country of air_quality_frame as a new column named after the year.
# Countries without a WHO value for a year get NaN (not 0.0): a zero would be
# drawn as a real measurement in the plots and would bias the correlations.

# The WHO files spell some country names differently from air_quality_frame.
WHO_COUNTRY_ALIASES = {
    "USA": "United States of America",
    "Netherlands": "Netherlands (Kingdom of the)",
    "United Kingdom": "United Kingdom of Great Britain and Northern Ireland",
}
who_country_names = air_quality_frame['country'].map(
    lambda name: WHO_COUNTRY_ALIASES.get(name, name)
)

who_columns = {}
for year in range(2010, 2018):
    dataset_path = "/kaggle/input/aq-who-2010-2017/data_AQ_WHO/AQ" + str(year) + ".csv"
    # Read WHO data (sep=None lets the python engine sniff the delimiter)
    who_data = pd.read_csv(dataset_path, sep=None, engine='python')[['First Location', 'First Period', 'First FactValueForMeasure']]

    # Country -> value lookup. If a country appears more than once, the last
    # row wins (same as building a dict from the rows in order).
    who_values = (
        who_data
        .drop_duplicates(subset='First Location', keep='last')
        .set_index('First Location')['First FactValueForMeasure']
    )

    # Vectorised lookup; unmatched countries become NaN.
    who_columns[str(year)] = who_country_names.map(who_values).astype(float)

# assign() returns a new frame and overwrites existing year columns, so this
# cell can be re-run without side effects on a previously displayed frame.
air_quality_frame = air_quality_frame.assign(**who_columns)

# Order the columns: 'country' first, then the years ascending.
cols = ['country'] + sorted(col for col in air_quality_frame.columns if col != 'country')
air_quality_frame = air_quality_frame.reindex(cols, axis=1)
air_quality_frame.head(100)

## Visualize the Air Pollution over the years

In [None]:
# Year columns to plot, spelled the way they are named in air_quality_frame.
year_labels = [str(year_number) for year_number in range(2010, 2024)]

fig, ax = plt.subplots(figsize=(15, 8))

# One line per country: x = year (as int), y = that country's yearly values.
for _, record in air_quality_frame.iterrows():
    yearly_values = record[year_labels]
    ax.plot(
        yearly_values.index.astype(int),
        yearly_values.values,
        marker='o',
        label=record['country'],
    )

ax.set_title('Air Pollution by Country (2010-2023)', fontsize=14)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Pollution (PM 2.5)', fontsize=12)
# Legend outside the axes so it does not hide the lines.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()

# Step 3: Analysis of the correlation of the EV stock share and the air pollution

## Calculation of the Pearson Correlation Coefficient

In [None]:
# First, reshape the air pollution data to long format:
# one row per (country, year) with the measurement in 'air_quality'.
air_quality_long = air_quality_frame.melt(
    id_vars=['country'],
    var_name='year',
    value_name='air_quality'
)

# After melt the years are column-name strings; convert them to numbers so
# they can be matched against the numeric 'year' of the EV data.
air_quality_long['year'] = pd.to_numeric(air_quality_long['year'])

# Rename the 'value' column in the electric vehicle data for clarity.
# (No-op when the column has already been renamed, so the cell can be re-run.)
electric_vehicle_frame = electric_vehicle_frame.rename(columns={'value': 'ev_adoption'})

# Merge the datasets on BOTH country and year. Joining on 'year' alone would
# pair every country's air quality with every region's EV share of that year,
# so the per-country correlation would not compare a country with itself.
merged_df = pd.merge(
    air_quality_long,
    electric_vehicle_frame,
    left_on=['country', 'year'],
    right_on=['region', 'year'],
    how='inner'
).drop(columns='region')  # identical to 'country' after the join

# Calculate the Pearson correlation coefficient for each country.
# Selecting the two value columns before apply() keeps the grouping column
# out of the lambda's input.
correlation_df = (
    merged_df.groupby('country')[['air_quality', 'ev_adoption']]
    .apply(lambda group: group['air_quality'].corr(group['ev_adoption']))
    .reset_index()
)
correlation_df.columns = ['country', 'correlation']

# Sort by absolute correlation value, strongest first (NaN rows go last)
correlation_df = correlation_df.sort_values(
    'correlation', key=lambda r: r.abs(), ascending=False
)

# Display the results
correlation_df

## Analyzing the correlations for each country

In [None]:
# Calculate a 90% confidence interval for every country's correlation.
# For a two-sided interval at level c the critical value is the
# (1 - (1 - c) / 2) quantile of the standard normal, i.e. 0.95 for c = 0.90.
CONFIDENCE_LEVEL = 0.90
z_score = stats.norm.ppf(1 - (1 - CONFIDENCE_LEVEL) / 2)

# Number of countries in the correlation table
n = len(correlation_df)

# Sample size behind each correlation: the number of complete
# (air_quality, ev_adoption) pairs of that country in merged_df.
pair_counts = (
    merged_df.dropna(subset=['air_quality', 'ev_adoption'])
    .groupby('country')
    .size()
)
sample_size = correlation_df['country'].map(pair_counts)

# Fisher's Z-Transformation for more precise confidence intervals.
# The standard error 1/sqrt(n_obs - 3) uses the number of observations of the
# respective country and is only defined for n_obs > 3; otherwise the bounds
# are left as NaN.
z_transform = np.arctanh(correlation_df['correlation'])
stderr = 1 / np.sqrt((sample_size - 3).where(sample_size > 3))
ci_lower = np.tanh(z_transform - (z_score * stderr))
ci_upper = np.tanh(z_transform + (z_score * stderr))

# Add confidence intervals to DataFrame
correlation_df['ci_lower'] = ci_lower
correlation_df['ci_upper'] = ci_upper

# Count countries with at least weak correlation (|r| >= 0.2)
abs_corr = correlation_df['correlation'].abs()
weak_corr = correlation_df[abs_corr >= 0.2]
no_corr = correlation_df[abs_corr < 0.2]

n_weak = len(weak_corr)
n_none = len(no_corr)

# Calculate percentages relative to the countries that could be classified.
# Countries whose correlation is NaN fall into neither group; using them in
# the denominator would make the label percentages disagree with the pie.
n_classified = n_weak + n_none
percent_weak = (n_weak / n_classified) * 100 if n_classified else 0.0
percent_none = (n_none / n_classified) * 100 if n_classified else 0.0

if n_classified:
    # Create pie chart
    plt.figure(figsize=(10, 8))
    labels = [f'At least weak\ncorrelation ({percent_weak:.1f}%)',
              f'No significant\ncorrelation ({percent_none:.1f}%)']
    sizes = [n_weak, n_none]
    colors = ['#ff9999', '#66b3ff']

    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
            startangle=90, shadow=True)
    plt.axis('equal')
    plt.title('Distribution of Correlation Strengths')

    # Show the plot
    plt.show()
else:
    print("No country has a defined correlation; nothing to plot.")


In [None]:
# Coefficient of determination: the square of each country's correlation.
correlation_df['r_squared'] = correlation_df['correlation'].pow(2)

# Highest r² first
correlation_df_sorted = correlation_df.sort_values(by='r_squared', ascending=False)

# Presentation table with readable headers, rounded to four decimals
summary_df = (
    correlation_df_sorted[['country', 'correlation', 'r_squared']]
    .rename(columns={
        'country': 'Country',
        'correlation': 'r (Correlation)',
        'r_squared': 'r² (Coefficient of Determination)',
    })
    .round(4)
)

# Report both ends of the ranking
print("R-squared Analysis:")
print("\nTop 5 countries with highest explained variance:")
print(summary_df.head().to_string(index=False))
print("\nBottom 5 countries with lowest explained variance:")
print(summary_df.tail().to_string(index=False))

# Mean r² over all countries (unrounded values)
mean_r_squared = correlation_df['r_squared'].mean()
print(f"\nAverage r² across all countries: {mean_r_squared:.4f}")
print(f"This means on average {(mean_r_squared * 100):.1f}% of the variance is explained by the correlation")


In [None]:
# Countries left out of the cross-country comparison.
# NOTE(review): presumably excluded as outliers - confirm and document the reason.
EXCLUDED_COUNTRIES = ['China', 'India']

# Get 2023 air pollution data
air_quality_2023 = air_quality_long.loc[
    (air_quality_long['year'] == 2023)
    & ~air_quality_long['country'].isin(EXCLUDED_COUNTRIES),
    ['country', 'air_quality']
].reset_index(drop=True)

# Get the latest EV adoption value per country (assuming this represents current share)
latest_ev = (
    electric_vehicle_frame.sort_values('year')
    .groupby('region')
    .last()
    .reset_index()
    .rename(columns={'region': 'country'})
)

# Merge 2023 air pollution with EV data. Rows with a missing value on either
# side are dropped: they do not contribute to the correlation or the scatter
# plot anyway, and np.polyfit cannot fit a line through NaN values.
analysis_df = pd.merge(
    air_quality_2023,
    latest_ev[['country', 'ev_adoption']],
    on='country',
    how='inner'
).dropna(subset=['air_quality', 'ev_adoption'])

# Calculate the overall correlation coefficient
overall_correlation = analysis_df['air_quality'].corr(analysis_df['ev_adoption'])
r_squared = overall_correlation ** 2

# Create a scatter plot to visualize the relationship
plt.figure(figsize=(10, 6))
plt.scatter(analysis_df['ev_adoption'], analysis_df['air_quality'])

# Add country labels to points
for _, point in analysis_df.iterrows():
    plt.annotate(point['country'], (point['ev_adoption'], point['air_quality']),
                 xytext=(5, 5), textcoords='offset points')

plt.xlabel('Electric Vehicle Share')
plt.ylabel('Air Pollution (2023)')
plt.title('Relationship between EV Share and Air Pollution in 2023')

# Add trend line (a straight-line fit needs at least two distinct x values)
if analysis_df['ev_adoption'].nunique() >= 2:
    trend_coefficients = np.polyfit(analysis_df['ev_adoption'], analysis_df['air_quality'], 1)
    trend = np.poly1d(trend_coefficients)
    trend_x = np.sort(analysis_df['ev_adoption'].to_numpy())
    plt.plot(trend_x, trend(trend_x), "r--", alpha=0.8)

plt.grid(True, alpha=0.3)
plt.show()

# Print results
print(f"Overall correlation coefficient between 2023 air pollution and EV share: {overall_correlation:.4f}")
print(f"\nr² across all countries: {r_squared:.4f}")
print(f"This means on average {(r_squared * 100):.1f}% of the variance is explained by the correlation")