# Urban Air Pollution Challenge


## Upload the data

In [None]:
import sys
print(sys.executable)

In [None]:
# Import of relevant packages
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score

from sklearn.linear_model import LinearRegression

In [None]:
# Upload Train data
df_train=pd.read_csv('data/Train.csv')

# Upload Test data
df_test=pd.read_csv('data/Test.csv')

In [None]:
df_train.columns

In [None]:
# Choosing to drop the Place_ID X Date, as it doesn't contain any additional information
df_train = df_train.drop('Place_ID X Date', axis = 1)

In [None]:
df_train.head(10)

## Split into train and validation set


In [None]:
# separate columns in target values, 'id' and numerical features
target_vars = ['target', 'target_min', 'target_max', 'target_variance', 'target_count']
id_cols = ['Place_ID', 'Date']
num_cols = [col for col in df_train.columns if col not in target_vars + id_cols and pd.api.types.is_numeric_dtype(df_train[col])]

In [None]:
X = df_train.drop(target_vars, axis=1)
Y = df_train['target']

X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
X_train.info()

## Data cleaning and feature engineering

In [None]:
# Check the number of nans for each column
missing = pd.DataFrame(X_train.isnull().sum(), columns=["Amount"])
missing['Percentage'] = round((missing['Amount']/X_train.shape[0])*100, 2)
missing[missing['Amount'] != 0]

In [None]:
# Create missing data heatmap
plt.figure(figsize=(15, 8))

missing_data = X_train.isnull()
sns.heatmap(missing_data, yticklabels=False, cbar=True, cmap='viridis')
plt.title('Heatmap for check of missing data\n(Yellow = Missing, Dark = Present)', fontsize=14)
plt.xlabel('Features')
plt.ylabel('Observations')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

We remove all columns with more than 50% of missing data:

In [None]:
cols = X_train.columns[X_train.isna().mean() > 0.5].tolist()

In [None]:
X_train = X_train.drop(columns = cols)

X_train = X_train.reset_index(drop=True)

In [None]:
X_train.info()

In [None]:
X_train.describe()

### Preprocessing

For imputing, we consider the numerical columns:

In [None]:
num_cols = [col for col in X_train.columns if col not in id_cols and pd.api.types.is_numeric_dtype(X_train[col])]

In [None]:
X_train.columns

We impute the missing values using the mean per Place_ID

In [None]:
# Fill missing values per Place_ID using the group mean
X_imputed = X_train.copy()
X_imputed[num_cols] = X_train.groupby('Place_ID')[num_cols].transform(lambda x: x.fillna(x.mean()))
# Fill any remaining NaNs (if an entire group was missing for a feature)
# X_train_imputed = X_train_imputed.fillna(X_train_imputed.mean())

Check the missing values again, as there could still be some extra missing values:

In [None]:
missing = pd.DataFrame(X_imputed.isnull().sum(), columns=["Amount"])
missing['Percentage'] = round((missing['Amount']/X_imputed.shape[0])*100, 2)
missing[missing['Amount'] != 0]

Unnamed: 0,Amount,Percentage
L3_SO2_SO2_column_number_density,2,0.01
L3_SO2_SO2_column_number_density_amf,2,0.01
L3_SO2_SO2_slant_column_number_density,2,0.01
L3_SO2_absorbing_aerosol_index,2,0.01
L3_SO2_cloud_fraction,2,0.01
L3_SO2_sensor_azimuth_angle,2,0.01
L3_SO2_sensor_zenith_angle,2,0.01
L3_SO2_solar_azimuth_angle,2,0.01
L3_SO2_solar_zenith_angle,2,0.01


In [None]:
# define class to impute with mean by Place_ID
# from sklearn.base import BaseEstimator, TransformerMixin

# class GroupByPlaceIDImputer(BaseEstimator, TransformerMixin):
#     def __init__(self, place_id='Place_ID', strategy='mean'):
#         self.place_id = place_id
#         self.strategy = strategy
#     def fit(self, X, y=None):
#         self.group_mean_ = X.groupby(self.place_id).transform(self.strategy)
#         return self
#     def transform(self, X):
#         X_filled = X.copy()
#         for col in X.columns:
#             if col != self.place_id:
#                 mask = X_filled[col].isna()
#                 X_filled.loc[mask, col] = self.group_mean_.loc[mask, col]
#         return X_filled.drop(columns=[self.place_id])

We now define a preprocessing pipeline to impute all the missing NaNs and to scale all the data with a standard scaler

In [22]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # its optional to keep it as we already filled the missing values but its a safety layer for the future unseen data
    ('scaler', StandardScaler())
])


# Pipeline([
#     ('groupby_imputer', GroupByPlaceIDImputer(place_id='Place_ID', strategy='mean')),
#     ('simple_imputer', SimpleImputer(strategy='mean')),
#     ('std_scaler', StandardScaler())
# ])
preprocessor = ColumnTransformer([
    ('num', pipeline, num_cols),
], remainder='passthrough')

In [36]:
X_preprocessed = pd.DataFrame(preprocessor.fit_transform(X_train),columns=X_train.columns)

In [37]:
X_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24445 entries, 0 to 24444
Data columns (total 69 columns):
 #   Column                                               Non-Null Count  Dtype 
---  ------                                               --------------  ----- 
 0   Date                                                 24445 non-null  object
 1   Place_ID                                             24445 non-null  object
 2   precipitable_water_entire_atmosphere                 24445 non-null  object
 3   relative_humidity_2m_above_ground                    24445 non-null  object
 4   specific_humidity_2m_above_ground                    24445 non-null  object
 5   temperature_2m_above_ground                          24445 non-null  object
 6   u_component_of_wind_10m_above_ground                 24445 non-null  object
 7   v_component_of_wind_10m_above_ground                 24445 non-null  object
 8   L3_NO2_NO2_column_number_density                     24445 non-null  object


In [38]:
X_preprocessed.isna().sum()

Date                                    0
Place_ID                                0
precipitable_water_entire_atmosphere    0
relative_humidity_2m_above_ground       0
specific_humidity_2m_above_ground       0
                                       ..
L3_SO2_cloud_fraction                   0
L3_SO2_sensor_azimuth_angle             0
L3_SO2_sensor_zenith_angle              0
L3_SO2_solar_azimuth_angle              0
L3_SO2_solar_zenith_angle               0
Length: 69, dtype: int64

All good now! Let's move forward

## EDA

In [39]:
X_preprocessed.shape

(24445, 69)

In [40]:
X_preprocessed.columns

Index(['Date', 'Place_ID', 'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground',
       'L3_NO2_NO2_column_number_density',
       'L3_NO2_NO2_slant_column_number_density',
       'L3_NO2_absorbing_aerosol_index', 'L3_NO2_cloud_fraction',
       'L3_NO2_sensor_altitude', 'L3_NO2_sensor_azimuth_angle',
       'L3_NO2_sensor_zenith_angle', 'L3_NO2_solar_azimuth_angle',
       'L3_NO2_solar_zenith_angle',
       'L3_NO2_stratospheric_NO2_column_number_density',
       'L3_NO2_tropopause_pressure',
       'L3_NO2_tropospheric_NO2_column_number_density',
       'L3_O3_O3_column_number_density', 'L3_O3_O3_effective_temperature',
       'L3_O3_cloud_fraction', 'L3_O3_sensor_azimuth_angle',
       'L3_O3_sensor_zenith_angle', 'L3_O3_solar_azimuth_angle',
       'L3_O3_solar_zenith_angle', 'L3_CO_CO

### Correlations between Pollutants and Target (PM2.5)


In [None]:
# Define pollutant columns
pollutants = [
    'L3_NO2_NO2_column_number_density',
    'L3_O3_O3_column_number_density',
    'L3_CO_CO_column_number_density',
    'L3_CO_H2O_column_number_density',
    'L3_SO2_SO2_column_number_density'
]

target = 'target'

# Rename pollutants for better readability
# pollutant_names = ['NO₂', 'O₃', 'CO', 'CO-H₂O', 'SO₂'] 

# Calculate correlation with target
correlations = X_preprocessed[pollutants].corrwith(y_train).sort_values(ascending=False)
print("Correlations between pollutants and PM2.5:")
print(correlations)

# Correlation Matrix Heatmap
plt.figure(figsize=(10, 8))
corr_matrix = X_preprocessed[pollutants].corr()

# Create a mapping for renaming columns and index
# column_mapping = dict(zip(pollutants, pollutant_names))
# column_mapping[target] = 'PM2.5 Target'

# Rename the correlation matrix columns and index
# corr_matrix_renamed = corr_matrix.rename(columns=column_mapping, index=column_mapping)

# Create a mask for the upper triangle to show only lower triangle (stairs effect)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, 
            fmt='.3f', square=True, linewidths=2, cbar_kws={"shrink": 0.8},
            vmin=-1, vmax=1, mask=mask)
plt.title('Correlation Matrix: Pollutants & PM2.5', fontsize=16, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## Trainining the model

In [None]:
# Convert Date to datetime
X_preprocessed['Date'] = pd.to_datetime(X_preprocessed['Date'])
X_val['Date'] = pd.to_datetime(X_val['Date'])

In [None]:
X_preprocessed = X_preprocessed.drop(['Place_ID','Date'], axis=1)

In [None]:
## in order to exemplify how the predict will work, we will save the validation set
X_val.to_csv("data/X_val.csv")
y_val.to_csv("data/y_val.csv")

In [None]:
#training the model
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_preprocessed, y_train)

In [None]:
y_train_pred = reg.predict(X_preprocessed)
mse = mean_squared_error(y_train, y_train_pred)
print(mse)

In [None]:
rmse = np.sqrt(mse)
print(rmse)

In [None]:
r2 = r2_score(y_train, y_train_pred)
print(r2)