In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import numpy as np

In [None]:
# Read the raw CSV export into a DataFrame.
file_path = '/content/process_pearse_street_data.csv'
data = pd.read_csv(file_path)

# Parse the 'Date and Time' column into datetimes and promote it to the index
# so that later cells can resample the frame by calendar period.
data = (
    data
    .assign(**{'Date and Time': pd.to_datetime(data['Date and Time'])})
    .set_index('Date and Time')
)

In [None]:
# Display the column labels of the loaded frame.
data.columns

In [None]:
# Remove the 'Hour', 'DayOfWeek' and 'Month' columns from the frame.
# The result is rebound instead of mutated in place, and errors='ignore' makes
# the cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
data = data.drop(columns=['Hour', 'DayOfWeek', 'Month'], errors='ignore')

In [None]:
# Display the dimensions of `data` as a (rows, columns) tuple.
data.shape

In [None]:
# Keep only the timestamps that actually carry a PM10 reading.
has_pm10 = data['PM10'].notna()
data_cleaned = data[has_pm10]

# Aggregate to one row per calendar day (mean of each column), then discard
# any day that still has a missing value in at least one column.
df = data_cleaned.resample('D').mean().dropna()

In [None]:
# Overlay the daily series of the plotted pollutants on a single figure.
# O3 is currently excluded from this plot.
plt.figure(figsize=(14, 8))
for pollutant in ('PM10', 'NO2', 'PM2.5'):
    plt.plot(df.index, df[pollutant], label=pollutant)
plt.title('Time Series of Pollutants')
plt.xlabel('Date')
plt.ylabel('Concentration')
plt.legend()
plt.show()

In [None]:
# One 30-bin histogram per pollutant, drawn as a grid of subplots.
pollutant_columns = ['PM10', 'NO2', 'PM2.5']
df[pollutant_columns].hist(bins=30, figsize=(14, 10))
plt.suptitle('Histograms of Pollutants')
plt.show()

In [None]:
# Side-by-side box plots comparing the distribution of each pollutant.
plt.figure(figsize=(14, 8))
box_axes = sns.boxplot(data=df[['PM10', 'NO2', 'PM2.5']])
box_axes.set_title('Box Plots of Pollutants')
plt.show()

In [None]:
# Pair plot: scatter plots for every pollutant pair, distributions on the diagonal.
sns.pairplot(df[['PM10', 'NO2', 'PM2.5']])
# The pair grid is laid out tightly, so a super-title at the default height
# is drawn on top of the first row of subplots. Lifting it slightly above the
# figure (y > 1) keeps both the title and the plots readable.
plt.suptitle('Pair Plots of Pollutants', y=1.02)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `df`.
feature_corr = df.corr()
plt.figure(figsize=(12, 6))
sns.heatmap(
    feature_corr,
    annot=True,
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    linecolor='black',
)
plt.title('Correlation Heatmap for All Features')
plt.show()

In [None]:
# Scatter each PM10 value against the next one (lag 1) to eyeball serial dependence.
lag_axes = pd.plotting.lag_plot(df['PM10'])
lag_axes.set_title('Lag Plot of PM10')
plt.show()

In [None]:
# 12-row rolling mean and standard deviation of PM10, plotted over the raw series.
pm10_window = df['PM10'].rolling(window=12)
rolling_mean = pm10_window.mean()
rolling_std = pm10_window.std()

plt.figure(figsize=(14, 8))
curves = (
    (df['PM10'], 'PM10'),
    (rolling_mean, 'Rolling Mean'),
    (rolling_std, 'Rolling Std'),
)
for series, curve_label in curves:
    plt.plot(series, label=curve_label)
plt.title('Rolling Mean and Standard Deviation of PM10')
plt.legend()
plt.show()

In [None]:
# ACF (left) and PACF (right) of PM10 for the first 40 lags, side by side.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
pm10_series = df['PM10'].dropna()

panels = (
    (plot_acf, axes[0], 'Autocorrelation Function (ACF)'),
    (plot_pacf, axes[1], 'Partial Autocorrelation Function (PACF)'),
)
for draw, axis, heading in panels:
    draw(pm10_series, lags=40, ax=axis)
    # Set after drawing so this heading replaces the plotter's default title.
    axis.set_title(heading)

plt.show()

In [None]:
# Engineer lag and rolling-mean features. All new columns are collected in a
# dict and attached with a single assign() so the frame is never mutated
# column by column; insertion order matches the original column order.
engineered = {}

# Lag features for PM10 (lags 1-3, chosen from the ACF/PACF plots).
for lag in range(1, 4):
    engineered[f'PM10_Lag_{lag}'] = df['PM10'].shift(lag)

# Rolling means of PM10. PM10 is the prediction target later in the notebook,
# so the windows are computed over the series shifted by one row: each value
# summarises only *previous* observations. Without the shift the window would
# contain the current PM10 value (and a window of 1 would be an exact copy of
# the target), leaking the answer into the feature set.
# Note: with the shift, PM10_RollingMean_1 equals PM10_Lag_1; the column is
# kept so the output schema is unchanged.
pm10_previous = df['PM10'].shift(1)
for window in (3, 7, 1, 10):
    engineered[f'PM10_RollingMean_{window}'] = pm10_previous.rolling(window=window).mean()

# Lag features (1-7) for the other pollutants (NO2, PM2.5).
for pollutant in ['NO2', 'PM2.5']:
    for lag in range(1, 8):
        engineered[f'{pollutant}_Lag_{lag}'] = df[pollutant].shift(lag)

# Rolling means (3 and 7 rows) for the other pollutants.
for pollutant in ['NO2', 'PM2.5']:
    for window in (3, 7):
        engineered[f'{pollutant}_RollingMean_{window}'] = df[pollutant].rolling(window=window).mean()

# Attach the new columns and drop the leading rows whose lags/windows are incomplete.
df = df.assign(**engineered).dropna()

In [None]:
# Row-over-row percent change for each pollutant (O3 is currently excluded).
for pollutant in ('NO2', 'PM10', 'PM2.5'):
    # fill_method=None: compute the change on the values as they are, without
    # the implicit forward-fill that newer pandas versions deprecate.
    change = df[pollutant].pct_change(fill_method=None) * 100
    # A previous value of exactly 0 yields +/-inf, which dropna() would keep
    # and which later breaks correlation and model fitting. Turn it into NaN
    # so the row is removed together with the other undefined rows.
    df[f'{pollutant}_Pct_Change'] = change.replace([np.inf, -np.inf], np.nan)

# Drop rows where the percent change is undefined (first row, division by zero).
df = df.dropna()

In [None]:
# Calendar features taken from the datetime index (Monday=0 ... Sunday=6).
df['Day_of_Week'] = df.index.dayofweek
# Vectorised weekend flag (1 for Saturday/Sunday, else 0) instead of a per-row lambda.
df['Is_Weekend'] = (df['Day_of_Week'] >= 5).astype('int64')

# Interaction features between pollutants.
# O3-based features (NO2/O3 ratio, 7-row O3 rolling max) are currently disabled.
df['PM10_NO2_Product'] = df['PM10'] * df['NO2']
# A PM2.5 value of exactly 0 would make the ratio +/-inf, which dropna() does
# not remove; mapping the zero denominator to NaN lets the row be dropped below.
df['PM10_PM2.5_Ratio'] = df['PM10'] / df['PM2.5'].replace(0, np.nan)

# Short-window variability / minimum features.
df['NO2_RollingStd_3'] = df['NO2'].rolling(window=3).std()
df['PM2.5_RollingMin_3'] = df['PM2.5'].rolling(window=3).min()

# Running totals from the first remaining row onwards.
df['Cumulative_NO2'] = df['NO2'].cumsum()
df['Cumulative_PM10'] = df['PM10'].cumsum()

# Drop rows with NaN values resulting from the rolling calculations
# (and from an undefined ratio, if any).
df = df.dropna()

In [None]:
# Pairwise correlations across every column of the feature-engineered frame.
correlation_matrix_final = df.corr()

# Text dump of the matrix.
print(correlation_matrix_final)

# Same matrix rendered as an annotated heatmap.
heatmap_style = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    linecolor='black',
)
plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix_final, **heatmap_style)
plt.title('Correlation Heatmap for Final Dataset with Derived Features')
plt.show()

In [None]:
# Print a summary of `df`: index, column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Take the PM10 column of the correlation matrix (kept as a one-column frame)
# and order the features from most positively to most negatively correlated.
pm10_column = correlation_matrix_final[['PM10']]
pm10_correlations = pm10_column.sort_values(by='PM10', ascending=False)

print(pm10_correlations)

In [None]:
# Keep the columns whose *absolute* correlation with PM10 exceeds 0.5,
# i.e. strong positive and strong negative relationships alike.
strong_link = correlation_matrix_final['PM10'].abs() > 0.5
high_corr_columns = correlation_matrix_final.index[strong_link]
filtered_df = df[high_corr_columns]

In [None]:
# Display the column labels that survived the correlation filter.
filtered_df.columns

In [None]:
# Candidate predictors: every strongly correlated column except PM10 itself.
selected_vars = [column for column in high_corr_columns if column != 'PM10']
print("Selected Variables after Linear Correlation Screening:\n", selected_vars)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Non-linear relevance analysis on the variables that passed the linear screening.
X = df[selected_vars]
y = df['PM10']

# Both estimators below are stochastic. A fixed seed makes the scores (and the
# threshold-based selection that depends on them) identical on every run.
SEED = 42

# Mutual information between each feature and PM10.
mi = mutual_info_regression(X, y, random_state=SEED)
mi_df = pd.DataFrame(mi, index=X.columns, columns=['Mutual Information'])
mi_df.sort_values(by='Mutual Information', ascending=False, inplace=True)

# Impurity-based feature importance from a random forest fitted on the same data.
rf = RandomForestRegressor(random_state=SEED)
rf.fit(X, y)
importance = rf.feature_importances_

importance_df = pd.DataFrame(importance, index=X.columns, columns=['Importance'])
importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Combine both scores side by side; rows are aligned on the feature name.
results_df = pd.concat([mi_df, importance_df], axis=1)
print("Mutual Information and Feature Importance:\n", results_df)

In [None]:
# Pair plot of PM10 against every selected variable.
pairplot_columns = ['PM10', *selected_vars]
sns.pairplot(df[pairplot_columns])
plt.show()

In [None]:
# Minimum scores a feature must reach; passing either one is enough to be kept.
mi_threshold = 0.1
importance_threshold = 0.05

# Flag the features that clear each threshold and keep the union of both sets.
passes_mi = results_df['Mutual Information'] >= mi_threshold
passes_importance = results_df['Importance'] >= importance_threshold
variables_to_keep = results_df.index[passes_mi | passes_importance].tolist()

# Rank the kept features by importance first, mutual information second (both descending).
kept_scores = results_df.loc[variables_to_keep]
sorted_results_df = kept_scores.sort_values(
    by=['Importance', 'Mutual Information'],
    ascending=False,
)

# Display the most important variables
print("Sorted Important Variables:\n", sorted_results_df)

In [None]:
# Save the filtered dataframe to a CSV file.
# NOTE(review): the input file is named after Pearse Street while this output
# name says "amiens" - possibly a copy-paste leftover; confirm before relying on it.
filtered_csv_path = 'model_amiens_street_data.csv'
filtered_df.to_csv(filtered_csv_path)

# Offer the file as a browser download only when running in Google Colab.
# Outside Colab the google.colab package is not installed, so the import is
# guarded and the cell no longer fails with ImportError after the CSV is written.
try:
    from google.colab import files
except ImportError:
    print(f"Saved {filtered_csv_path}; not running in Google Colab, skipping download.")
else:
    files.download(filtered_csv_path)