In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import STL
import numpy as np

In [None]:
# Read the CSV at `file_path` into `data`; the later cells all build on this frame.
file_path = '/content/explore_pearse_street.csv'
data = pd.read_csv(file_path)

# Tally the missing (NaN) entries column by column and print the resulting Series.
empty_rows_per_column = data.isna().sum()
print(f"Number of empty rows per column: {empty_rows_per_column}")

In [None]:
# Histogram of the non-missing PM10 readings (50 bins), drawn on an explicit axes
fig, ax = plt.subplots(figsize=(10, 6))
data['PM10'].dropna().hist(bins=50, edgecolor='black', ax=ax)
ax.set_title('Distribution of PM10 Data')
ax.set_xlabel('PM10')
ax.set_ylabel('Frequency')
# Series.hist switches the grid on by default; turn it back off.
ax.grid(False)
plt.show()

In [None]:
# PM10 data by hour (max and min values)
# Parse the timestamps as day-first, tz-aware UTC values. `infer_datetime_format` is
# no longer passed: it is deprecated since pandas 2.0 (consistent-format inference is
# the default there) and only triggers a warning.
missing_before_parse = data['Date and Time'].isna().sum()
data['Date and Time'] = pd.to_datetime(data['Date and Time'], dayfirst=True, utc=True, errors='coerce')

# errors='coerce' turns unparseable entries into NaT; report them instead of losing them silently.
unparsed_count = int(data['Date and Time'].isna().sum() - missing_before_parse)
if unparsed_count:
    print(f"Warning: {unparsed_count} 'Date and Time' value(s) could not be parsed and were set to NaT")

# Hour of day taken from the parsed timestamp; used as the grouping key here and in later cells.
data['Hour'] = data['Date and Time'].dt.hour

# One row per hour with the largest and smallest PM10 value observed in that hour.
pm10_by_hour = data.groupby('Hour')['PM10'].agg(['max', 'min']).reset_index()

plt.figure(figsize=(12, 6))
plt.plot(pm10_by_hour['Hour'], pm10_by_hour['max'], marker='o', linestyle='-', label='Max PM10')
plt.plot(pm10_by_hour['Hour'], pm10_by_hour['min'], marker='o', linestyle='-', label='Min PM10')
plt.title('Max and Min PM10 Values by Hour')
plt.xlabel('Hour of the Day')
plt.ylabel('PM10')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Mean PM10 for each value of the 'Hour' column, drawn as a single line
avg_pm10_by_hour = data.groupby('Hour')['PM10'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    avg_pm10_by_hour['Hour'],
    avg_pm10_by_hour['PM10'],
    marker='o',
    linestyle='-',
    label='Average PM10',
)
ax.set_title('Average PM10 Levels by Hour')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Average PM10')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Mean PM10 per weekday; weekday codes run 0..6 and are mapped to Monday..Sunday below
data['DayOfWeek'] = data['Date and Time'].dt.dayofweek

avg_pm10_by_day = data.groupby('DayOfWeek')['PM10'].mean().reset_index()

# Swap the numeric weekday codes for readable names before plotting.
days = dict(enumerate(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']))
avg_pm10_by_day['DayOfWeek'] = avg_pm10_by_day['DayOfWeek'].map(days)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    avg_pm10_by_day['DayOfWeek'],
    avg_pm10_by_day['PM10'],
    marker='o',
    linestyle='-',
    label='Average PM10',
)
ax.set_title('Average PM10 Levels by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Average PM10')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Find the calendar date whose mean PM10 is the largest and print that row
calendar_date = data['Date and Time'].dt.date
daily_avg_pm10 = data.groupby(calendar_date)['PM10'].mean().reset_index()

# idxmax gives the row label of the maximum; .loc pulls that row (date + mean).
peak_row_label = daily_avg_pm10['PM10'].idxmax()
highest_daily_avg_pm10 = daily_avg_pm10.loc[peak_row_label]
print(highest_daily_avg_pm10)

In [None]:
# PM10 trends over months
# 'Date and Time' is parsed with utc=True, so it is tz-aware. Periods cannot carry a
# timezone: calling to_period() directly drops it implicitly and emits a UserWarning.
# Remove the timezone explicitly first (for UTC data the wall-clock values are unchanged).
data['Month'] = data['Date and Time'].dt.tz_localize(None).dt.to_period('M')

# Mean PM10 per calendar month.
avg_pm10_by_month = data.groupby('Month')['PM10'].mean().reset_index()
# Convert the monthly periods back to timestamps so matplotlib can plot them on a date axis.
avg_pm10_by_month['Month'] = avg_pm10_by_month['Month'].dt.to_timestamp()

plt.figure(figsize=(12, 6))
plt.plot(avg_pm10_by_month['Month'], avg_pm10_by_month['PM10'], marker='o', linestyle='-', label='Average PM10')
plt.title('Average PM10 Levels by Month')
plt.xlabel('Month')
plt.ylabel('Average PM10')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Count PM10 outliers using the 1.5 * IQR fences
# Both quartiles come from a single quantile call.
Q1, Q3 = data['PM10'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier when its PM10 lies strictly below the lower fence or strictly
# above the upper fence; missing PM10 values satisfy neither comparison.
outside_fences = data['PM10'].lt(lower_bound) | data['PM10'].gt(upper_bound)
outliers = data[outside_fences]
num_outliers = len(outliers)
print(num_outliers)

In [None]:
# Keep only the rows that have no missing value in any column, then report the
# resulting shape and column names.
cleaned_data = data.dropna()
cleaned_data_shape = cleaned_data.shape
print(cleaned_data_shape)

cleaned_data_columns = cleaned_data.columns.tolist()
print(cleaned_data_columns)

# Histograms of the pollutant columns on a 2x2 grid.
# Slot 2 stays empty: the O3 panel that belonged there is currently disabled.
plt.figure(figsize=(16, 12))
for slot, pollutant in ((1, 'NO2'), (3, 'PM10'), (4, 'PM2.5')):
    plt.subplot(2, 2, slot)
    cleaned_data[pollutant].hist(bins=50, edgecolor='black')
    plt.title(f'Distribution of {pollutant}')
    plt.xlabel(pollutant)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Seasonal decomposition of PM10 data
# Order the cleaned rows by time and index them by timestamp. The step is guarded so the
# cell can be re-run: once 'Date and Time' has become the index it is no longer a column,
# and sorting/indexing by it again would raise a KeyError.
if 'Date and Time' in cleaned_data.columns:
    cleaned_data = cleaned_data.sort_values('Date and Time').set_index('Date and Time')

# NOTE(review): period=24 presumably models a daily cycle of hourly readings, and STL treats
# the observations as evenly spaced while dropna() may have left gaps — TODO confirm.
stl = STL(cleaned_data['PM10'], period=24)
result = stl.fit()

# One stacked panel per component of the fitted decomposition.
stl_components = [
    ('Observed PM10', result.observed),
    ('Trend', result.trend),
    ('Seasonal', result.seasonal),
    ('Residual', result.resid),
]

plt.figure(figsize=(14, 10))
for panel_number, (panel_title, component) in enumerate(stl_components, start=1):
    plt.subplot(4, 1, panel_number)
    plt.plot(component)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics for PM10
pm10_percentile_levels = [0.1, 0.25, 0.5, 0.75, 0.9]

# Default describe() output and the raw quantiles, kept under their original names.
summary_stats = cleaned_data['PM10'].describe()
percentiles = cleaned_data['PM10'].quantile(pm10_percentile_levels)

# describe() already reports 25%/50%/75%, so appending the renamed quantiles to it
# produced duplicate '25%', '50%' and '75%' rows. Asking describe() for the wanted
# percentiles directly yields one row per label (count, mean, std, min, 10%..90%, max).
summary_stats_combined = cleaned_data['PM10'].describe(percentiles=pm10_percentile_levels)
print(summary_stats_combined)

In [None]:
# Save the filtered dataframe to a CSV file (the frame's index is written as the first column)
# NOTE(review): the input file is named for Pearse Street but this output is named for
# Amiens Street — confirm the filename is intended.
filtered_csv_path = 'process_amiens_street_data.csv'
cleaned_data.to_csv(filtered_csv_path)

# Offer the file as a browser download only when running in Google Colab. The import is
# guarded so the cell also completes in environments where google.colab is not installed.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; skipping browser download of {filtered_csv_path}")
else:
    files.download(filtered_csv_path)