In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = '/content/Air Quality data.csv'  # Replace with your file's name
data = pd.read_csv(file_path)

# Data Cleaning
# Parse 'Date of the record' as datetimes. errors='coerce' turns values that
# cannot be parsed into NaT instead of raising, so they can be reported and
# removed below.
data['Date of the record'] = pd.to_datetime(data['Date of the record'], errors='coerce')

# Show the rows whose date is missing or could not be parsed, then drop them
invalid_dates = data[data['Date of the record'].isna()]
print("Invalid Dates:")
print(invalid_dates)

data = data.dropna(subset=['Date of the record'])

# Extract additional time-related columns
data['Month'] = data['Date of the record'].dt.month  # Extract month for seasonal analysis
data['Year'] = data['Date of the record'].dt.year  # Extract year for yearly trends

# Convert numeric columns to proper types; entries that are not numbers become NaN.
# The AQI column is coerced too: every aggregate and correlation computed later
# depends on it, and a single non-numeric entry in the CSV would otherwise leave
# the whole column as strings.
for col in [
    'Air Quality Index (AQI) value',
    'PM2.5 levels',
    'PM10 levels.',
    'NH3 (Ammonia) levels.',
    'NO2 (Nitrogen Dioxide) levels',
    'Xylene',
]:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Drop rows with missing AQI values (including any that were just coerced to NaN)
data = data.dropna(subset=['Air Quality Index (AQI) value'])

# 1. Analyze Variations Across Cities
# Mean AQI per city, sorted so the highest averages come first
city_aqi = data.groupby('Name of the city')['Air Quality Index (AQI) value'].mean().sort_values(ascending=False)
print("Average AQI by City:")
print(city_aqi)

# 2. Analyze Seasonal Variations
# Mean AQI per calendar month (the 'Month' column is derived from the record date)
seasonal_aqi = data.groupby('Month')['Air Quality Index (AQI) value'].mean()
print("Average AQI by Month:")
print(seasonal_aqi)

# 3. Identify Key Pollutants
# Select only numeric columns for the correlation calculation, leaving out the
# derived calendar columns: 'Month' and 'Year' are numeric but are not pollutant
# measurements, so they would otherwise appear in the pollutant ranking.
# errors='ignore' keeps this working if either column is absent.
numeric_columns = data.select_dtypes(include=[np.number]).drop(columns=['Month', 'Year'], errors='ignore')
correlation = numeric_columns.corr()

# Rank every remaining numeric column by its correlation with AQI
# (AQI itself appears at the top with a correlation of 1.0)
print("Correlation with AQI:")
print(correlation['Air Quality Index (AQI) value'].sort_values(ascending=False))

# 4. Visualizations
# a. AQI over time: one line per city, with the per-city legend suppressed
trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=data,
    x='Date of the record',
    y='Air Quality Index (AQI) value',
    hue='Name of the city',
    legend=False,
    ax=trend_ax,
)
trend_ax.set_title('AQI Trends Over Time')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('AQI')
plt.xticks(rotation=45)  # slant the date labels
plt.tight_layout()
plt.show()

# b. Average AQI for each month
# NOTE(review): recent seaborn releases appear to deprecate `palette` without
# `hue` - confirm against the installed version before changing this call.
month_fig, month_ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    x=seasonal_aqi.index,
    y=seasonal_aqi.values,
    palette='viridis',
    ax=month_ax,
)
month_ax.set_title('Seasonal AQI Variation')
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Average AQI')
plt.show()

# c. Annotated heatmap of the full correlation matrix
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Between Pollutants and AQI')
plt.show()

# Recommendations
# Derive the key pollutants from the computed correlations rather than printing
# a fixed list, so the message reflects the data that was actually analysed.
aqi_corr = correlation['Air Quality Index (AQI) value']
# Remove AQI's correlation with itself and the derived calendar columns;
# errors='ignore' keeps this working whether or not those labels are present.
pollutant_corr = aqi_corr.drop(
    labels=['Air Quality Index (AQI) value', 'Month', 'Year'], errors='ignore'
).dropna()
# This script treats a higher AQI as worse, so keep only the columns that rise
# with AQI and report the four strongest of them.
key_pollutants = pollutant_corr[pollutant_corr > 0].sort_values(ascending=False).head(4)
pollutant_summary = ", ".join(map(str, key_pollutants.index)) or "none identified"
print(f"Key pollutants responsible for poor AQI based on correlation: {pollutant_summary}.")
print("Cities with highest average AQI:", city_aqi.head(5).index.tolist())
# idxmax() returns the month number with the highest average AQI
print("Season with worst air quality:", seasonal_aqi.idxmax())
