# Week 1: Dataset Exploration

This notebook covers initial data loading, inspection, and basic exploration of the India Air Quality dataset.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to every figure drawn after this cell.
# set_theme() is the preferred entry point; sns.set() is only an alias for it
# and is documented as a candidate for removal.
sns.set_theme(style="whitegrid")


In [None]:
# Read the raw CSV into `df`, the frame every later cell works on.
# The encoding is passed explicitly as ISO-8859-1 rather than pandas' default.
data_path = "../data/data.csv"
df = pd.read_csv(data_path, encoding="ISO-8859-1")

# Quick sanity check: (rows, columns), then a preview of the first rows.
print("Shape of dataset:", df.shape)
df.head()

In [None]:
# List the column names, then let info() print each column's dtype
# and non-null count.
column_names = list(df.columns)
print("Columns:", column_names)
df.info()

In [None]:
# Count missing values per column and show the most incomplete columns first.
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max)
# for the numerical columns.
summary_stats = df.describe()
summary_stats

In [None]:
# Parse the date column.
# errors='coerce' converts unparseable values to NaT instead of raising, so
# count those values explicitly rather than letting them disappear silently.
raw_dates = df['date']
parsed_dates = pd.to_datetime(raw_dates, errors='coerce')

# Present before parsing but NaT afterwards == could not be parsed.
# (If this cell is re-run, the column is already datetime and this reports 0.)
n_unparseable = int((parsed_dates.isna() & raw_dates.notna()).sum())
print("Unparseable dates coerced to NaT:", n_unparseable)

df['date'] = parsed_dates

# Derive calendar parts for time-based grouping.
# NOTE: rows whose date is NaT get NaN here, which makes these float columns.
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month

# Check date range (min/max ignore NaT)
print("Date range:", df['date'].min(), "to", df['date'].max())

In [None]:
# Visualize records per year.
# `year` is a float column whenever some dates failed to parse (NaT -> NaN),
# which would label the bars "2010.0". Drop the missing years and cast to int
# so the tick labels are plain years. The counts are unchanged because
# value_counts() already ignores NaN.
records_per_year = (
    df['year']
    .dropna()
    .astype(int)
    .value_counts()
    .sort_index()
)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 5))
records_per_year.plot(kind='bar', ax=ax, title="Records per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Records")
plt.show()

In [None]:
# Most represented states and monitoring locations, by number of records.
state_counts = df['state'].value_counts()
location_counts = df['location'].value_counts()

# value_counts() sorts descending, so the first ten rows are the ten largest.
print("Top 10 States:", state_counts.head(10), sep="\n")
print("\nTop 10 Locations:", location_counts.head(10), sep="\n")

## Summary

- The dataset contains over 430,000 entries spanning multiple years.
- Columns like `so2`, `no2`, `rspm`, `spm`, and `pm2_5` are crucial for analysis but have missing values.
- Some states and locations are much more represented than others.
- The `date` column has been converted to datetime (values that could not be parsed become `NaT`), and `year` and `month` have been extracted for time-based analysis.
