# Task 2 - Time-Series Forecasting (Noida AQI)
This notebook builds lag features (7-day window), fits two models (linear regression and random forest), compares them against a persistence baseline, evaluates all three with MAE, RMSE, and MAPE, and plots the predictions.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# Read the bulletin CSV, parse the 'date' column, and order rows chronologically.
df = (
    pd.read_csv('/mnt/data/Noida_AQIBulletins.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

# Index the 'Index Value' column by date on a regular daily grid; dates
# missing from the file become NaN rows, which are then filled with the most
# recent earlier observation.
ts = df.set_index('date')[['Index Value']].asfreq('D')
ts['Index Value'] = ts['Index Value'].ffill()

# Build the supervised-learning table. For each date the features are the
# previous `window` daily values: lag_1 is the day before, lag_2 two days
# before, and so on up to lag_<window>.
window = 7
cols = [f'lag_{lag}' for lag in range(1, window + 1)]
for lag, col in enumerate(cols, start=1):
    ts[col] = ts['Index Value'].shift(lag)

# The target is the value on the row's own date, i.e. one step after the
# newest feature (lag_1). Shifting the target a further day ahead would leave
# a one-day gap between the newest feature and the target, so a baseline that
# reuses lag_1 would be two days stale, and every target would be paired with
# the wrong date on the index.
ts['target'] = ts['Index Value']

# Rows without a complete lag history contain NaN and are dropped.
data = ts.dropna().copy()

# Chronological hold-out: the first 80% of rows are used for training and the
# remaining rows for testing (no shuffling, so the test rows follow the
# training rows in the table's order).
split_idx = int(len(data) * 0.8)
train, test = data.iloc[:split_idx].copy(), data.iloc[split_idx:].copy()

# Plain NumPy arrays: lag columns as features, 'target' as the label.
X_train, y_train = train[cols].to_numpy(), train['target'].to_numpy()
X_test, y_test = test[cols].to_numpy(), test['target'].to_numpy()

# Models
# Fit both learned models on the training split; `fit` returns the estimator
# itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Persistence baseline: reuse the first feature column (lag_1) as the forecast.
y_pred_persistence = X_test[:, 0]
# Predictions of the two fitted models on the held-out rows.
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Positions where the true value equals zero are left out of the average
    because the relative error is undefined there. If no position has a
    non-zero true value, NaN is returned.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    nonzero = actual != 0
    if not nonzero.any():
        return np.nan
    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100

# Report MAE, RMSE and MAPE on the test split for each forecaster.
for name, y_pred in [('Persistence', y_pred_persistence), ('LinearRegression', y_pred_lr), ('RandomForest', y_pred_rf)]:
    mae = mean_absolute_error(y_test, y_pred)
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas sqrt(MSE) works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape_val = mape(y_test, y_pred)
    print(name, 'MAE={:.3f}, RMSE={:.3f}, MAPE={:.2f}%'.format(mae, rmse, mape_val))

# Plot last 120 days
# Show at most the final 120 test rows (fewer if the test split is shorter).
n_plot = min(120, len(test))
plot_idx = test.index[-n_plot:]

plt.figure(figsize=(12, 5))
# One line per series, all drawn against the same date axis.
for label, series in [
    ('Actual', y_test),
    ('Persistence', y_pred_persistence),
    ('LinearRegression', y_pred_lr),
    ('RandomForest', y_pred_rf),
]:
    plt.plot(plot_idx, series[-n_plot:], label=label)

plt.xlabel('Date')
plt.ylabel('Index Value')
plt.title('Predicted vs Actual - Last {} days'.format(n_plot))
plt.legend()
plt.xticks(rotation=45)  # slanted tick labels
plt.tight_layout()
plt.show()