In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [15]:
# Load the raw weather observations from the CSV sitting next to the notebook.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a saved row index -- consider index_col=0 if it is not needed. TODO confirm.
df = pd.read_csv(filepath_or_buffer='weather data.csv')

In [16]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,city_name,dt_txt,temp,pressure,humidity,state
0,Amarawati,2020-01-29 08:30:00+05:30,23.45,1016,59,Maharashtra
1,Amarawati,2020-01-29 11:30:00+05:30,27.91,1015,42,Maharashtra
2,Amarawati,2020-01-29 14:30:00+05:30,31.14,1010,34,Maharashtra
3,Amarawati,2020-01-29 17:30:00+05:30,30.94,1010,32,Maharashtra
4,Amarawati,2020-01-29 20:30:00+05:30,26.14,1012,41,Maharashtra


In [17]:
# Parse the timestamp strings into datetimes so calendar parts can be read off.
df['dt_txt'] = pd.to_datetime(df['dt_txt'])

# Derive the time-based features (hour of day, day of month, month number)
# from the parsed column. In the sample rows the hour matches the wall-clock
# time written in the string (e.g. 08:30+05:30 -> 8).
dt_parts = df['dt_txt'].dt
df['hour'], df['day'], df['month'] = dt_parts.hour, dt_parts.day, dt_parts.month


In [18]:
# Confirm the new hour/day/month columns were appended.
df.head(n=5)

Unnamed: 0,city_name,dt_txt,temp,pressure,humidity,state,hour,day,month
0,Amarawati,2020-01-29 08:30:00+05:30,23.45,1016,59,Maharashtra,8,29,1
1,Amarawati,2020-01-29 11:30:00+05:30,27.91,1015,42,Maharashtra,11,29,1
2,Amarawati,2020-01-29 14:30:00+05:30,31.14,1010,34,Maharashtra,14,29,1
3,Amarawati,2020-01-29 17:30:00+05:30,30.94,1010,32,Maharashtra,17,29,1
4,Amarawati,2020-01-29 20:30:00+05:30,26.14,1012,41,Maharashtra,20,29,1


In [19]:
# The raw timestamp is no longer needed once hour/day/month have been extracted.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['dt_txt'], errors='ignore')


In [20]:

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
import pickle

In [21]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column; both are kept around because they are
# pickled alongside the model in a later cell.
le_city, le_state = LabelEncoder(), LabelEncoder()

# Map city / state names to integer codes and store them as new columns.
df['city_enc'] = le_city.fit_transform(df['city_name'])
df['state_enc'] = le_state.fit_transform(df['state'])

# Model inputs: time parts plus the encoded location.
X = df.loc[:, ['hour', 'day', 'month', 'city_enc', 'state_enc']]
# Model outputs: three targets predicted jointly (multi-output regression).
y = df.loc[:, ['temp', 'humidity', 'pressure']]


In [22]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
import pickle

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap a default XGBRegressor in MultiOutputRegressor so that the three target
# columns of y are handled by a single estimator object.
base_model = XGBRegressor()
multi_model = MultiOutputRegressor(base_model)
multi_model.fit(X_train, y_train)

# Persist the model and the fitted label encoders. Each file is opened in a
# `with` block so the handle is flushed and closed as soon as the dump
# finishes, rather than being left open until garbage collection.
with open("weather_multi_model.pkl", "wb") as model_file:
    pickle.dump(multi_model, model_file)
with open("le_city.pkl", "wb") as city_file:
    pickle.dump(le_city, city_file)
with open("le_state.pkl", "wb") as state_file:
    pickle.dump(le_state, state_file)

print("Multi-output model saved successfully!")


Multi-output model saved successfully!


In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict every target for the held-out rows. Column k of the prediction array
# is scored against column k of y_test (assumes the model keeps y's column
# order -- TODO confirm).
y_pred = multi_model.predict(X_test)

# Score each target on its own: MAE, RMSE (square root of MSE) and R².
metrics = {}
for idx, target_name in enumerate(y.columns):  # ['temp', 'humidity', 'pressure']
    actual = y_test.iloc[:, idx]
    predicted = y_pred[:, idx]
    metrics[target_name] = {
        "MAE": mean_absolute_error(actual, predicted),
        "RMSE": np.sqrt(mean_squared_error(actual, predicted)),
        "R2": r2_score(actual, predicted),
    }

# Report the scores, one section per target, rounded to two decimals.
for target, vals in metrics.items():
    print(f"\n📊 Metrics for {target}:")
    print(f"MAE  : {vals['MAE']:.2f}")
    print(f"RMSE : {vals['RMSE']:.2f}")
    print(f"R²   : {vals['R2']:.2f}")



📊 Metrics for temp:
MAE  : 1.36
RMSE : 1.82
R²   : 0.94

📊 Metrics for humidity:
MAE  : 7.33
RMSE : 9.86
R²   : 0.81

📊 Metrics for pressure:
MAE  : 0.89
RMSE : 1.19
R²   : 0.90


In [24]:
# Collect the distinct city names seen for each state, then turn the result
# into a plain {state: [city, ...]} dictionary.
cities_by_state = df.groupby("state")["city_name"].unique()
state_city_map = {state: list(cities) for state, cities in cities_by_state.items()}

# Persist the lookup so the Streamlit app can load it later.
with open("state_city_map.pkl", "wb") as out_file:
    pickle.dump(state_city_map, out_file)
