<a href="https://colab.research.google.com/github/kaisarfardin6620/Weather-rainfall-prediction/blob/main/Weather_rainfall_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PyCaret three times, once per extras group: "full", "mlops" and "time-series".
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
!pip install pycaret[full]
!pip install pycaret[mlops]
!pip install pycaret[time-series]

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve, f1_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the weather dataset from the mounted Drive folder.
df =  pd.read_csv('/content/drive/MyDrive/dataset/weather.csv')

In [None]:
# Keep an unmodified copy of the loaded frame; it is the input to PyCaret's
# setup() at the end of the notebook, while `df` is transformed in between.
df_pycaret = df.copy()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# First three rows.
df.head(3)

In [None]:
# Last three rows.
df.tail(3)

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Parse the 'Date' column once and derive the calendar features from it.
parsed_dates = pd.to_datetime(df['Date'])
reference_date = parsed_dates.min()  # earliest date; day zero for DaysSince
month_number = parsed_dates.dt.month

df = df.assign(
    Date=parsed_dates,
    Month=month_number,
    Quarter=parsed_dates.dt.quarter,
    DaysSince=(parsed_dates - reference_date).dt.days,
    # Sine/cosine of the month number on a 12-step circle.
    Month_sin=np.sin(2 * np.pi * month_number / 12),
    Month_cos=np.cos(2 * np.pi * month_number / 12),
)

In [None]:
# The calendar features derived from 'Date' are already on the frame, so the
# raw column is dropped. Rebinding (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the column is gone.
df = df.drop(columns='Date', errors='ignore')

In [None]:
# Preview the frame after the 'Date' column was dropped and the calendar features added.
df.head(3)

In [None]:
# One histogram per column of the frame.
hist_axes = df.hist(figsize=(10, 8))
plt.gcf().tight_layout()
plt.show()

In [None]:
# Pairwise correlations of all columns, annotated with two decimals.
correlations = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Scatter-plot matrix of every column against every other column.
sns.pairplot(df)
pair_figure = plt.gcf()  # the figure pairplot just created
pair_figure.suptitle("Scatter Plot Matrix", y=1.02)
pair_figure.tight_layout()
plt.show()

In [None]:
# Side-by-side box plots, one per column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
ax.set_title("Box Plots of Numerical Features")
# Slant the column names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Side-by-side violin plots, one per column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(data=df, ax=ax)
ax.set_title("Violin Plots of Numerical Features")
# Slant the column names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# One kernel-density panel per column, three panels per row.
# layout=(-1, 3) lets pandas infer the number of rows, so the grid still fits
# when the frame holds more than nine columns (a fixed 3x3 layout raises
# ValueError in that case).
df.plot(kind='density', subplots=True, layout=(-1, 3), figsize=(15, 10), sharex=False)
plt.suptitle("Density Plots of Numerical Features", y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Box plots again, this time titled for reading off outliers.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
ax.set_title("Box Plots of Numerical Features for Outlier Visualization")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
def detect_outliers_iqr(df):
    """Flag rows lying outside the 1.5 * IQR fences of each numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan; only columns selected by ``np.number`` are inspected.

    Returns
    -------
    dict
        Maps a column name to the index labels of the rows whose value is
        below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. Columns with no
        flagged rows are left out.
    """
    flagged = {}
    numeric_part = df.select_dtypes(include=np.number)

    for column_name, values in numeric_part.items():
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        spread = q3 - q1

        below_fence = values < q1 - 1.5 * spread
        above_fence = values > q3 + 1.5 * spread

        hits = values[below_fence | above_fence].index
        if len(hits) > 0:
            flagged[column_name] = hits

    return flagged

outliers_by_feature = detect_outliers_iqr(df)

# Report every flagged row label, column by column.
print("Outliers detected by IQR method:")
for feature, flagged_rows in outliers_by_feature.items():
    print(f"Feature '{feature}': Indices {list(flagged_rows)}")

# Only rows flagged on the target column 'Rainfall' are removed; rows flagged
# on other columns stay in the frame.
if 'Rainfall' not in outliers_by_feature:
    df_cleaned = df.copy()
    print("\nNo outliers detected in 'Rainfall' or no outliers removed.")
else:
    outlier_indices_rainfall = outliers_by_feature['Rainfall']
    df_cleaned = df.drop(index=outlier_indices_rainfall).reset_index(drop=True)
    print(f"\nRemoved {len(outlier_indices_rainfall)} outliers from 'Rainfall'.")
    print("Shape of DataFrame after removal:", df_cleaned.shape)

In [None]:
 # Build the predictors and the target from df_cleaned, the frame produced by
 # the outlier-removal step, so that step actually reaches the models
 # (df itself still contains the flagged 'Rainfall' rows).
 x = df_cleaned.drop(columns='Rainfall')
 y = df_cleaned['Rainfall']

In [None]:
# Expand the predictors with all degree-2 terms (squares and pairwise
# products); include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_poly = poly.fit_transform(x)

# Label the expanded columns after the source features. Only a missing
# get_feature_names_out (older scikit-learn) falls back to positional column
# labels; any other error is allowed to surface instead of being swallowed.
try:
    feature_names = poly.get_feature_names_out(x.columns)
except AttributeError:
    feature_names = None

# Reuse x's row index so the expanded frame stays aligned with y.
x_poly_df = pd.DataFrame(x_poly, columns=feature_names, index=x.index)

print("Original features:\n", x.head())
print("Polynomial & interaction features:\n", x_poly_df.head())

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_poly_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def evaluate_model(name, model, x_test, y_test):
    """Score a fitted regressor on held-out data, print a report, return the scores.

    Parameters
    ----------
    name : str
        Label used in the printed report and stored under ``"Model"``.
    model : object
        Fitted estimator exposing ``predict``.
    x_test, y_test
        Held-out features and the matching true target values.

    Returns
    -------
    dict
        Keys ``"Model"``, ``"MSE"``, ``"RMSE"``, ``"MAE"`` and ``"R2"``.
    """
    y_hat = model.predict(x_test)

    squared_error = mean_squared_error(y_test, y_hat)
    scores = {
        "Model": name,
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),  # same units as the target
        "MAE": mean_absolute_error(y_test, y_hat),
        "R2": r2_score(y_test, y_hat),
    }

    report = [
        f"Metrics for {name}",
        f"Mean Squared Error (MSE)      : {scores['MSE']:.4f}",
        f"Root Mean Squared Error (RMSE) : {scores['RMSE']:.4f}",
        f"Mean Absolute Error (MAE)     : {scores['MAE']:.4f}",
        f"R-squared (R2)                : {scores['R2']:.4f}",
        "------------------------",
    ]
    print("\n".join(report))

    return scores

# One score dict per evaluated baseline model is appended here by later cells.
metrics = []

In [None]:
# Baseline 1: ordinary least squares on the polynomial features.
pipeline_lr = Pipeline(steps=[('LinearRegression', LinearRegression())])
pipeline_lr.fit(x_train, y_train)

lr_scores = evaluate_model("Linear Regression", pipeline_lr, x_test, y_test)
metrics.append(lr_scores)

In [None]:
# Residual diagnostics for the linear-regression baseline.
y_pred_lr = pipeline_lr.predict(x_test)
residuals_lr = y_test - y_pred_lr  # positive = model under-predicted

# How the residuals are distributed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_lr, kde=True, ax=ax)
ax.set(title='Residuals Distribution for LinearRegression', xlabel='Residuals', ylabel='Frequency')
plt.show()

# Residuals against predictions; the predictions are rainfall values, so the
# x axis is labelled accordingly.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred_lr, residuals_lr, alpha=0.7)
ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
ax.set(title='Residuals vs. Predicted for LinearRegression', xlabel='Predicted Rainfall', ylabel='Residuals')
plt.show()

In [None]:
# Baseline 2: decision tree with default hyper-parameters. The seed is fixed
# (same value as the tree in the tuning grid) so the "before tuning" scores
# are reproducible from run to run.
pipeline_dt = Pipeline([('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42))])
pipeline_dt.fit(x_train, y_train)
metrics.append(evaluate_model("DecisionTreeRegressor (Before Tuning)", pipeline_dt, x_test, y_test))

In [None]:
# Residual diagnostics for the untuned decision tree.
y_pred_dt = pipeline_dt.predict(x_test)
residuals_dt = y_test - y_pred_dt  # positive = model under-predicted

# How the residuals are distributed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_dt, kde=True, ax=ax)
ax.set(title='Residuals Distribution for DecisionTreeRegressor', xlabel='Residuals', ylabel='Frequency')
plt.show()

# Residuals against predictions; the predictions are rainfall values, so the
# x axis is labelled accordingly.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred_dt, residuals_dt, alpha=0.7)
ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
ax.set(title='Residuals vs. Predicted for DecisionTreeRegressor', xlabel='Predicted Rainfall', ylabel='Residuals')
plt.show()

In [None]:
# Baseline 3: random forest with default hyper-parameters. The seed is fixed
# (same value as the forest in the tuning grid) so the "before tuning" scores
# are reproducible from run to run.
pipeline_rf = Pipeline([('RandomForestRegressor', RandomForestRegressor(random_state=42))])
pipeline_rf.fit(x_train, y_train)
metrics.append(evaluate_model("RandomForestRegressor (Before Tuning)", pipeline_rf, x_test, y_test))

In [None]:
# Residual diagnostics for the untuned random forest.
y_pred_rf = pipeline_rf.predict(x_test)
residuals_rf = y_test - y_pred_rf  # positive = model under-predicted

# How the residuals are distributed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_rf, kde=True, ax=ax)
ax.set(title='Residuals Distribution for RandomForestRegressor', xlabel='Residuals', ylabel='Frequency')
plt.show()

# Residuals against predictions; the predictions are rainfall values, so the
# x axis is labelled accordingly.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred_rf, residuals_rf, alpha=0.7)
ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
ax.set(title='Residuals vs. Predicted for RandomForestRegressor', xlabel='Predicted Rainfall', ylabel='Residuals')
plt.show()

In [None]:
# Baseline 4: k-nearest-neighbours with default hyper-parameters.
# NOTE(review): the features reach this distance-based model unscaled, and
# StandardScaler is imported but never used -- confirm that is intended.
pipeline_knn = Pipeline(steps=[('KNeighborsRegressor', KNeighborsRegressor())])
pipeline_knn.fit(x_train, y_train)

knn_scores = evaluate_model("KNeighborsRegressor (Before Tuning)", pipeline_knn, x_test, y_test)
metrics.append(knn_scores)

In [None]:
# Residual diagnostics for the untuned k-nearest-neighbours model.
y_pred_knn = pipeline_knn.predict(x_test)
residuals_knn = y_test - y_pred_knn  # positive = model under-predicted

# How the residuals are distributed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_knn, kde=True, ax=ax)
ax.set(title='Residuals Distribution for KNeighborsRegressor', xlabel='Residuals', ylabel='Frequency')
plt.show()

# Residuals against predictions; the predictions are rainfall values, so the
# x axis is labelled accordingly.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_pred_knn, residuals_knn, alpha=0.7)
ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
ax.set(title='Residuals vs. Predicted for KNeighborsRegressor', xlabel='Predicted Rainfall', ylabel='Residuals')
plt.show()

In [None]:
# Grouped bar chart: one group per baseline model, one bar per metric.
df_metrics = pd.DataFrame(metrics)
regression_metrics = ["MSE", "RMSE", "MAE", "R2"]
bar_width = 0.15
bar_positions = list(range(len(df_metrics)))

fig, ax = plt.subplots(figsize=(12, 8))

# Each metric's bars are shifted right by one bar width per metric.
for slot, metric in enumerate(regression_metrics):
    shifted = [base + bar_width * slot for base in bar_positions]
    ax.bar(shifted, df_metrics[metric], width=bar_width, label=metric)

# Put the tick under the middle of each group of bars.
centre_shift = bar_width * (len(regression_metrics) - 1) / 2
ax.set_xticks([base + centre_shift for base in bar_positions])
ax.set_xticklabels(df_metrics["Model"], rotation=45, ha='right')
ax.set(xlabel='Models', ylabel='Scores', title='Performance Metrics for Different Models')
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Estimators to tune, each paired with the hyper-parameter grid to search.
# LinearRegression has an empty grid, so it is fitted with its defaults.
models = {
    'LinearRegression': dict(
        model=LinearRegression(),
        params=dict(),
    ),
    'DecisionTreeRegressor': dict(
        model=DecisionTreeRegressor(random_state=42),
        params=dict(
            max_depth=[None, 3, 5, 10],
            min_samples_split=[2, 5, 10],
        ),
    ),
    'RandomForestRegressor': dict(
        model=RandomForestRegressor(random_state=42),
        params=dict(
            n_estimators=[50, 100, 200],
            max_depth=[None, 5, 10],
            min_samples_split=[2, 5],
        ),
    ),
    'KNeighborsRegressor': dict(
        model=KNeighborsRegressor(),
        params=dict(
            n_neighbors=[3, 5, 7],
            weights=['uniform', 'distance'],
            p=[1, 2],
        ),
    ),
}

In [None]:
# Tuned estimator per model name, filled in by the loop below.
best_models = {}

for name, spec in models.items():
    print(f"\nTraining and tuning {name}...")

    # 5-fold cross-validated grid search, scored by negative MSE, on all cores.
    search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    search.fit(x_train, y_train)
    best_models[name] = search.best_estimator_
    print(f"Best parameters for {name}: {search.best_params_}")

    # Score the selected configuration on the held-out test split.
    y_pred = search.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} Performance on Test Data:")
    print(f"R²: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")

    # Actual vs. predicted; the dashed diagonal marks perfect predictions and
    # spans the full range of the target.
    target_range = [y.min(), y.max()]
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(y_test, y_pred, alpha=0.7)
    ax.plot(target_range, target_range, 'r--')
    ax.set(
        xlabel='Actual Rainfall',
        ylabel='Predicted Rainfall',
        title=f'{name} - Actual vs Predicted Rainfall',
    )
    plt.show()

In [None]:
# Re-score the tuned estimators with the same report used for the baselines.
tuned_model_names = ('DecisionTreeRegressor', 'RandomForestRegressor', 'KNeighborsRegressor')
tuned_metrics = [
    evaluate_model(f"{model_name} (Tuned)", best_models[model_name], x_test, y_test)
    for model_name in tuned_model_names
]

# Linear regression has nothing to tune, so its baseline scores are reused
# for the comparison when they are available.
lr_metrics = next((entry for entry in metrics if entry["Model"] == "Linear Regression"), None)
if lr_metrics is not None:
    tuned_metrics.append(lr_metrics)

In [None]:
# Grouped bar chart: one group per tuned model, one bar per metric.
df_tuned_metrics = pd.DataFrame(tuned_metrics)
regression_metrics = ["MSE", "RMSE", "MAE", "R2"]
bar_width = 0.15
bar_positions = list(range(len(df_tuned_metrics)))

fig, ax = plt.subplots(figsize=(12, 8))

# Each metric's bars are shifted right by one bar width per metric.
for slot, metric in enumerate(regression_metrics):
    shifted = [base + bar_width * slot for base in bar_positions]
    ax.bar(shifted, df_tuned_metrics[metric], width=bar_width, label=metric)

# Put the tick under the middle of each group of bars.
centre_shift = bar_width * (len(regression_metrics) - 1) / 2
ax.set_xticks([base + centre_shift for base in bar_positions])
ax.set_xticklabels(df_tuned_metrics["Model"], rotation=45, ha='right')
ax.set(xlabel='Models', ylabel='Scores', title='Performance Metrics for Tuned Models')
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
from pycaret.regression import setup, compare_models, create_model, tune_model, evaluate_model, finalize_model

In [None]:
reg_setup = setup(data = df_pycaret, target = 'Rainfall', session_id=123)

In [None]:
best_model = compare_models()

In [None]:
evaluate_model(best_model)