# Late Blight Risk Prediction - Google Colab Notebook
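This notebook loads and cleans the late blight dataset from a CSV file, one-hot encodes the features, applies SMOTE to the training split to address class imbalance, trains a Random Forest classifier, and evaluates its performance on a held-out test set.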

In [None]:
!pip install imbalanced-learn

In [None]:

# Standard library
import warnings

# Third-party: data handling, resampling, modelling and evaluation
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Blanket 'ignore' filter: every warning raised after this point is hidden.
warnings.filterwarnings('ignore')

print("Libraries imported successfully.")


In [None]:

# Relative path of the CSV file to load.
csv_filename = "Lateblight_Data_UNIQUE_Planting_Months.csv"

try:
    df = pd.read_csv(csv_filename)
except FileNotFoundError:
    # Report the problem instead of raising; the following cells check
    # whether `df` exists before using it.
    print(f"ERROR: File '{csv_filename}' not found. Please upload it to the Colab session.")
else:
    # Only reached when the file was read: preview the first rows.
    print(f"Dataset '{csv_filename}' loaded successfully. First 5 rows:")
    display(df.head())


In [None]:

# Cleaning step: remove columns that are not used as features and rows whose
# target label is missing. Runs only if the dataset was loaded.
if 'df' not in locals():
    print("Skipping data cleaning as DataFrame 'df' was not loaded.")
else:
    columns_to_drop = [
        "Date",
        "Variety_Notes",
        "Environmental_Impact",
        "Planting_Months",
    ]
    # errors='ignore' makes the drop tolerant of columns that are absent.
    df = df.drop(columns=columns_to_drop, errors='ignore')
    print(f"Dropped columns (if they existed): {columns_to_drop}")

    # Count rows before and after discarding rows without a 'Blight_Risk' value.
    original_rows = len(df)
    df = df.dropna(subset=["Blight_Risk"])
    rows_after_na_drop = len(df)
    print(f"Dropped {original_rows - rows_after_na_drop} rows with missing 'Blight_Risk'.")

    print("\nRemaining columns:", list(df.columns))
    print(f"Dataset shape after cleaning: {df.shape}")


In [None]:

# Encoding step: integer-encode the target and one-hot encode the features.
if 'df' not in locals():
    print("Skipping encoding as DataFrame 'df' was not loaded.")
else:
    # Target: fit a LabelEncoder on 'Blight_Risk' and store the integer codes.
    label_encoder = LabelEncoder()
    df["Blight_Risk_Encoded"] = label_encoder.fit_transform(df["Blight_Risk"])
    print("Target variable 'Blight_Risk' encoded into 'Blight_Risk_Encoded'.")
    class_codes = label_encoder.transform(label_encoder.classes_)
    print("Mapping:", {name: code for name, code in zip(label_encoder.classes_, class_codes)})

    # Features: everything except the raw and encoded target, one-hot encoded.
    feature_frame = df.drop(columns=["Blight_Risk", "Blight_Risk_Encoded"])
    X = pd.get_dummies(feature_frame)
    y = df["Blight_Risk_Encoded"]

    print("\nFeatures prepared (One-Hot Encoded). Shape of X:", X.shape)
    print("Target prepared. Shape of y:", y.shape)
    print("\nFeature columns after One-Hot Encoding (first 15 shown):")
    print(list(X.columns[:15]))


In [None]:

# Split step: stratified 80/20 train/test split with a fixed seed, followed by
# a summary of the resulting shapes and class distributions.
if not ('X' in locals() and 'y' in locals()):
    print("Skipping Train/Test split as X and y were not defined.")
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,  # stratify the split on the target labels
    )
    print("Data split into training and testing sets.")

    # Report the shape of each of the four pieces.
    for _name, _part in (
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
    ):
        print(f"{_name} shape:", _part.shape)

    # Class balance before any resampling.
    print("\nOriginal class distribution in y_train (counts):")
    print(y_train.value_counts())
    print("\nOriginal class distribution in y_train (proportions):")
    print(y_train.value_counts(normalize=True))
    print("\nOriginal class distribution in y_test (counts):")
    print(y_test.value_counts())


In [None]:

# Resampling step: oversample the training split with SMOTE.
# The test split is not touched in this cell.
if 'X_train' not in locals() or 'y_train' not in locals():
    print("Skipping SMOTE as X_train and y_train were not defined.")
else:
    min_samples = y_train.value_counts().min()
    print(f"\nSmallest class in y_train has {min_samples} samples.")

    # Use at most 5 neighbours, and never more than (smallest class size - 1).
    smote_k_neighbors = min(5, min_samples - 1)

    if smote_k_neighbors >= 1:
        print(f"Applying SMOTE with k_neighbors = {smote_k_neighbors}")
        smote = SMOTE(random_state=42, k_neighbors=smote_k_neighbors)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        print("SMOTE applied successfully to the training data.")
    else:
        # No usable neighbour count: fall back to the unmodified training data.
        print("Warning: Minority class has only 1 sample. Cannot apply SMOTE with k>0.")
        X_train_res, y_train_res = X_train, y_train

    print("\nX_train_res shape:", X_train_res.shape)
    print("y_train_res shape:", y_train_res.shape)

    # Class balance after resampling: counts first, then proportions.
    print("\nClass distribution in y_train_res (after SMOTE):")
    resampled_target = pd.Series(y_train_res)
    print(resampled_target.value_counts())
    print(resampled_target.value_counts(normalize=True))


In [None]:

# Training step: fit a 100-tree Random Forest (fixed seed) on the resampled
# training data produced by the SMOTE cell.
if not ('X_train_res' in locals() and 'y_train_res' in locals()):
    print("Skipping model training as resampled training data is not available.")
else:
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_res, y_train_res)
    print("Random Forest model trained successfully.")


In [None]:

# Evaluation step: predict on the test split, then show a classification
# report and a confusion matrix.
if 'model' in locals() and 'X_test' in locals() and 'y_test' in locals():
    y_pred = model.predict(X_test)
    print("\nClassification Report (on Test Set):")
    if 'label_encoder' in locals():
        # LabelEncoder assigns codes 0..n_classes-1 in the order of `classes_`.
        # Passing that full list as `labels` keeps the report rows and the
        # confusion-matrix axes aligned with the class names even when a class
        # is absent from both y_test and y_pred. Without it, scikit-learn
        # infers fewer labels than there are names and raises ValueError.
        # str() so non-string class labels can still be used as headings.
        report_names = [str(cls) for cls in label_encoder.classes_]
        report_labels = list(range(len(report_names)))

        print(classification_report(
            y_test, y_pred, labels=report_labels, target_names=report_names
        ))
        print("\nConfusion Matrix (on Test Set):")
        cm = confusion_matrix(y_test, y_pred, labels=report_labels)
        # Rows are true classes, columns are predicted classes.
        cm_df = pd.DataFrame(cm, index=report_names, columns=report_names)
        display(cm_df)
    else:
        # No encoder available: fall back to numeric labels only.
        print("LabelEncoder not found. Cannot display class names in report/matrix.")
        print(classification_report(y_test, y_pred))
        print("\nConfusion Matrix (numeric labels):")
        print(confusion_matrix(y_test, y_pred))
else:
    print("Skipping model evaluation as model or test data is not available.")
