In [1]:
# %%
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix
import joblib

# -----------------------------
# Load dataset
# !!! IMPORTANT: Replace "forest_cover_dataset.csv" with the actual path to your dataset file.
# This dataset is expected to have features as described in "Forest Cover Type Prediction.pdf"
# and the target column named "Cover_Type".
# Wilderness_Area (4 binary columns) and Soil_Type (40 binary columns) are assumed
# to be already in their binary/dummy variable format.
try:
    df = pd.read_csv("train.csv")
except FileNotFoundError:
    print("Error: 'train.csv' not found. Please replace with your actual dataset file path.")
    exit()

# -----------------------------
# Basic Preprocessing (Minimal, assuming data is mostly ready as per PDF description)

# Example: If there are any non-numeric identifier columns not part of features or target, drop them.
# For this problem, based on the PDF, all described columns are either features or the target.
# If your CSV has an ID column, for example, you would drop it here:
# if 'Id' in df.columns:
#     df.drop(columns=['Id'], inplace=True)

# Handle missing values - a simple approach is to drop rows with any NaNs.
# Consider more sophisticated imputation if appropriate for your dataset.
df = df.dropna()

# Ensure target variable 'Cover_Type' is integer (as per PDF description [cite: 2])
if 'Cover_Type' in df.columns:
    df['Cover_Type'] = df['Cover_Type'].astype(int)
else:
    print("Error: Target column 'Cover_Type' not found in the dataset.")
    exit()

# -----------------------------
# Features and Target
# 'Cover_Type' is the target variable [cite: 3]
# All other columns are features.
if 'Cover_Type' not in df.columns:
    print("Error: Target column 'Cover_Type' is missing from the dataset.")
    exit()

X = df.drop(columns=['Cover_Type'])
y = df['Cover_Type']

# Check for any NaNs left after potential dropna
assert X.isnull().sum().sum() == 0, "There are still NaN values in features X."
assert y.isnull().sum() == 0, "There are still NaN values in target y."

# -----------------------------
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y if y.nunique() > 1 else None)

# -----------------------------
# Scale features
# Features like Elevation, Aspect, Slope, Distances, Hillshades are numerical [cite: 1]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# =============================
# 1. Logistic Regression
# =============================
print("\n" + "="*30)
print("LOGISTIC REGRESSION")
print("="*30)
# For multi-class problems, 'ovr' (One-vs-Rest) or 'multinomial' can be used.
# 'lbfgs' solver supports multinomial.
logreg = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000, random_state=42)
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]} # Expanded C for potentially more complex data
grid_lr = GridSearchCV(logreg, param_grid_lr, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
grid_lr.fit(X_train, y_train)
y_pred_lr = grid_lr.predict(X_test)

print("Best Params:", grid_lr.best_params_)
acc_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {acc_lr:.4f}")
# MSE is not a primary metric for classification, but included for structural consistency with the example.
print(f"MSE: {mean_squared_error(y_test, y_pred_lr):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

# =============================
# 2. Decision Tree
# =============================
print("\n" + "="*30)
print("DECISION TREE")
print("="*30)
dt = DecisionTreeClassifier(random_state=42)
param_grid_dt = {'max_depth': [5, 10, 15, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
grid_dt = GridSearchCV(dt, param_grid_dt, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
grid_dt.fit(X_train, y_train)
y_pred_dt = grid_dt.predict(X_test)

print("Best Params:", grid_dt.best_params_)
acc_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy: {acc_dt:.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_dt):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

# =============================
# 3. Random Forest
# =============================
print("\n" + "="*30)
print("RANDOM FOREST")
print("="*30)
rf = RandomForestClassifier(random_state=42)
# Adjusted parameters for potentially larger/more complex dataset
param_grid_rf = {'n_estimators': [100, 200],
                 'max_depth': [10, 20, None],
                 'min_samples_split': [2, 5],
                 'min_samples_leaf': [1, 2]}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
grid_rf.fit(X_train, y_train)
y_pred_rf = grid_rf.predict(X_test)

print("Best Params:", grid_rf.best_params_)
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {acc_rf:.4f}")
print(f"MSE: {mean_squared_error(y_test, y_pred_rf):.4f}") # Included for consistency
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# =============================
# 📊 Accuracy Comparison
# =============================
print("\n" + "="*30)
print("MODEL ACCURACY COMPARISON")
print("="*30)
print(f"Logistic Regression Accuracy: {acc_lr:.4f}")
print(f"Decision Tree Accuracy     : {acc_dt:.4f}")
print(f"Random Forest Accuracy     : {acc_rf:.4f}")

# =============================
# 💾 Save the best model
# =============================
# Determine the best model based on accuracy
models_accuracies = {
    "LogisticRegression": (grid_lr.best_estimator_, acc_lr),
    "DecisionTree": (grid_dt.best_estimator_, acc_dt),
    "RandomForest": (grid_rf.best_estimator_, acc_rf)
}

best_model_name = max(models_accuracies, key=lambda k: models_accuracies[k][1])
best_model_estimator = models_accuracies[best_model_name][0]
best_model_accuracy = models_accuracies[best_model_name][1]

model_filename = f"best_forest_cover_model_{best_model_name}_{best_model_accuracy:.4f}.pkl"
joblib.dump(best_model_estimator, model_filename)
print(f"\n✅ Best model ({best_model_name} with accuracy {best_model_accuracy:.4f}) saved to: {model_filename}")

# %%


LOGISTIC REGRESSION
Fitting 5 folds for each of 5 candidates, totalling 25 fits




Best Params: {'C': 10}
Accuracy: 0.6667
MSE: 3.0615
Classification Report:
               precision    recall  f1-score   support

           1       0.63      0.65      0.64       432
           2       0.57      0.44      0.50       432
           3       0.56      0.51      0.54       432
           4       0.81      0.91      0.86       432
           5       0.62      0.68      0.65       432
           6       0.58      0.60      0.59       432
           7       0.86      0.87      0.87       432

    accuracy                           0.67      3024
   macro avg       0.66      0.67      0.66      3024
weighted avg       0.66      0.67      0.66      3024

Confusion Matrix:
 [[280  71   1   0  25   1  54]
 [102 192  19   0  95  19   5]
 [  0   3 222  60  17 130   0]
 [  0   0  21 393   0  18   0]
 [ 11  63  42   0 293  23   0]
 [  0  10  91  33  38 260   0]
 [ 53   0   1   0   2   0 376]]

DECISION TREE
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Params: 