<a href="https://colab.research.google.com/github/MUSA-650/musa-650-spring-2025/blob/main/s1_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [2]:
# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
column_names = [
    "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    *["Wilderness_Area_" + str(i) for i in range(4)],
    *["Soil_Type_" + str(i) for i in range(40)],
    "Cover_Type"
]
df = pd.read_csv(url, compression='gzip', header=None, names=column_names)

In [3]:
# Separate features and target
X = df.drop("Cover_Type", axis=1)
y = df["Cover_Type"]

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train, validation, and test sets
train_size = 11340
validation_size = 3780
test_size = 565892

X_train = X_scaled[:train_size]
y_train = y[:train_size]

X_validation = X_scaled[train_size:train_size + validation_size]
y_validation = y[train_size:train_size + validation_size]

X_test = X_scaled[train_size + validation_size:]
y_test = y[train_size + validation_size:]


In [4]:
# Parameter tuning using GridSearchCV (with validation set)
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 1],
    'kernel': ['rbf']
}

grid_search = GridSearchCV(SVC(), param_grid, cv=[(slice(None), slice(0, len(y_validation)))], verbose=2, n_jobs=-1) #using the defined validation set as cv
grid_search.fit(X_train, y_train)

# Best SVM model
best_svm = grid_search.best_estimator_

# Predictions and evaluation on the test set
y_pred = best_svm.predict(X_test)
print(classification_report(y_test, y_pred))

print("Best Parameters:", grid_search.best_params_)


Fitting 1 folds for each of 12 candidates, totalling 12 fits


ValueError: String indexing is not supported with 'axis=0'

In [None]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_percent = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis] * 100


# Plotting the Confusion Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix_percent, annot=True, fmt=".1f", cmap="Blues")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()
