In [6]:
#Importing modiles
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [5]:
if not os.path.exists(CSV_FILENAME):
    raise FileNotFoundError(
        f"Could not find '{CSV_FILENAME}'.\n"
        "In Google Colab: left sidebar -> Files -> Upload the CSV.\n"
        "Then re-run this cell."
    )

df = pd.read_csv(CSV_FILENAME)

print("Shape:", df.shape)
df.head()

Shape: (1727, 7)


Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [8]:
print("\nColumns:", df.columns.tolist())
print("\nMissing values per column:\n", df.isna().sum())
print("\nTarget distribution (value counts):")

#Trying to infer target column name. Common: "class" or last column.
target_col = None
for c in df.columns:
    if str(c).strip().lower() in ["class", "target", "label", "acceptability", "decision"]:
        target_col = c
        break
if target_col is None:
    target_col = df.columns[-1]

print("Using target column:", target_col)
print(df[target_col].value_counts(dropna=False))


Columns: ['vhigh', 'vhigh.1', '2', '2.1', 'small', 'low', 'unacc']

Missing values per column:
 vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64

Target distribution (value counts):
Using target column: unacc
unacc
unacc    1209
acc       384
good       69
vgood      65
Name: count, dtype: int64


In [11]:
#Split features/target
X = df.drop(columns=[target_col])
y = df[target_col]

#Identify categorical columns
cat_cols = [c for c in X.columns if X[c].dtype == "object" or str(X[c].dtype).startswith("category")]
num_cols = [c for c in X.columns if c not in cat_cols]

print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Categorical columns: ['vhigh', 'vhigh.1', '2', '2.1', 'small', 'low']
Numeric columns: []
Train shape: (1381, 6) Test shape: (346, 6)


In [12]:
#Preprocessing + Baseline model (Decision Tree)
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop"
)

dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)

dt_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", dt_model)
])

dt_pipeline.fit(X_train, y_train)

dt_pred = dt_pipeline.predict(X_test)

dt_acc = accuracy_score(y_test, dt_pred)
print("Decision Tree Accuracy:", round(dt_acc, 4))
print("\nClassification Report (Decision Tree):\n")
print(classification_report(y_test, dt_pred))

print("\nConfusion Matrix (Decision Tree):\n")
print(confusion_matrix(y_test, dt_pred))


Decision Tree Accuracy: 0.9769

Classification Report (Decision Tree):

              precision    recall  f1-score   support

         acc       0.99      0.91      0.95        77
        good       0.88      1.00      0.93        14
       unacc       0.98      1.00      0.99       242
       vgood       1.00      1.00      1.00        13

    accuracy                           0.98       346
   macro avg       0.96      0.98      0.97       346
weighted avg       0.98      0.98      0.98       346


Confusion Matrix (Decision Tree):

[[ 70   2   5   0]
 [  0  14   0   0]
 [  1   0 241   0]
 [  0   0   0  13]]


In [13]:
#Improved model (Random Forest)
rf_model = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

rf_pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", rf_model)
])

rf_pipeline.fit(X_train, y_train)

rf_pred = rf_pipeline.predict(X_test)

rf_acc = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", round(rf_acc, 4))
print("\nClassification Report (Random Forest):\n")
print(classification_report(y_test, rf_pred))

print("\nConfusion Matrix (Random Forest):\n")
print(confusion_matrix(y_test, rf_pred))


Random Forest Accuracy: 0.9798

Classification Report (Random Forest):

              precision    recall  f1-score   support

         acc       0.94      0.97      0.96        77
        good       1.00      0.86      0.92        14
       unacc       0.99      0.99      0.99       242
       vgood       1.00      0.92      0.96        13

    accuracy                           0.98       346
   macro avg       0.98      0.94      0.96       346
weighted avg       0.98      0.98      0.98       346


Confusion Matrix (Random Forest):

[[ 75   0   2   0]
 [  2  12   0   0]
 [  2   0 240   0]
 [  1   0   0  12]]


In [14]:
#Comparison summary
summary = pd.DataFrame({
    "Model": ["Decision Tree", "Random Forest"],
    "Accuracy": [dt_acc, rf_acc]
}).sort_values("Accuracy", ascending=False)

summary


Unnamed: 0,Model,Accuracy
1,Random Forest,0.979769
0,Decision Tree,0.976879
