In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
)

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [7]:
from ucimlrepo import fetch_ucirepo
dataset = fetch_ucirepo(id=10)
df = pd.concat([dataset.data.features, dataset.data.targets], axis=1).dropna()
target_col = None
num_col = []
cat_col = []
for index, row in dataset.variables.iterrows():
    if row["role"] == "Target":
            target_col = row["name"]
    elif row["type"] == "Continuous" or row["type"] == "Integer":
            num_col.append(row["name"])
    else:
            cat_col.append(row["name"])
df

Unnamed: 0,price,highway-mpg,city-mpg,peak-rpm,horsepower,compression-ratio,stroke,bore,fuel-system,engine-size,...,wheel-base,engine-location,drive-wheels,body-style,num-of-doors,aspiration,fuel-type,make,normalized-losses,symboling
3,13950.0,30,24,5500.0,102.0,10.0,3.40,3.19,mpfi,109,...,99.8,front,fwd,sedan,4.0,std,gas,audi,164.0,2
4,17450.0,22,18,5500.0,115.0,8.0,3.40,3.19,mpfi,136,...,99.4,front,4wd,sedan,4.0,std,gas,audi,164.0,2
6,17710.0,25,19,5500.0,110.0,8.5,3.40,3.19,mpfi,136,...,105.8,front,fwd,sedan,4.0,std,gas,audi,158.0,1
8,23875.0,20,17,5500.0,140.0,8.3,3.40,3.13,mpfi,131,...,105.8,front,fwd,sedan,4.0,turbo,gas,audi,158.0,1
10,16430.0,29,23,5800.0,101.0,8.8,2.80,3.50,mpfi,108,...,101.2,front,rwd,sedan,2.0,std,gas,bmw,192.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,16845.0,28,23,5400.0,114.0,9.5,3.15,3.78,mpfi,141,...,109.1,front,rwd,sedan,4.0,std,gas,volvo,95.0,-1
201,19045.0,25,19,5300.0,160.0,8.7,3.15,3.78,mpfi,141,...,109.1,front,rwd,sedan,4.0,turbo,gas,volvo,95.0,-1
202,21485.0,23,18,5500.0,134.0,8.8,2.87,3.58,mpfi,173,...,109.1,front,rwd,sedan,4.0,std,gas,volvo,95.0,-1
203,22470.0,27,26,4800.0,106.0,23.0,3.40,3.01,idi,145,...,109.1,front,rwd,sedan,4.0,turbo,diesel,volvo,95.0,-1


In [6]:
def main_func(ucirepo_id, model_name):
    dataset = fetch_ucirepo(id=ucirepo_id)
    df = pd.concat([dataset.data.features, dataset.data.targets], axis=1).dropna()

    target_col = None
    num_col = []
    cat_col = []

    for index, row in dataset.variables.iterrows():
        if row["role"] == "Target":
            target_col = row["name"]
        elif row["type"] == "Continuous" or row["type"] == "Integer":
            num_col.append(row["name"])
        else:
            cat_col.append(row["name"])

    X = df.drop(columns=[target_col])
    y = df[target_col]

    num_imputer = SimpleImputer(strategy="mean")
    cat_encoder = OneHotEncoder(handle_unknown="ignore")
    scaler = StandardScaler()

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_imputer, num_col),
            ("cat", cat_encoder, cat_col),
        ]
    )

    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("scaler", scaler)])

    X = pipeline.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    if model_name == "Decision Tree":
        model = DecisionTreeClassifier(criterion="entropy")
        model.fit(X_train, y_train)
        height = model.tree_.max_depth
    elif model_name == "KNN Classifier":
        model = KNeighborsClassifier()
        model.fit(X_train, y_train)
        height = None 
    elif model_name == "Bayes":
        model = GaussianNB()
        model.fit(X_train, y_train)
        height = None
    elif model_name == "SVM":
        model = SVC()
        model.fit(X_train, y_train)
        height = None
    elif model_name == "Cart":
        model = DecisionTreeClassifier()
        model.fit(X_train, y_train)
        height = model.tree_.max_depth

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")

    metrics = {"Accuracy": accuracy, "Precision": precision, "Recall": recall}

    return metrics, height


In [8]:
if __name__ == "__main__":
    models = ["Decision Tree", "KNN Classifier", "Bayes", "SVM", "Cart"]
    ucirepo_ids = [45, 10, 109]

    final_results = []

    for ucirepo_id in ucirepo_ids:
        aggregated_metrics = {
            "Accuracy": [],
            "Precision": [],
            "Recall": [],
            "Height": [],
        }
        for model in models:
            metrics, height = main_func(ucirepo_id, model)
            for metric, value in metrics.items():
                aggregated_metrics[metric].append(value)

            result = {
                "Dataset_ID": ucirepo_id,
                "Model": model,
                "Accuracy": metrics["Accuracy"],
                "Precision": metrics["Precision"],
                "Recall": metrics["Recall"],
                "Height": height,
            }
            final_results.append(result)

    final_df = pd.DataFrame(final_results)
    final_df.to_csv("training_result.csv", index=False)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
