# Mini Project

**Here we download and preprocess all the (21) datasets from UCI we use in this project**

In [23]:
### Used 21 datasets:
### adult
### balloons
### breast-cancer
### bc-wisc
### bc-wisc-diag
### bc-wisc-prog
### chess-krvkp
### congress-voting
### conn-bench-sonar 
### connect-4
### credit-approval
### fertility
### haberman-survival
### mammographic
### ionosphere
### iris
### magic
### dermatology
### ecoli
### glass_identification
### letter_recognition

In [24]:
!pip install ucimlrepo



In [25]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, adjusted_rand_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed

In [26]:
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.simplefilter('ignore')
warnings.filterwarnings(action='ignore', category=DataConversionWarning, module='sklearn.utils.validation')

In [27]:
#utility functions
def preprocess_dataset(df,categorical_columns,numeric_columns,binary_target=[]):
    for column in categorical_columns:
            if len(binary_target) != 0 and column in binary_target:
                df[column].fillna('unknown', inplace=True)
                continue
            df[column].fillna(df[column].mode()[0], inplace=True)
    for num in numeric_columns:
        df[num].fillna(df[num].mean(), inplace=True)
        
    if categorical_columns:
        encoder = OneHotEncoder(drop='first', sparse=False)
        encoded_values = encoder.fit_transform(df[categorical_columns])
        encoded_df = pd.DataFrame(encoded_values, columns=encoder.get_feature_names_out(categorical_columns))
        df = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

    if numeric_columns:
        scaler = StandardScaler()
        df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    
    X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)
    
    return df, X_train, X_test, y_train, y_test

def logistic_regression(X_train,y_train,X_test,y_test):
    print("Logistic regression training and evaluation")
    param_grid_lr = {
        'C': [0.1, 1],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear' ]
    }
    lr = LogisticRegression(max_iter=100, random_state=42)
    grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=3, scoring='accuracy')
    grid_search_lr.fit(X_train, y_train)
    lr_best = grid_search_lr.best_estimator_
    lr_pred = lr_best.predict(X_test)
    lr_acc = accuracy_score(y_test, lr_pred)
    print(f"Best accuracy: {lr_acc}\n")
    return lr_acc

def random_forest(X_train,y_train,X_test,y_test):
    print("RF training and evaluation")
    param_grid_rf = {
        'n_estimators': [50, 100],
        'max_depth': [5, 10],
        'min_samples_split': [2, 5]
    }
    rf = RandomForestClassifier(random_state=42)
    grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=3, scoring='accuracy')
    grid_search_rf.fit(X_train, y_train)
    rf_best = grid_search_rf.best_estimator_
    rf_pred = rf_best.predict(X_test)
    rf_acc = accuracy_score(y_test, rf_pred)
    print(f"Best accuracy: {rf_acc}\n")
    return rf_acc

def svm(X_train,y_train,X_test,y_test):
    print("SVM training and evaluation")
    param_dist = {
        'C': [0.1, 1],
        'kernel': ['rbf']
    }

    svm = SVC(random_state=42)
    randomized_search_svm = RandomizedSearchCV(
        estimator=svm,
        param_distributions=param_dist,
        n_iter=50,
        cv=3,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )

    randomized_search_svm.fit(X_train, y_train)
    
    svm_best = randomized_search_svm.best_estimator_
    svm_pred = svm_best.predict(X_test)
    svm_acc = accuracy_score(y_test, svm_pred)
    print(f"Best accuracy: {svm_acc}\n")
    return svm_acc
                

def k_means(X_scaled,X_test,y_test,clusters_max_num):
    print("K-Means training and evaluation")
    y_test = y_test.to_numpy().ravel()
    param_grid_kmeans = {
        'n_clusters': range(2, clusters_max_num) 
    }
    kmeans = KMeans(random_state=42)
    grid_search_kmeans = GridSearchCV(kmeans, param_grid_kmeans, cv=3, scoring='accuracy')
    grid_search_kmeans.fit(X_scaled)
    kmeans_best = grid_search_kmeans.best_estimator_
    kmeans_labels = kmeans_best.fit_predict(X_test)
    kmeans_acc = adjusted_rand_score(y_test, kmeans_labels)
    print(f"Best accuracy: {kmeans_acc}\n")
    return kmeans_acc

def gmm_clustering(X_train, X_test, y_test,clusters_max_num):
    print("Gaussian Mixture Model training and evaluation")
    y_test = y_test.to_numpy().ravel()
    best_score = float('-inf')
    best_params = None

    for n_components in range(2, clusters_max_num): 
        model = GaussianMixture(n_components=n_components, random_state=42)
        labels = model.fit_predict(X_test)
        score = adjusted_rand_score(y_test, labels)
        if score > best_score:
            best_score = score
            best_params = {'n_components': n_components}

    print(f"Best accuracy: {best_score}\n")
    return best_score
    

def knn(X_train,y_train,X_test,y_test):
    print("KNN training and evaluation")
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    knn_pred = knn.predict(X_test)
    knn_acc = accuracy_score(y_test, knn_pred)
    print(f"Best accuracy: {knn_acc}\n")
    return knn_acc


def train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num):
    global results
    
    #Logistic Regression
    lr_acc = logistic_regression(X_train,y_train,X_test,y_test)
    results.append({'Dataset': dataset_name, 'Model': 'Logistic Regression', 'Accuracy': lr_acc})
    
    #Random Forest
    rf_acc = random_forest(X_train,y_train,X_test,y_test)
    results.append({'Dataset': dataset_name, 'Model': 'Random Forest', 'Accuracy': rf_acc})
    
    #SVM
    svm_acc = svm(X_train,y_train,X_test,y_test)
    results.append({'Dataset': dataset_name, 'Model': 'SVM', 'Accuracy': svm_acc})
    
    #K-Means
    kmeans_acc = k_means(X,X_test,y_test,clusters_max_num)
    results.append({'Dataset': dataset_name, 'Model': 'K-Means', 'Accuracy': kmeans_acc})
    
    #GMM
    gmm_acc = gmm_clustering(X,X_test,y_test,clusters_max_num)
    results.append({'Dataset': dataset_name, 'Model': 'GMM', 'Accuracy': gmm_acc})
    
    #KNN
    knn_acc = knn(X_train,y_train,X_test,y_test)
    results.append({'Dataset': dataset_name, 'Model': 'KNN', 'Accuracy': knn_acc})

# Training and evaluating models

In [28]:
results = []

**adult**

In [29]:
#adult 
dataset_name = 'adult'
adult = fetch_ucirepo(id=2)
X = adult.data.features
y = adult.data.targets

categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 
                       'relationship', 'race', 'sex', 'native-country']

numeric_columns = ['age', 'capital-gain', 'capital-loss', 'hours-per-week','fnlwgt','education-num']

clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.5674071041048214

RF training and evaluation
Best accuracy: 0.5766199201555943

SVM training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.5796908588391851

K-Means training and evaluation
Best accuracy: 0.10756979542519787

Gaussian Mixture Model training and evaluation
Best accuracy: 0.011882367361161295

KNN training and evaluation
Best accuracy: 0.5129491247824752



**balloons**

In [30]:
#baloons
dataset_name = 'balloons'
balloons = fetch_ucirepo(id=13) 

X = balloons.data.features 
y = balloons.data.targets 

categorical_columns = ['color', 'size', 'act', 'age']

numeric_columns = []
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.5

RF training and evaluation
Best accuracy: 1.0

SVM training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.5

K-Means training and evaluation
Best accuracy: 0.0

Gaussian Mixture Model training and evaluation
Best accuracy: 0.0

KNN training and evaluation
Best accuracy: 0.75



**breast_cancer**

In [31]:
#breast_cancer
dataset_name = 'breast_cancer'
breast_cancer = fetch_ucirepo(id=14) 

X = breast_cancer.data.features 
y = breast_cancer.data.targets 

categorical_columns = [
    'age',
    'menopause',
    'tumor-size',
    'inv-nodes',
    'node-caps',
    'breast',
    'breast-quad',
    'irradiat']

numeric_columns = ['deg-malig']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.6724137931034483

RF training and evaluation
Best accuracy: 0.6896551724137931

SVM training and evaluation
Best accuracy: 0.6551724137931034

K-Means training and evaluation
Best accuracy: 0.012596848152637959

Gaussian Mixture Model training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.012596848152637959

KNN training and evaluation
Best accuracy: 0.6724137931034483



**breast_cancer_wisconsin_original**

In [32]:
#breast_cancer_wisconsin_original
dataset_name = 'breast_cancer_wisconsin_original'
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 

X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets

categorical_columns = []

numeric_columns = ['Clump_thickness',
    'Uniformity_of_cell_size',
    'Uniformity_of_cell_shape',
    'Marginal_adhesion',
    'Single_epithelial_cell_size',
    'Bare_nuclei',
    'Bland_chromatin',
    'Normal_nucleoli',
    'Mitoses']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.9642857142857143

RF training and evaluation
Best accuracy: 0.9571428571428572

SVM training and evaluation
Best accuracy: 0.9642857142857143

K-Means training and evaluation
Best accuracy: 0.8865117575484694

Gaussian Mixture Model training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.756314248952803

KNN training and evaluation
Best accuracy: 0.9714285714285714



**breast_cancer_wisconsin_diagnostic**

In [33]:
#breast_cancer_wisconsin_diagnostic
dataset_name = 'breast_cancer_wisconsin_diagnostic'
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets

categorical_columns = []

numeric_columns = ['radius1',
    'texture1',
    'perimeter1',
    'area1',
    'smoothness1',
    'compactness1',
    'concavity1',
    'concave_points1',
    'symmetry1',
    'fractal_dimension1',
    'radius2',
    'texture2',
    'perimeter2',
    'area2',
    'smoothness2',
    'compactness2',
    'concavity2',
    'concave_points2',
    'symmetry2',
    'fractal_dimension2',
    'radius3',
    'texture3',
    'perimeter3',
    'area3',
    'smoothness3',
    'compactness3',
    'concavity3',
    'concave_points3',
    'symmetry3',
    'fractal_dimension3']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.9649122807017544

RF training and evaluation
Best accuracy: 0.9649122807017544

SVM training and evaluation
Best accuracy: 0.9736842105263158

K-Means training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.6447692016900989

Gaussian Mixture Model training and evaluation
Best accuracy: 0.773636711329108

KNN training and evaluation
Best accuracy: 0.9473684210526315



**breast_cancer_wisconsin_prognostic**

In [34]:
#breast_cancer_wisconsin_prognostic
dataset_name = 'breast_cancer_wisconsin_prognostic'
breast_cancer_wisconsin_prognostic = fetch_ucirepo(id=16) 
X = breast_cancer_wisconsin_prognostic.data.features 
y = breast_cancer_wisconsin_prognostic.data.targets
categorical_columns = []

numeric_columns = ['Time',
    'radius1',
    'texture1',
    'perimeter1',
    'area1',
    'smoothness1',
    'compactness1',
    'concavity1',
    'concave_points1',
    'symmetry1',
    'fractal_dimension1',
    'radius2',
    'texture2',
    'perimeter2',
    'area2',
    'smoothness2',
    'compactness2',
    'concavity2',
    'concave_points2',
    'symmetry2',
    'fractal_dimension2',
    'radius3',
    'texture3',
    'perimeter3',
    'smoothness3',
    'compactness3',
    'concavity3',
    'concave_points3',
    'symmetry3',
    'fractal_dimension3',
    'tumor_size',
    'lymph_node_status' ]
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.8

RF training and evaluation
Best accuracy: 0.85

SVM training and evaluation
Best accuracy: 0.8

K-Means training and evaluation
Best accuracy: 0.06426735218508997

Gaussian Mixture Model training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.06426735218508997

KNN training and evaluation
Best accuracy: 0.725



**chess_king_rook_vs_king_pawn**

In [35]:
#chess_king_rook_vs_king_pawn
dataset_name = 'chess_king_rook_vs_king_pawn'
chess_king_rook_vs_king_pawn = fetch_ucirepo(id=22) 
X = chess_king_rook_vs_king_pawn.data.features 
y = chess_king_rook_vs_king_pawn.data.targets 

#had to reindex due to unique indexing issue
if not X.index.is_unique:
        X = X.reset_index(drop=True)


categorical_columns = ['bkblk',
    'bknwy',
    'bkon8',
    'bkona',
    'bkspr',
    'bkxbq',
    'bkxcr',
    'bkxwp',
    'blxwp',
    'bxqsq',
    'cntxt',
    'dsopp',
    'dwipd',
    'hdchk',
    'katri',
    'mulch',
    'qxmsq',
    'r2ar8',
    'reskd',
    'reskr',
    'rimmx',
    'rkxwp',
    'rxmsq',
    'simpl',
    'skach',
    'skewr',
    'skrxp',
    'spcop',
    'stlmt',
    'thrsk',
    'wkcti',
    'wkna8',
    'wknck',
    'wkovl',
    'wkpos']

numeric_columns = []
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.9484375

RF training and evaluation
Best accuracy: 0.946875

SVM training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.9453125

K-Means training and evaluation
Best accuracy: -0.00015732120830402502

Gaussian Mixture Model training and evaluation
Best accuracy: 0.007181239202725722

KNN training and evaluation
Best accuracy: 0.946875



**congressional_voting_records**

In [36]:
#congressional_voting_records
dataset_name = 'congressional_voting_records'
congressional_voting_records = fetch_ucirepo(id=105)  
X = congressional_voting_records.data.features 
y = congressional_voting_records.data.targets 

categorical_columns = ['handicapped-infants',
    'water-project-cost-sharing',
    'adoption-of-the-budget-resolution',
    'physician-fee-freeze',
    'el-salvador-aid',
    'religious-groups-in-schools',
    'anti-satellite-test-ban',
    'aid-to-nicaraguan-contras',
    'mx-missile',
    'immigration',
    'synfuels-corporation-cutback',
    'education-spending',
    'superfund-right-to-sue',
    'crime',
    'duty-free-exports',
    'export-administration-act-south-africa']

numeric_columns = []
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.9655172413793104

RF training and evaluation
Best accuracy: 0.9540229885057471

SVM training and evaluation
Best accuracy: 0.9540229885057471

K-Means training and evaluation
Best accuracy: 0.4525592658372991

Gaussian Mixture Model training and evaluation
Best accuracy: 0.4525592658372991

KNN training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.9080459770114943



**connectionist_bench_sonar_mines_vs_rocks**

In [37]:
#connectionist_bench_sonar_mines_vs_rocks
dataset_name = 'connectionist_bench_sonar_mines_vs_rocks'
connectionist_bench_sonar_mines_vs_rocks = fetch_ucirepo(id=151) 
X = connectionist_bench_sonar_mines_vs_rocks.data.features 
y = connectionist_bench_sonar_mines_vs_rocks.data.targets 

categorical_columns = []

numeric_columns = ['Attribute1',
    'Attribute2',
    'Attribute3',
    'Attribute4',
    'Attribute5',
    'Attribute6',
    'Attribute7',
    'Attribute8',
    'Attribute9',
    'Attribute10',
    'Attribute11',
    'Attribute12',
    'Attribute13',
    'Attribute14',
    'Attribute15',
    'Attribute16',
    'Attribute17',
    'Attribute18',
    'Attribute19',
    'Attribute20',
    'Attribute21',
    'Attribute22',
    'Attribute23',
    'Attribute24',
    'Attribute25',
    'Attribute26',
    'Attribute27',
    'Attribute28',
    'Attribute29',
    'Attribute30',
    'Attribute31',
    'Attribute32',
    'Attribute33',
    'Attribute34',
    'Attribute35',
    'Attribute36',
    'Attribute37',
    'Attribute38',
    'Attribute39',
    'Attribute40',
    'Attribute41',
    'Attribute42',
    'Attribute43',
    'Attribute44',
    'Attribute45',
    'Attribute46',
    'Attribute47',
    'Attribute48',
    'Attribute49',
    'Attribute50',
    'Attribute51',
    'Attribute52',
    'Attribute53',
    'Attribute54',
    'Attribute55',
    'Attribute56',
    'Attribute57',
    'Attribute58',
    'Attribute59',
    'Attribute60']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.7857142857142857

RF training and evaluation
Best accuracy: 0.8571428571428571

SVM training and evaluation
Best accuracy: 0.9047619047619048

K-Means training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.03258426966292135

Gaussian Mixture Model training and evaluation
Best accuracy: -0.003036949552893538

KNN training and evaluation
Best accuracy: 0.8809523809523809



**connect_4**

In [38]:
#connect_4
dataset_name = 'connect_4'
connect_4 = fetch_ucirepo(id=26) 
X = connect_4.data.features 
y = connect_4.data.targets

categorical_columns = [    'a1', 'a2', 'a3', 'a4', 'a5', 'a6',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
    'c1', 'c2', 'c3', 'c4', 'c5', 'c6',
    'd1', 'd2', 'd3', 'd4', 'd5', 'd6',
    'e1', 'e2', 'e3', 'e4', 'e5', 'e6',
    'f1', 'f2', 'f3', 'f4', 'f5', 'f6',
    'g1', 'g2', 'g3', 'g4', 'g5', 'g6']

numeric_columns = []
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.7578448786264061

RF training and evaluation
Best accuracy: 0.7329040852575488

SVM training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.8442125518058022

K-Means training and evaluation
Best accuracy: 0.0026189347441208724

Gaussian Mixture Model training and evaluation
Best accuracy: 0.025256407350421298

KNN training and evaluation
Best accuracy: 0.7730905861456483



**credit_approval**

In [39]:
#credit_approval
dataset_name = 'credit_approval'
credit_approval = fetch_ucirepo(id=27) 
X = credit_approval.data.features 
y = credit_approval.data.targets 

categorical_columns = ['A13', 'A12', 'A10', 'A9', 'A7', 'A6', 'A5', 'A4', 'A1']

numeric_columns = ['A14', 'A8', 'A3', 'A2']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.8333333333333334

RF training and evaluation
Best accuracy: 0.8623188405797102

SVM training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.5869565217391305

K-Means training and evaluation
Best accuracy: 2.7218662929882256e-05

Gaussian Mixture Model training and evaluation
Best accuracy: 2.7218662929882256e-05

KNN training and evaluation
Best accuracy: 0.7391304347826086



**fertility**

In [40]:
#fertility
dataset_name = 'fertility'
fertility = fetch_ucirepo(id=244)  
X = fertility.data.features 
y = fertility.data.targets 

categorical_columns = ['season',
    'child_diseases',
    'accident',
    'surgical_intervention',
    'high_fevers',
    'alcohol',
    'smoking']

numeric_columns = ['age',
    'hrs_sitting']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.9

RF training and evaluation
Best accuracy: 0.9

SVM training and evaluation
Best accuracy: 0.9

K-Means training and evaluation
Best accuracy: 0.02145922746781116

Gaussian Mixture Model training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.07825994242456552

KNN training and evaluation
Best accuracy: 0.85



**haberman_s_survival**

In [41]:
#haberman_s_survival
dataset_name = 'haberman_s_survival'
haberman_s_survival = fetch_ucirepo(id=43) 
X = haberman_s_survival.data.features 
y = haberman_s_survival.data.targets 

categorical_columns = []

numeric_columns = ['age',
    'operation_year',
    'positive_auxillary_nodes']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.6935483870967742

RF training and evaluation
Best accuracy: 0.6935483870967742

SVM training and evaluation
Best accuracy: 0.6774193548387096

K-Means training and evaluation
Best accuracy: -0.014203859802808591

Gaussian Mixture Model training and evaluation
Best accuracy: 0.0617180688560148

KNN training and evaluation
Best accuracy: 0.7096774193548387



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


**mammographic_mass**

In [42]:
#mammographic_mass
dataset_name = 'mammographic_mass'
mammographic_mass = fetch_ucirepo(id=161) 
X = mammographic_mass.data.features 
y = mammographic_mass.data.targets 

categorical_columns = []

numeric_columns = ['BI-RADS',
    'Age',
    'Shape',
    'Margin',
    'Density']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.8497409326424871

RF training and evaluation
Best accuracy: 0.8393782383419689

SVM training and evaluation
Best accuracy: 0.8393782383419689

K-Means training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.38991309288976905

Gaussian Mixture Model training and evaluation
Best accuracy: 0.28124144629623055

KNN training and evaluation
Best accuracy: 0.8031088082901554



**ionosphere**

In [43]:
#ionosphere
dataset_name = 'ionosphere'
ionosphere = fetch_ucirepo(id=52)  
X = ionosphere.data.features 
y = ionosphere.data.targets 

categorical_columns = []

numeric_columns = ['Attribute1',
    'Attribute2',
    'Attribute3',
    'Attribute4',
    'Attribute5',
    'Attribute6',
    'Attribute7',
    'Attribute8',
    'Attribute9',
    'Attribute10',
    'Attribute11',
    'Attribute12',
    'Attribute13',
    'Attribute14',
    'Attribute15',
    'Attribute16',
    'Attribute17',
    'Attribute18',
    'Attribute19',
    'Attribute20',
    'Attribute21',
    'Attribute22',
    'Attribute23',
    'Attribute24',
    'Attribute25',
    'Attribute26',
    'Attribute27',
    'Attribute28',
    'Attribute29',
    'Attribute30',
    'Attribute31',
    'Attribute32',
    'Attribute33',
    'Attribute34']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.8591549295774648

RF training and evaluation
Best accuracy: 0.9436619718309859

SVM training and evaluation
Best accuracy: 0.9436619718309859

K-Means training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.1552982233502538

Gaussian Mixture Model training and evaluation
Best accuracy: 0.1552982233502538

KNN training and evaluation
Best accuracy: 0.8591549295774648



**iris**

In [44]:
#iris
dataset_name = 'iris'
iris = fetch_ucirepo(id=53) 
X = iris.data.features 
y = iris.data.targets 

categorical_columns = []

numeric_columns = [ 'sepal length',
    'sepal width',
    'petal length',
    'petal width']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.9666666666666667

RF training and evaluation
Best accuracy: 1.0

SVM training and evaluation
Best accuracy: 1.0

K-Means training and evaluation
Best accuracy: 0.5581490791566204

Gaussian Mixture Model training and evaluation
Best accuracy: 0.8104850421144351

KNN training and evaluation
Best accuracy: 1.0



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


**magic_gamma_telescope**

In [45]:
#magic_gamma_telescope
dataset_name = 'magic_gamma_telescope'
magic_gamma_telescope = fetch_ucirepo(id=159) 
X = magic_gamma_telescope.data.features 
y = magic_gamma_telescope.data.targets 

categorical_columns = []

numeric_columns = ['fLength',
    'fWidth',
    'fSize', 
    'fConc', 
    'fConc1', 
    'fAsym',
    'fM3Long', 
    'fM3Trans',
    'fAlpha',
    'fDist']
clusters_max_num = 4
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.7931125131440588

RF training and evaluation
Best accuracy: 0.869348054679285

SVM training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.870925341745531

K-Means training and evaluation
Best accuracy: 0.013716058289841156

Gaussian Mixture Model training and evaluation
Best accuracy: 0.14972177924414645

KNN training and evaluation
Best accuracy: 0.8346477392218717



**dermatology**

In [46]:
#dermatology
dataset_name = 'dermatology'
dermatology = fetch_ucirepo(id=33)  
X = dermatology.data.features 
y = dermatology.data.targets 

categorical_columns = ['family history']

numeric_columns = ['erythema', 'scaling', 'definite-borders', 'itching', 'koebner phenomenon',
    'polygonal papules', 'follicular papules', 'oral-mucosal involvement', 
    'knee elbow involvement', 'scalp involvement', 'melanin incontinence', 
    'eosinophils in the infiltrate', 'pnl infiltrate', 'fibrosis of the papillary dermis', 
    'exocytosis', 'acanthosis', 'hyperkeratosis', 'parakeratosis', 'clubbing of the rete ridges', 
    'elongation of the rete ridges', 'thinning of the suprapapillary epidermis', 
    'spongiform pustule', 'munro microabcess', 'focal hypergranulosis', 
    'disappearance of the granular layer', 'vacuolisation and damage of the basal layer', 
    'spongiosis', 'saw-tooth appearance of retes', 'follicular horn plug', 
    'perifollicular parakeratosis', 'inflammatory monoluclear infiltrate', 
    'band-like infiltrate', 'age']
clusters_max_num = 8
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.9864864864864865

RF training and evaluation
Best accuracy: 1.0

SVM training and evaluation
Best accuracy: 0.9864864864864865

K-Means training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.4755472634715879

Gaussian Mixture Model training and evaluation
Best accuracy: 0.8763616256967978

KNN training and evaluation
Best accuracy: 0.9864864864864865



**ecoli**

In [47]:
#ecoli
dataset_name = 'ecoli'
ecoli = fetch_ucirepo(id=39) 
X = ecoli.data.features 
y = ecoli.data.targets 

categorical_columns = ['lip', 'chg']

numeric_columns = ['mcg', 'gvh', 'aac', 'alm1', 'alm2']
clusters_max_num = 10
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.8970588235294118

RF training and evaluation
Best accuracy: 0.8823529411764706

SVM training and evaluation
Best accuracy: 0.8823529411764706

K-Means training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.47159210803967083

Gaussian Mixture Model training and evaluation
Best accuracy: 0.5084448215578848

KNN training and evaluation
Best accuracy: 0.8970588235294118



**glass_identification**

In [48]:
#glass_identification
dataset_name = 'glass_identification'
glass_identification = fetch_ucirepo(id=42) 
X = glass_identification.data.features 
y = glass_identification.data.targets 

categorical_columns = []

numeric_columns = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']
clusters_max_num = 8
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.627906976744186

RF training and evaluation
Best accuracy: 0.8604651162790697

SVM training and evaluation
Best accuracy: 0.7209302325581395

K-Means training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.2523140648128736

Gaussian Mixture Model training and evaluation
Best accuracy: 0.3331856446610545

KNN training and evaluation
Best accuracy: 0.6976744186046512



**letter_recognition**

In [49]:
#letter_recognition
dataset_name = 'letter_recognition'
letter_recognition = fetch_ucirepo(id=59) 
X = letter_recognition.data.features 
y = letter_recognition.data.targets 

categorical_columns = []

numeric_columns = ['x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar', 
    'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx']
clusters_max_num = 28
X, X_train, X_test, y_train, y_test = preprocess_dataset(X,categorical_columns,numeric_columns)
train_and_evaluate(X_train,y_train,X_test,y_test,dataset_name,clusters_max_num)

Logistic regression training and evaluation
Best accuracy: 0.735

RF training and evaluation
Best accuracy: 0.85925

SVM training and evaluation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best accuracy: 0.95025

K-Means training and evaluation
Best accuracy: 0.003920121358068513

Gaussian Mixture Model training and evaluation
Best accuracy: 0.1841657482280568

KNN training and evaluation
Best accuracy: 0.94225



# Results

In [50]:
df = pd.DataFrame(results)
average_accuracy = df.groupby('Model').agg({'Accuracy': 'mean'}).reset_index()
ranking = average_accuracy.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

ranked_data = []
for model in ranking['Model']:
    model_data = df[df['Model'] == model]
    highest_accuracy = model_data.loc[model_data['Accuracy'].idxmax()]
    lowest_accuracy = model_data.loc[model_data['Accuracy'].idxmin()]
    ranked_data.append({
        'Model': model,
        'Average Accuracy': average_accuracy.loc[average_accuracy['Model'] == model, 'Accuracy'].values[0],
        'Highest Accuracy': highest_accuracy['Accuracy'],
        'Highest Accuracy Dataset': highest_accuracy['Dataset'],
        'Lowest Accuracy': lowest_accuracy['Accuracy'],
        'Lowest Accuracy Dataset': lowest_accuracy['Dataset'],
        'Number of Datasets': model_data['Dataset'].nunique()
    })


ranked_df = pd.DataFrame(ranked_data)

print("Ranking Table")
print(ranked_df)

Ranking Table
                 Model  Average Accuracy  Highest Accuracy  \
0        Random Forest          0.868552          1.000000   
1                  SVM          0.832357          1.000000   
2                  KNN          0.828920          1.000000   
3  Logistic Regression          0.812788          0.986486   
4                  GMM          0.263789          0.876362   
5              K-Means          0.215764          0.886512   

           Highest Accuracy Dataset  Lowest Accuracy  \
0                          balloons         0.576620   
1                              iris         0.500000   
2                              iris         0.512949   
3                       dermatology         0.500000   
4                       dermatology        -0.003037   
5  breast_cancer_wisconsin_original        -0.014204   

                    Lowest Accuracy Dataset  Number of Datasets  
0                                     adult                  21  
1                         