In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from time import time
from sklearn.metrics.pairwise import pairwise_kernels

In [2]:
# Load the dataset
data = arff.loadarff(r"path\to\dataset\file\Autism-Child-Data.arff")
df = pd.DataFrame(data[0])
df.head()
# df.iloc[:, 0].str.decode('utf-8')
# df.dtypes

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',...,b'm',b'Others',b'no',b'no',b'Jordan',b'no',5.0,b'4-11 years',b'Parent',b'NO'
1,b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'1',b'0',b'0',...,b'm',b'Middle Eastern ',b'no',b'no',b'Jordan',b'no',5.0,b'4-11 years',b'Parent',b'NO'
2,b'1',b'1',b'0',b'0',b'0',b'1',b'1',b'1',b'0',b'0',...,b'm',b'?',b'no',b'no',b'Jordan',b'yes',5.0,b'4-11 years',b'?',b'NO'
3,b'0',b'1',b'0',b'0',b'1',b'1',b'0',b'0',b'0',b'1',...,b'f',b'?',b'yes',b'no',b'Jordan',b'no',4.0,b'4-11 years',b'?',b'NO'
4,b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',b'1',...,b'm',b'Others',b'yes',b'no',b'United States',b'no',10.0,b'4-11 years',b'Parent',b'YES'


In [3]:
# Identify columns that need type conversion (from bytes to string)
bytes_columns = [column for column, dtype in df.dtypes.items() if dtype == 'object']

# Convert byte strings to normal strings (needed for Python 3)
for column in bytes_columns:
    df[column] = df[column].str.decode('utf-8')

# Split dataset into inputs and target
X = df.drop('Class/ASD', axis=1)  
y = df['Class/ASD'].values  


In [4]:
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5.0,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,Middle Eastern,no,no,Jordan,no,5.0,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,?,no,no,Jordan,yes,5.0,4-11 years,?,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,?,yes,no,Jordan,no,4.0,4-11 years,?,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,United States,no,10.0,4-11 years,Parent,YES


In [5]:
df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before',
       'result', 'age_desc', 'relation', 'Class/ASD'],
      dtype='object')

In [6]:
df = df[['age','gender','ethnicity','contry_of_res','jundice','austim','relation','used_app_before','age_desc','A1_Score','A2_Score','A3_Score','A4_Score','A5_Score',
                       'A6_Score','A7_Score','A8_Score','A9_Score','A10_Score','result','Class/ASD']]

In [7]:
df.head()

Unnamed: 0,age,gender,ethnicity,contry_of_res,jundice,austim,relation,used_app_before,age_desc,A1_Score,...,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,result,Class/ASD
0,6.0,m,Others,Jordan,no,no,Parent,no,4-11 years,1,...,0,0,1,1,0,1,0,0,5.0,NO
1,6.0,m,Middle Eastern,Jordan,no,no,Parent,no,4-11 years,1,...,0,0,1,1,0,1,0,0,5.0,NO
2,6.0,m,?,Jordan,no,no,?,yes,4-11 years,1,...,0,0,0,1,1,1,0,0,5.0,NO
3,5.0,f,?,Jordan,yes,no,?,no,4-11 years,0,...,0,0,1,1,0,0,0,1,4.0,NO
4,5.0,m,Others,United States,yes,no,Parent,no,4-11 years,1,...,1,1,1,1,1,1,1,1,10.0,YES


In [8]:
type(df.iloc[0][1])

str

In [9]:
# Define the preprocessor with transformations for both numeric and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [14]:
from scipy.sparse import issparse

def normalized_poly_kernel(X, Y, c=1, d=2):
    # If X or Y is a sparse matrix, convert to a dense ndarray
    if issparse(X):
        X = X.toarray()
    if issparse(Y):
        Y = Y.toarray()

    # Compute the dot product between each pair of vectors in X and Y
    K = np.dot(X, Y.T)
    # Add the constant c to the dot products and raise it to the power d
    K = (K + c) ** d
    # Compute the sum of the dot products of vectors with themselves and add 1
    X_norm = np.dot(X, X.T).diagonal() 
    Y_norm = np.dot(Y, Y.T).diagonal() 
    # Normalize the kernel matrix
    K /= np.sqrt(np.outer(X_norm, Y_norm))
    
    return K

In [15]:
from scipy.spatial.distance import pdist, cdist
from scipy.spatial.distance import squareform
from scipy.sparse import issparse

def PUK_kernel(X1,X2, sigma=1.0, omega=1.0):
    # Compute the kernel matrix between two arrays using the Pearson VII function-based universal kernel.
    # Check if X1 or X2 are sparse matrices
    if issparse(X1):
        X1 = X1.toarray()
    if issparse(X2):
        X2 = X2.toarray()

    # Compute squared Euclidean distance between each row element pair of the two matrices
    if X1 is X2:
        kernel = squareform(pdist(X1, 'sqeuclidean'))
    else:
        kernel = cdist(X1, X2, 'sqeuclidean')

    kernel = (1 + (kernel * 4 * np.sqrt(2**(1.0/omega)-1)) / sigma**2) ** omega
    kernel = 1/kernel

    return kernel

In [19]:
import numpy as np
from scipy.spatial.distance import pdist, cdist
from scipy.spatial.distance import squareform

def my_laplacian_kernel(X, Y, sigma=1.0):
    if issparse(X):
        X = X.toarray()
    if issparse(Y):
        Y = Y.toarray()
    # Compute squared Euclidean distance between each row element pair of the two matrices
    if X is Y:
        kernel = squareform(pdist(X, 'cityblock'))
    else:
        kernel = cdist(X, Y, 'cityblock')

    res = np.exp(-kernel/sigma)
    return res

In [22]:
def Mahalanobis(X, Y, delta=1, m=20):
    if issparse(X):
        X = X.toarray()
    if issparse(Y):
        Y = Y.toarray()

    # Compute squared Euclidean distance between each row element pair of the two matrices
    if X is Y:
        kernel = squareform(pdist(X, 'mahalanobis'))
    else:
        kernel = cdist(X, Y, 'mahalanobis')
    return (-delta/m)*(kernel)**2

In [24]:
# Define SVM kernels
kernels = ['linear', 'poly', 'rbf', 'sigmoid', normalized_poly_kernel, PUK_kernel, my_laplacian_kernel]  # , 'Mahalanobis'

# Train models and evaluate using cross-validation

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
metrics = {'Kernel': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}
results = []
Confusion_matrix = {}

for kernel in kernels:
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    confusion_matrices = np.array([[0, 0], [0, 0]])
    # Create a pipeline that processes the data and then fits the model
    if callable(kernel):
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', SVC(kernel=kernel))])
    else:
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', SVC(kernel=kernel, gamma='scale'))])
    
    start_time = time()
    cv_metrics = {'Kernel': kernel, 'Correctly Predicted Instances': 0, 'Incorrectly Predicted Instances': 0, 'Error Rate (%)': 0}
    total_instances = 0
    
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Calculate metrics
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        confusion_matrices += confusion_matrix(y_test, y_pred)
        
        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        cv_metrics['Correctly Predicted Instances'] += np.trace(cm)
        cv_metrics['Incorrectly Predicted Instances'] += np.sum(cm) - np.trace(cm)
        total_instances += len(y_test)
        
    cv_metrics['Error Rate (%)'] = 100 * cv_metrics['Incorrectly Predicted Instances'] / total_instances
    cv_metrics['Overall Time Taken to Build the Model (in seconds)'] = time() - start_time
    results.append(cv_metrics)

    # Store the average of the metrics
    metrics['Kernel'].append(kernel)
    metrics['Accuracy'].append(np.mean(accuracy_scores))
    metrics['Precision'].append(np.mean(precision_scores))
    metrics['Recall'].append(np.mean(recall_scores))
    metrics['F1 Score'].append(np.mean(f1_scores))
    # metrics['Confusion Matrix'].append(confusion_matrices)
    Confusion_matrix[kernel] = confusion_matrices


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
# Display results
results_df = pd.DataFrame(results)

# Convert the metrics dictionary to a DataFrame for display
metrics_df = pd.DataFrame(metrics)

# Display styled DataFrames
print("Table 1: Performance Measures and Error Rate of Kernels for cross validation")
display(results_df[['Kernel', 'Correctly Predicted Instances', 'Incorrectly Predicted Instances', 'Error Rate (%)', 'Overall Time Taken to Build the Model (in seconds)']])

print("Table 2: Performance Measure Obtained Depending on Confusion Matrixes of Kernels:")
display(metrics_df[['Kernel', 'Accuracy', 'Precision', 'Recall', 'F1 Score']])

print("\nConfusion Matrices:")
for kernel, matrix in Confusion_matrix.items():
    print(f"Kernel: {kernel}")
    display(pd.DataFrame(matrix, columns=['Yes', 'No'], index=['Yes', 'No']))


Table 1: Performance Measures and Error Rate of Kernels for cross validation


Unnamed: 0,Kernel,Correctly Predicted Instances,Incorrectly Predicted Instances,Error Rate (%),Overall Time Taken to Build the Model (in seconds)
0,linear,292,0,0.0,0.625526
1,poly,290,2,0.684932,0.609484
2,rbf,284,8,2.739726,0.628633
3,sigmoid,285,7,2.39726,0.530823
4,<function normalized_poly_kernel at 0x000001F1...,291,1,0.342466,0.69928
5,<function PUK_kernel at 0x000001F1BE332D30>,282,10,3.424658,0.644888
6,<function my_laplacian_kernel at 0x000001F1BE4...,172,120,41.09589,0.587938


Table 2: Performance Measure Obtained Depending on Confusion Matrixes of Kernels:


Unnamed: 0,Kernel,Accuracy,Precision,Recall,F1 Score
0,linear,1.0,1.0,1.0,1.0
1,poly,0.993103,0.993549,0.993103,0.993095
2,rbf,0.972644,0.974721,0.972644,0.972581
3,sigmoid,0.976207,0.978061,0.976207,0.976135
4,<function normalized_poly_kernel at 0x000001F1...,0.996552,0.996782,0.996552,0.996552
5,<function PUK_kernel at 0x000001F1BE332D30>,0.965632,0.969606,0.965632,0.965355
6,<function my_laplacian_kernel at 0x000001F1BE4...,0.58908,0.669078,0.58908,0.490559



Confusion Matrices:
Kernel: linear


Unnamed: 0,Yes,No
Yes,151,0
No,0,141


Kernel: poly


Unnamed: 0,Yes,No
Yes,150,1
No,1,140


Kernel: rbf


Unnamed: 0,Yes,No
Yes,148,3
No,5,136


Kernel: sigmoid


Unnamed: 0,Yes,No
Yes,148,3
No,4,137


Kernel: <function normalized_poly_kernel at 0x000001F1BE3329D0>


Unnamed: 0,Yes,No
Yes,150,1
No,0,141


Kernel: <function PUK_kernel at 0x000001F1BE332D30>


Unnamed: 0,Yes,No
Yes,151,0
No,10,131


Kernel: <function my_laplacian_kernel at 0x000001F1BE440DC0>


Unnamed: 0,Yes,No
Yes,150,1
No,119,22
