<a href="https://colab.research.google.com/github/jpedrocf/MLStudies/blob/main/AccuracyTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


# Imports and Dataset Load

In [35]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Download the Wine dataset (UCI repository id 109)
wine = fetch_ucirepo(id=109)

# Features are kept as returned by the loader; the targets are flattened
# into a 1-D array of labels in a single step.
X = wine.data.features
y = wine.data.targets.values.ravel()

# Dataset-level metadata first, then the per-variable description
print("Metadata = ", wine.metadata)
print(wine.variables)



Metadata =  {'uci_id': 109, 'name': 'Wine', 'repository_url': 'https://archive.ics.uci.edu/dataset/109/wine', 'data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv', 'abstract': 'Using chemical analysis to determine the origin of wines', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 178, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1992, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C5PC7J', 'creators': ['Stefan Aeberhard', 'M. Forina'], 'intro_paper': {'ID': 246, 'type': 'NATIVE', 'title': 'Comparative analysis of statistical pattern recognition methods in high dimensional settings', 'authors': 'S. Aeberhard, D. Coomans, O. Vel', 'venue': 'Pattern Recognition', 'year': 1994, 'journal': None, 'DOI': '10.1016/0031-3203(94)90145-7', 'U

# Splitting the data into training and testing sets (80% training, 20% testing)

In [18]:
# Stratified 80/20 split; the fixed random_state keeps the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    stratify=y,
    random_state=40,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 142
Testing set size: 36


# Accuracy Evaluation with KNN, Decision Tree, Naive Bayes and SVM (using default settings)

In [19]:
# Baseline comparison: fit four classifiers with their default hyperparameters
# on the training split and report plain accuracy on the held-out test split.

# Initialize the models with default parameters
knn = KNeighborsClassifier()
# Tree fitting is randomized, so the seed is fixed (same value as the
# train/test split) to make the reported accuracy reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=40)
naive_bayes = GaussianNB()
svm = SVC()

# Train each model on the training set
knn.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
naive_bayes.fit(X_train, y_train)
svm.fit(X_train, y_train)

# Predict on the test set
knn_pred = knn.predict(X_test)
dt_pred = decision_tree.predict(X_test)
nb_pred = naive_bayes.predict(X_test)
svm_pred = svm.predict(X_test)

# Calculate accuracy for each model
knn_accuracy = accuracy_score(y_test, knn_pred)
dt_accuracy = accuracy_score(y_test, dt_pred)
nb_accuracy = accuracy_score(y_test, nb_pred)
svm_accuracy = accuracy_score(y_test, svm_pred)

# Print the accuracy for each model
print(f"KNN Accuracy: {knn_accuracy:.4f}")
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")
print(f"SVM Accuracy: {svm_accuracy:.4f}")

KNN Accuracy: 0.6944
Decision Tree Accuracy: 0.8611
Naive Bayes Accuracy: 0.9722
SVM Accuracy: 0.6944


## Hyperparameter Tuning (KNN)



In [24]:
# Candidate hyperparameters: neighbourhood size, vote weighting and distance metric
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11, 13],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski', 'cosine', 'hamming', 'chebyshev'],
}

# Fresh, unfitted estimator handed to the search
knn = KNeighborsClassifier()

# Grid search scored by accuracy under 5-fold cross-validation,
# fitted on the training split only
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Report the winning configuration and its cross-validation score
print("Best parameters for KNN:", grid_search_knn.best_params_)
print("Best cross-validation score for KNN:", grid_search_knn.best_score_)

# Score the best estimator found by the search on the held-out test split
best_knn = grid_search_knn.best_estimator_
knn_pred = best_knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print(f"Best KNN model accuracy on test set: {knn_accuracy:.4f}")

Best parameters for KNN: {'metric': 'cosine', 'n_neighbors': 7, 'weights': 'distance'}
Best cross-validation score for KNN: 0.8172413793103448
Best KNN model accuracy on test set: 0.8333


In [27]:
# Rebuild KNN with explicitly written-out hyperparameters, fit it on the
# training split and measure accuracy on the test split.
knn = KNeighborsClassifier(
    n_neighbors=7,
    weights='distance',
    metric='cosine',
)
knn.fit(X_train, y_train)

knn_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
print(f"KNN Accuracy (after Tuning): {knn_accuracy:.4f}")

KNN Accuracy (after Tuning): 0.8333


## Hyperparameter Tuning (SVM)

In [31]:
# Candidate hyperparameters: regularization strength, kernel type and kernel coefficient
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

# Fresh, unfitted estimator handed to the search
svm = SVC()

# Grid search scored by accuracy under 5-fold cross-validation,
# fitted on the training split only
grid_search_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Report the winning configuration and its cross-validation score
print("Best parameters for SVM:", grid_search_svm.best_params_)
print("Best cross-validation score for SVM:", grid_search_svm.best_score_)

# Score the best estimator found by the search on the held-out test split
best_svm = grid_search_svm.best_estimator_
svm_pred = best_svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"Best SVM model accuracy on test set: {svm_accuracy:.4f}")

Best parameters for SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation score for SVM: 0.9716748768472907
Best SVM model accuracy on test set: 0.9167


In [37]:
# Rebuild the SVM with explicitly written-out hyperparameters and fit it on
# the training split.
svm = SVC(C=1, gamma='scale', kernel='linear')
svm.fit(X_train, y_train)

# Predict with the model fitted just above. Without this line the accuracy
# would be computed from whatever svm_pred an earlier cell left in the kernel,
# not from this estimator.
svm_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"SVM Accuracy (after Tuning): {svm_accuracy:.4f}")

SVM Accuracy (after Tuning): 0.9167


## Accuracy comparison after tuning

In [34]:
# Final side-by-side accuracies. Only the KNN and SVM values are reassigned by
# the tuning cells; the Decision Tree and Naive Bayes values are still the ones
# computed in the default-settings cell.
print(
    f"KNN Accuracy: {knn_accuracy:.4f}",
    f"Decision Tree Accuracy: {dt_accuracy:.4f}",
    f"Naive Bayes Accuracy: {nb_accuracy:.4f}",
    f"SVM Accuracy: {svm_accuracy:.4f}",
    sep="\n",
)

KNN Accuracy: 0.8333
Decision Tree Accuracy: 0.8611
Naive Bayes Accuracy: 0.9722
SVM Accuracy: 0.9167
