<a href="https://colab.research.google.com/github/ikbalsingh/cs171/blob/main/KNN-SVM-RandomForest-IRIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import os
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

class IrisTester:
    """Grid-search and evaluate three classifiers on the Iris dataset.

    On construction the Iris data is loaded and split into training and
    test partitions.  Each of ``SVM``, ``randomForest`` and ``kNeighbors``
    then runs a 5-fold ``GridSearchCV`` on the training partition (scored
    on accuracy and macro F1, refit on accuracy) and prints the best
    parameters, the best cross-validated accuracy, and the accuracy,
    macro F1 and one-vs-rest macro ROC AUC measured on the test partition.

    Attributes:
        X_train, X_test: feature DataFrames with the columns
            'sepal length', 'sepal width', 'petal length', 'petal width'.
        y_train, y_test: 'species' Series holding the labels 0, 1 or 2.
    """

    # Cross-validation settings shared by every grid search in this class.
    _CV_FOLDS = 5
    _SCORING = ['accuracy', 'f1_macro']
    _REFIT_METRIC = 'accuracy'

    def __init__(self, random_state=None):
        """Load Iris and create the train/test split.

        Args:
            random_state: optional seed forwarded to the train/test split.
                The default ``None`` keeps the split unseeded, as before.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = self.initializeDataSet(
            random_state=random_state
        )

    @staticmethod
    def _int_grid(start, stop, num):
        """Return ``num`` evenly spaced values in [start, stop], truncated to int."""
        return [int(x) for x in np.linspace(start=start, stop=stop, num=num)]

    def initializeDataSet(self, test_size=0.2, random_state=None):
        """Load the Iris data and split it into training and test sets.

        Args:
            test_size: fraction of rows held out for testing (default 0.2,
                i.e. 80% training / 20% test).
            random_state: optional seed for a reproducible split.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        iris = datasets.load_iris()
        dataSet = pd.DataFrame({
            'sepal length': iris.data[:, 0],
            'sepal width': iris.data[:, 1],
            'petal length': iris.data[:, 2],
            'petal width': iris.data[:, 3],
            'species': iris.target
        })

        X = dataSet[['sepal length', 'sepal width', 'petal length', 'petal width']]  # Features
        y = dataSet['species']  # Target (labels: 0, 1, or 2)

        # Stratify on the label so every species appears in both partitions in
        # the same proportion.  Without this a random split can leave the small
        # test set unbalanced, and the multiclass ROC AUC computed by the
        # evaluation methods needs every class to be present in y_test.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        return X_train, X_test, y_train, y_test

    def _tune_and_report(self, title, model, parameters):
        """Grid-search ``model`` over ``parameters`` and print its scores.

        Args:
            title: heading printed before the results.
            model: unfitted estimator; it must support ``predict_proba``
                because ROC AUC is computed from class probabilities.
            parameters: hyperparameter grid passed to ``GridSearchCV``.
        """
        # Use grid search cross-validation to find the best hyperparameters
        clf = GridSearchCV(
            model,
            parameters,
            cv=self._CV_FOLDS,
            scoring=self._SCORING,
            refit=self._REFIT_METRIC,
        )
        clf.fit(self.X_train, self.y_train)

        print(title)
        # best_score_ is the mean cross-validated score of the refit metric
        print("Best fit params:", clf.best_params_, ", Best Score:", clf.best_score_)
        print("Best fit cv accuracy:", clf.best_score_)

        # Accuracy and macro F1 of the refit best estimator on the held-out set
        predictions = clf.predict(self.X_test)
        print("Test Accuracy", accuracy_score(y_true=self.y_test, y_pred=predictions))
        print("F1-Score", f1_score(y_true=self.y_test, y_pred=predictions, average='macro'))

        # One-vs-rest, macro-averaged ROC AUC on the test set
        roc_auc = roc_auc_score(
            self.y_test,
            clf.predict_proba(self.X_test),
            multi_class='ovr',
            average='macro',
        )
        print("ROC AUC:", roc_auc)
        print()

    def SVM(self):
        """Tune a support vector classifier and print its results."""
        # Hyperparameter grid for SVM
        parameters = {
            'C': self._int_grid(5, 10, 5),  # Test values for C
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel type
        }

        # probability=True enables predict_proba, which the ROC AUC needs
        model = svm.SVC(probability=True)
        self._tune_and_report("SVM RESULTS", model, parameters)

    def randomForest(self):
        """Tune a random forest classifier and print its results."""
        # Hyperparameter grid for Random Forest
        parameters = {
            'max_depth': self._int_grid(2, 4, 3),
            'criterion': ['entropy'],
            'n_estimators': self._int_grid(100, 150, 10),
        }

        # Fixed seed so the forest itself is built deterministically
        model = RandomForestClassifier(random_state=0)
        self._tune_and_report("RANDOM FOREST RESULTS", model, parameters)

    def kNeighbors(self):
        """Tune a k-nearest-neighbors classifier and print its results."""
        # Hyperparameter grid for k-nearest neighbors
        parameters = {
            'n_neighbors': self._int_grid(5, 10, 5),
            'weights': ['uniform', 'distance'],
            'p': [1, 2],
        }

        model = KNeighborsClassifier()
        self._tune_and_report("K-NEAREST NEIGHBORS RESULTS", model, parameters)


## Data analysis: show the description text bundled with the Iris dataset
data = datasets.load_iris()
print("Description of dataset", data["DESCR"], "", sep="\n")

## Testing block: run each classifier experiment on one shared train/test split
test = IrisTester()
for run_experiment in (test.SVM, test.randomForest, test.kNeighbors):
    run_experiment()



Description of dataset
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU