# This jupyter notebook is to implement all five optmized algorithms from two datasets and perform the comparison in terms of accuracy and time

## 1. Package installation and function defining

In [1]:
# basic packages
import numpy as np
import pandas as pd
import matplotlib
import time
import matplotlib.pyplot as plt
import warnings
from IPython.display import display
import IPython.display as ipd
import numpy

# sklearn packages
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, average_precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, validation_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.exceptions import ConvergenceWarning
from sklearn.neural_network import MLPClassifier



In [2]:
# plot setting fix
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)
matplotlib.rcParams.update({'font.size': 15})

# notification of completing the code

sound = []
def SoundNotification():
    global sound
    sr = 22050 # sample rate
    T = 90    # seconds
    t = numpy.linspace(0, T, int(T*sr), endpoint=False) # time variable
    x = 0.5*numpy.sin(2*numpy.pi*1000*t)              # pure sine wave at 440 Hz
    sound = ipd.Audio(x, rate=sr, autoplay=True) # load a NumPy array

    return sound

def done():    
    sound = SoundNotification()
    return sound

In [3]:
# cross validation test
def cvtest(train_X_set, train_y_set, selected_classifier, calculation, average='binary', fold=10, classifier_type = "DT_classifier"):
    data_size = len(train_X_set)
    train_accuracy, validation_accuracy = [], []
    num_sample = []
    grid_size = 700
    starting_size = 100
    for i in range(starting_size, data_size, grid_size):
        train_X, train_y = train_X_set[:i], train_y_set[: i]
        k_fold = StratifiedKFold(n_splits = fold, random_state = None, shuffle = False)
        training_list = []
        validation_list = []
        for train_index, test_index in k_fold.split(train_X, train_y):
            # get training and testin x, y data
            train_X_part, train_y_part, test_X_part, test_y_part = train_X.iloc[train_index], train_y.iloc[train_index], train_X.iloc[test_index], train_y.iloc[test_index]
            # use the classifier for training
            if classifier_type == "adaBoost" or classifier_type == "KNN" or classifier_type == "SVM" or classifier_type == "Neural Network":
                selected_classifier.fit(train_X_part.values, train_y_part.values.ravel()) # needed to be flattened
            else:
                selected_classifier.fit(train_X_part.values, train_y_part.values)
            # get the predicted results from training_y and testing_y
            train_y_prediction, test_y_prediction = selected_classifier.predict(train_X_part), selected_classifier.predict(test_X_part)
            # do different calculation of average
            training_list.append(calculation(train_y_part.values, train_y_prediction, average=average))
            validation_list.append(calculation(test_y_part.values, test_y_prediction, average=average))

        train_accuracy.append(np.mean(training_list))
        validation_accuracy.append(np.mean(validation_list))
        num_sample.append(i)
        
    return train_accuracy, validation_accuracy, num_sample

# plot the learning curve
def plot_lc(cv_study, title):
    num_sample = cv_study[2]
    new_x = [100 * x / num_sample[-1] for x in num_sample]
    plt.plot(new_x, cv_study[0], "r-x", linewidth=2.5, label="training")
    plt.plot(new_x, cv_study[1], "b-x", linewidth=2.5, label="validation")
    plt.title(title)
    plt.legend()
    plt.xlabel('Percentage of training samples (%)')
    plt.ylabel('Accuracy')
    plt.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)
    plt.savefig("phishing_" + title + ".png")

        
def plot_optimization(training, testing, parameter, title, variable):
    plt.plot(parameter, training, "r-x", linewidth = 2.5, label = "training")
    plt.plot(parameter, testing, "b-x", linewidth = 2.5, label = "validation")
    plt.title(title)
    plt.legend()
    plt.ylabel('Accuracy')
    plt.xlabel(variable)
    plt.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)
    plt.savefig("phising_" + title + ".png")


## 2. file loading and data pre-processing

In [4]:
# =================== diabetes =========================
# load file
diabetes_dataset = pd.read_csv("diabetes.csv")

# check if there's any NA for both files, if yes, then drop
diabetes_dataset.isnull().any().any()
diabetes_dataset.dropna()

# set the random seed as 56
np.random.seed(56)
print("diabetes information:", diabetes_dataset.info())

# remove unnecessary column
cleaned_diabetes_data = diabetes_dataset.drop(columns = ["PatientID"], inplace = False) # patientID is meaningless

# normalized the data
normalized = preprocessing.MinMaxScaler().fit_transform(cleaned_diabetes_data.values)
cleaned_diabetes_data = pd.DataFrame(normalized, columns = cleaned_diabetes_data.columns)
# print(cleaned_diabetes_data.shape)
cleaned_diabetes_data.head(n = 10)

# =================== phishing =========================
# load file
phishing_dataset = pd.read_csv("phishing.csv")

# check if there's any NA for both files, if yes, then drop
phishing_dataset.isnull().any().any()
phishing_dataset.dropna()

# set the random seed as 56
np.random.seed(56)
print("phishing information:", phishing_dataset.info())

# remove unnecessary column
cleaned_phishing_data = phishing_dataset.drop(columns = ["Index"], inplace = False) # patientID is meaningless

# normalized the data
normalized = preprocessing.MinMaxScaler().fit_transform(cleaned_phishing_data.values)
cleaned_phishing_data = pd.DataFrame(normalized, columns = cleaned_phishing_data.columns)
cleaned_phishing_data.head(n = 10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PatientID               15000 non-null  int64  
 1   Pregnancies             15000 non-null  int64  
 2   PlasmaGlucose           15000 non-null  int64  
 3   DiastolicBloodPressure  15000 non-null  int64  
 4   TricepsThickness        15000 non-null  int64  
 5   SerumInsulin            15000 non-null  int64  
 6   BMI                     15000 non-null  float64
 7   DiabetesPedigree        15000 non-null  float64
 8   Age                     15000 non-null  int64  
 9   Diabetic                15000 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 1.1 MB
diabetes information: None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11054 entries, 0 to 11053
Data columns (total 32 columns):
 #   Column               Non-Null Count  Dtype
---  ------          

Unnamed: 0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,1.0,1.0,1.0,1.0,1.0,0.0,0.5,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.5,0.0,1.0,1.0,1.0,0.0
1,1.0,0.5,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.5,0.0,0.0
2,1.0,0.5,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.5,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.5,0.0,1.0,1.0,1.0,1.0
4,0.0,0.5,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
5,1.0,0.5,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0
6,1.0,0.5,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.5,0.0,1.0,0.5,1.0,0.0
7,1.0,0.5,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.5,1.0,1.0
8,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.5,0.0,1.0,0.5,1.0,0.0
9,1.0,1.0,1.0,1.0,1.0,0.0,0.5,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


## 3. Selecting the training and testing data for diabetes dataset (80% - 20%)

In [5]:
### for diabetes ==============
diabetes_data_split = StratifiedShuffleSplit(n_splits = 1 , test_size = 0.2, random_state=56)

for train_index, test_index in diabetes_data_split.split(cleaned_diabetes_data, cleaned_diabetes_data["Diabetic"]):
    diabetes_training_set = cleaned_diabetes_data.loc[train_index]
    diabetes_testing_set = cleaned_diabetes_data.loc[test_index]
# trainng data    
diabetes_train_X = diabetes_training_set.copy(deep= True).drop("Diabetic", axis=1)
diabetes_train_y = diabetes_training_set.copy(deep= True)[["Diabetic"]]
# testing data
diabetes_test_X = diabetes_testing_set.copy(deep= True).drop("Diabetic", axis=1)
diabetes_test_y = diabetes_testing_set.copy(deep= True)[["Diabetic"]]
# get data
diabetes_train_X.info(), diabetes_train_y.info(), diabetes_test_X.info(), diabetes_test_y.info()

### for phishing ==============
phishing_data_split = StratifiedShuffleSplit(n_splits = 1 , test_size = 0.2, random_state=56)

for train_index, test_index in phishing_data_split.split(cleaned_phishing_data, cleaned_phishing_data["class"]):
    phishing_training_set = cleaned_phishing_data.loc[train_index]
    phishing_testing_set = cleaned_phishing_data.loc[test_index]
# trainng data    
phishing_train_X = phishing_training_set.copy(deep= True).drop("class", axis=1)
phishing_train_y = phishing_training_set.copy(deep= True)[["class"]]
# testing data
phishing_test_X = phishing_testing_set.copy(deep= True).drop("class", axis=1)
phishing_test_y = phishing_testing_set.copy(deep= True)[["class"]]
# get data
phishing_train_X.info(), phishing_train_y.info(), phishing_test_X.info(), phishing_test_y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 14204 to 9954
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Pregnancies             12000 non-null  float64
 1   PlasmaGlucose           12000 non-null  float64
 2   DiastolicBloodPressure  12000 non-null  float64
 3   TricepsThickness        12000 non-null  float64
 4   SerumInsulin            12000 non-null  float64
 5   BMI                     12000 non-null  float64
 6   DiabetesPedigree        12000 non-null  float64
 7   Age                     12000 non-null  float64
dtypes: float64(8)
memory usage: 843.8 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 14204 to 9954
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Diabetic  12000 non-null  float64
dtypes: float64(1)
memory usage: 187.5 KB
<class 'pandas.core.frame.DataFrame'>
Int64Inde

(None, None, None, None)

## 4. Algorithm implentment and optimization

### 4-1 Decision Tree

#### 4-1-1 diabetes

#### 4-1-2 phishing

#### 4-1-3 comparison

### 4-2 AdaBoost

#### 4-2-1 diabetes

#### 4-2-2 phishing

#### 4-3-3 comparison

### 4-3 KNN

#### 4-3-1 diabetes

#### 4-3-2 phishing

#### 4-3-3 comparison

### 4-4 SVM

#### 4-4-1 diabetes

#### 4-4-2 phishing

#### 4-4-3 comparison

### 4-5 Neural Network

#### 4-5-1 diabetes

#### 4-5-2 phishing

#### 4-5-3 comparison