In [None]:
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as pyplot
import io
import os, shutil
import pandas as pd
import csv
from scipy.stats import bootstrap
import sys
import seaborn as sns
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import RFE
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

In [None]:
def instance(dir):
    """Extract the time-domain features from a folder, one data file -> one instance.

    Every entry of ``dir`` whose name does not end in ``DS_Store`` is read with
    ``pd.read_csv(..., skiprows=4)`` and summarised with ``DataFrame.describe``.
    The first described column is skipped (presumably the time column -- confirm
    against the data files) and seven statistics are taken from each of the
    next six columns.

    input: dir: directory of one data folder
    return: DataFrame, columns: features, row: instance. The 42 columns are
        named ``<stat><k>`` (example "min2") for k = 1..6 with stat in
        min, max, mean, median, standard_dev, first_quar, third_quar.
        A folder without data files gives an empty frame with these columns.
    raises: ValueError if a file has fewer than seven numeric columns.
    """
    # describe() labels and the feature-name prefix used for each of them,
    # in the order the features are laid out for every series
    stat_keys = ["min", "max", "mean", "50%", "std", "25%", "75%"]
    stat_names = ["min", "max", "mean", "median", "standard_dev", "first_quar", "third_quar"]
    column = [name + str(i) for i in range(1, 7) for name in stat_names]

    # collect plain rows and build the frame once: DataFrame.append no longer
    # exists in pandas >= 2.0 and growing a frame row by row is quadratic
    rows = []
    for f in os.listdir(dir):
        if f.endswith('DS_Store'):
            continue
        data = pd.read_csv(os.path.join(dir, f), skiprows=4)
        summary_table = data.describe().transpose()
        # rows 1..6 by position (the index holds column names, so use iloc)
        series_stats = summary_table.iloc[1:7]
        if len(series_stats) != 6:
            raise ValueError("expected 7 numeric columns in " + os.path.join(dir, f))
        # row-major flatten: all statistics of series 1, then series 2, ...
        rows.append(series_stats[stat_keys].to_numpy().ravel().tolist())
    return pd.DataFrame(rows, columns=column)

def sum_instance(dir_data):
    """Call instance() for every folder in the data directory and stack the results.

    The directory listing and each visited folder path are printed.

    input: dir_data: path of data folder (entries ending in DS_Store are skipped)
    return: a DataFrame that contains the instances of all folders,
        columns: features, row: instance. The per-folder row index is kept
        as returned by instance(), so index values repeat across folders.
        An empty DataFrame is returned when there is no folder to read.
    """
    entries = os.listdir(dir_data)
    print(entries)
    # gather the per-folder frames and concatenate once instead of
    # re-copying the accumulated frame on every iteration
    frames = []
    for f in entries:
        if f.endswith('DS_Store'):
            continue
        folder = os.path.join(dir_data, f)
        print(folder)
        frames.append(instance(folder))
    if not frames:
        # pd.concat refuses an empty list; keep the old empty-frame result
        return pd.DataFrame()
    return pd.concat(frames)

def sum_instancef(dir_data,piec):
    """Call instance_breakf() for every folder in the data directory and stack the results.

    input: dir_data: path of data folder (entries ending in DS_Store are skipped)
    input: piec: number of pieces each time series is broken into
    return: a DataFrame that contains the instances of all folders,
        columns: true label + features, row: instance. The per-folder row
        index is kept as returned by instance_breakf(), so index values
        repeat across folders. An empty DataFrame is returned when there is
        no folder to read.
    """
    # gather the per-folder frames and concatenate once instead of
    # re-copying the accumulated frame on every iteration
    frames = [
        instance_breakf(os.path.join(dir_data, f), piec)
        for f in os.listdir(dir_data)
        if not f.endswith('DS_Store')
    ]
    if not frames:
        # pd.concat refuses an empty list; keep the old empty-frame result
        return pd.DataFrame()
    return pd.concat(frames)

def instance_breakf(dir,piec):
    """Break each time series into pieces and extract features, one data file -> one instance.

    Breaking the series produces more FEATURES, not more instances: the
    features of all pieces of a file are placed side by side in a single row.
    Every entry of ``dir`` whose name does not end in ``DS_Store`` is read
    with ``pd.read_csv(..., skiprows=4)`` and its rows are split into ``piec``
    (approximately) equal consecutive pieces.

    input: dir: directory of one data folder
    input: piec: number of pieces. Example: piec=2 breaks each series into two
        equal length series.
    return: DataFrame, columns: true label + features, row: instance.
        "true_label" holds ``dir`` itself; the piec*6*7 feature columns are
        named ``<stat><i>`` for i = 1..piec*6, where indices 1..6 belong to
        the first piece, 7..12 to the second piece, and so on.
        A folder without data files gives an empty frame with these columns.
    raises: ValueError if a file has fewer than seven numeric columns.
    """
    # describe() labels and the feature-name prefix used for each of them
    stat_keys = ["min", "max", "mean", "50%", "std", "25%", "75%"]
    stat_names = ["min", "max", "mean", "median", "standard_dev", "first_quar", "third_quar"]
    column = ["true_label"] + [
        name + str(i) for i in range(1, piec * 6 + 1) for name in stat_names
    ]

    # collect plain rows and build the frame once: DataFrame.append no longer
    # exists in pandas >= 2.0
    rows = []
    for f in os.listdir(dir):
        if f.endswith('DS_Store'):
            continue
        data = pd.read_csv(os.path.join(dir, f), skiprows=4)
        temp = [dir]
        # split row positions rather than the frame itself; this yields the
        # same pieces as np.array_split(data, piec)
        for positions in np.array_split(np.arange(len(data)), piec):
            summary_table = data.iloc[positions].describe().transpose()
            # rows 1..6 by position; row 0 (first described column) is skipped
            series_stats = summary_table.iloc[1:7]
            if len(series_stats) != 6:
                raise ValueError("expected 7 numeric columns in " + os.path.join(dir, f))
            temp.extend(series_stats[stat_keys].to_numpy().ravel().tolist())
        rows.append(temp)
    return pd.DataFrame(rows, columns=column)

def instance_break(dir,piec):
    """Break each time series into pieces and extract features, one piece -> one instance.

    Breaking the series produces more INSTANCES: every piece of every file
    becomes its own row. Every entry of ``dir`` whose name does not end in
    ``DS_Store`` is read with ``pd.read_csv(..., skiprows=4)`` and its rows
    are split into ``piec`` (approximately) equal consecutive pieces.

    input: dir: directory of one data folder
    input: piec: number of pieces. Example: piec=2 breaks each series into two
        equal length series.
    return: DataFrame, columns: features, row: instance. The 42 columns are
        named ``<stat><k>`` (example "min2") for k = 1..6; rows of the same
        file are consecutive, in piece order.
        A folder without data files gives an empty frame with these columns.
    raises: ValueError if a file has fewer than seven numeric columns.
    """
    # describe() labels and the feature-name prefix used for each of them
    stat_keys = ["min", "max", "mean", "50%", "std", "25%", "75%"]
    stat_names = ["min", "max", "mean", "median", "standard_dev", "first_quar", "third_quar"]
    column = [name + str(i) for i in range(1, 7) for name in stat_names]

    # collect plain rows and build the frame once: DataFrame.append no longer
    # exists in pandas >= 2.0
    rows = []
    for f in os.listdir(dir):
        if f.endswith('DS_Store'):
            continue
        data = pd.read_csv(os.path.join(dir, f), skiprows=4)
        # split row positions rather than the frame itself; this yields the
        # same pieces as np.array_split(data, piec)
        for positions in np.array_split(np.arange(len(data)), piec):
            summary_table = data.iloc[positions].describe().transpose()
            # rows 1..6 by position; row 0 (first described column) is skipped
            series_stats = summary_table.iloc[1:7]
            if len(series_stats) != 6:
                raise ValueError("expected 7 numeric columns in " + os.path.join(dir, f))
            rows.append(series_stats[stat_keys].to_numpy().ravel().tolist())
    return pd.DataFrame(rows, columns=column)


def fea_scaplot(func,p,data_dir="../data/AReM"):
    """Summarize min, mean, max of series 1, 2, 6 for bending and for other activities.

    input: func: function used to extract features, instance or instance_break
    input: p: parameter piec of instance_break(); ignored when func is instance
    input: data_dir: folder that contains one sub-folder per activity
    output: bending: a DataFrame that contains min, mean, max of series 1, 2, 6
        for the folders bending1 and bending2
    output: other: the same columns for cycling, lying, sitting, standing, walking
    Both frames have the columns min1, min2, min6, mean1, ..., max6 and a
    fresh 0..n-1 row index.
    raises: ValueError if func is neither instance nor instance_break.
    """
    # instance takes only the folder; instance_break also needs the piece count
    if func==instance:
        extra_args=()
    elif func==instance_break:
        extra_args=(p,)
    else:
        raise ValueError("func must be instance or instance_break")
    wanted=[stat+series for stat in ["min","mean","max"] for series in ["1","2","6"]]

    def collect(activities):
        # one frame per activity folder, restricted to the wanted columns,
        # stacked row-wise with a new index
        frames=[func(data_dir+"/"+k,*extra_args)[wanted] for k in activities]
        return pd.concat(frames,axis=0,ignore_index=True)

    bending=collect(["bending1","bending2"])
    other=collect(["cycling","lying","sitting","standing","walking"])
    return bending,other

def model_selec(piec,scoring="precision",data_dir="../data/AReM"):
    """Find the value of p for a given l with recursive feature elimination (binary problem).

    Each time series is broken into l = piec pieces, the features are
    extracted with sum_instancef() and a default LogisticRegression is wrapped
    in RFECV with 5-fold stratified cross-validation.

    input: piec: l, the number of pieces each series is broken into
    input: scoring: scorer passed to RFECV. NOTE: the default is "precision",
        so the returned score is a precision, not an accuracy.
    input: data_dir: folder that contains one sub-folder per activity
    output: rfecv.n_features_: best value of p (number of selected features)
    output: max(rfecv.cv_results_["mean_test_score"]): best mean cross-validated score
    output: rfecv: the fitted RFECV object
    """
    temp=sum_instancef(data_dir,piec)
    data_train=temp.iloc[:,1:]
    # positive class = bending folders. Compare the folder NAME rather than the
    # whole path string so the label does not depend on the path separator
    # produced by os.path.join or on the data root.
    data_label=[
        1 if os.path.basename(os.path.normpath(folder)) in ("bending1","bending2") else 0
        for folder in temp.iloc[:,0]
    ]
    rfecv = RFECV(
        estimator=LogisticRegression(),
        step=1,
        cv=StratifiedKFold(5),
        scoring=scoring,
        # Minimum number of features to consider
        min_features_to_select=1,
        n_jobs=-1,
    )
    rfecv.fit(data_train,data_label)

    return rfecv.n_features_,max(rfecv.cv_results_["mean_test_score"]),rfecv
def model_selecL1(piec,data_dir="../data/AReM",random_state=42):
    """Find the value of p for a given l with an L1-penalized logistic regression (binary problem).

    Each time series is broken into l = piec pieces, the features are
    extracted with sum_instancef() and LogisticRegression(penalty="l1",
    solver="saga") is wrapped in RFECV with 5-fold stratified
    cross-validation scored by accuracy.

    input: piec: l, the number of pieces each series is broken into
    input: data_dir: folder that contains one sub-folder per activity
    input: random_state: seed of the saga solver, which shuffles the data;
        fixed by default so repeated runs give the same selection
    output: rfecv.n_features_: best value of p (number of selected features)
    output: max(rfecv.cv_results_["mean_test_score"]): best mean cross-validated accuracy
    output: rfecv: the fitted RFECV object
    """
    temp=sum_instancef(data_dir,piec)
    data_train=temp.iloc[:,1:]
    # positive class = bending folders. Compare the folder NAME rather than the
    # whole path string so the label does not depend on the path separator
    # produced by os.path.join or on the data root.
    data_label=[
        1 if os.path.basename(os.path.normpath(folder)) in ("bending1","bending2") else 0
        for folder in temp.iloc[:,0]
    ]
    rfecv = RFECV(
        estimator=LogisticRegression(penalty="l1",solver="saga",random_state=random_state),
        step=1,
        cv=StratifiedKFold(5),
        scoring="accuracy",
        # Minimum number of features to consider
        min_features_to_select=1,
        n_jobs=-1,
    )
    rfecv.fit(data_train,data_label)
    return rfecv.n_features_,max(rfecv.cv_results_["mean_test_score"]),rfecv

def model_selecL1_multi(piec,data_dir="../data/AReM",random_state=42):
    """Find the value of p for a given l with an L1-penalized multinomial logistic regression.

    Each time series is broken into l = piec pieces, the features are
    extracted with sum_instancef() and the activity folders are encoded as
    integer classes. LogisticRegression(penalty="l1", solver="saga") is
    wrapped in RFECV with 5-fold stratified cross-validation scored by accuracy.

    input: piec: l, the number of pieces each series is broken into
    input: data_dir: folder that contains one sub-folder per activity
    input: random_state: seed of the saga solver, which shuffles the data;
        fixed by default so repeated runs give the same selection
    output: rfecv.n_features_: best value of p (number of selected features)
    output: max(rfecv.cv_results_["mean_test_score"]): best mean cross-validated accuracy
    output: rfecv: the fitted RFECV object
    """
    temp=sum_instancef(data_dir,piec)
    data_train=temp.iloc[:,1:]
    # encode the folder names; LabelEncoder sorts its classes, so the codes are
    # the same as when encoding the full paths under a common root
    le = preprocessing.LabelEncoder()
    data_label=le.fit_transform(
        [os.path.basename(os.path.normpath(folder)) for folder in temp.iloc[:,0]]
    )
    # multi_class is not passed: it is deprecated, and with saga the default
    # already fits a multinomial model when there are more than two classes
    clf = LogisticRegression(penalty="l1",solver="saga",random_state=random_state)
    rfecv = RFECV(
        estimator=clf,
        step=1,
        cv=StratifiedKFold(5),
        scoring="accuracy",
        min_features_to_select=1,
        n_jobs=-1,
    )
    rfecv.fit(data_train,data_label)
    return rfecv.n_features_,max(rfecv.cv_results_["mean_test_score"]),rfecv

def apply_pca(X, n_components):
    """Fit a PCA with n_components principal components on X and return X projected onto them.

    The PCA is fitted on X itself every time, so projections of two different
    data sets obtained through this function do not share the same components.
    """
    return PCA(n_components=n_components).fit_transform(X)
def model_naive_pca():
    """Search the (l, p) pair for a Gaussian Naive Bayes classifier based on PCA features.

    For l = 1..20 the series are broken into l pieces; for p = 1..19 the
    training features are projected on p principal components, a GaussianNB
    is fitted and scored on the projected test set.

    NOTE(review): the pair is selected by accuracy on ../data/data_testing,
    not by cross-validation on the training data, so best_accuracy is an
    optimistic estimate.

    output: best_l: number of pieces of the best pair (None if no pair scored above 0)
    output: best_p: number of principal components of the best pair (None likewise)
    output: best_accuracy: test accuracy of the best pair
    output: y_pred_best: test predictions of the best pair (None likewise)
    """
    best_accuracy=0
    # defined up front so the return cannot hit unbound names
    best_l=best_p=y_pred_best=None
    estimator=GaussianNB()
    for l in range(1, 21):
        temp=sum_instancef("../data/AReM",l)
        data_train=temp.iloc[:,1:]
        # encode folder names so train and test share ONE encoder even though
        # their root folders differ
        le = preprocessing.LabelEncoder()
        data_label=le.fit_transform(
            [os.path.basename(os.path.normpath(folder)) for folder in temp.iloc[:,0]]
        )

        #build test data and label
        temp=sum_instancef("../data/data_testing",l)
        data_test=temp.iloc[:,1:]
        data_label_test=le.transform(
            [os.path.basename(os.path.normpath(folder)) for folder in temp.iloc[:,0]]
        )
        for p in range(1, 20):
            # fit the components on the training data only and reuse them for
            # the test data; fitting a second PCA on the test set would put
            # train and test in unrelated coordinate systems
            pca=PCA(n_components=p,random_state=0)
            data_train_pca=pca.fit_transform(data_train)
            data_test_pca=pca.transform(data_test)
            estimator.fit(data_train_pca,data_label)
            y_pred=estimator.predict(data_test_pca)
            accuracy=accuracy_score(data_label_test, y_pred)
            # strict comparison: the first pair reaching the best score is kept
            if accuracy> best_accuracy:
                best_accuracy = accuracy
                best_l = l
                best_p = p
                y_pred_best=y_pred

    return best_l,best_p,best_accuracy,y_pred_best

(c)ii new dataset

In [None]:
#build each split once and reuse it: every sum_instance call re-reads all data files
data_train=sum_instance("../data/AReM")
print("shape",np.shape(data_train))
data_test=sum_instance("../data/data_testing")
print("shape",np.shape(data_test))
#new dataset = training instances followed by test instances
instances=pd.concat([data_train,data_test])

            

In [None]:
#new dataset: training and test instances stacked, one row per data file, one column per feature
instances

(c)iii

In [None]:
#standard deviation of every feature column (population form, ddof=0, which is what np.std uses)
instances.std(axis=0,ddof=0)

In [None]:
#90% bootstrap confidence interval for the standard deviation of each feature
#rows are resampled (axis=0); the seed makes the interval reproducible
res=bootstrap((instances.to_numpy(),),np.std,axis=0,confidence_level=0.9,random_state=42)
print("shape of confidence interval",np.shape(res.confidence_interval))
#label the bounds with the feature names so the table can be read on its own
std_ci=pd.DataFrame(
    {"low":res.confidence_interval.low,"high":res.confidence_interval.high},
    index=instances.columns,
)
std_ci

(c)iv

choose min, mean, and max

(c) v

In [None]:

#plot features: min, mean and max of series 1, 2 and 6, bending (red) against other activities (blue)
#unpack a single call; calling fea_scaplot once per output re-reads every data file
bending,other=fea_scaplot(instance,2)
for i in ["min","mean","max"]:
    pyplot.figure()
    for j in ["1","2","6"]:
        pyplot.scatter(x=bending.index,y=bending[i+j],color='r')
        pyplot.scatter(x=other.index,y=other[i+j],color='b')
    pyplot.title(i)
    pyplot.xlabel("instance index")
    pyplot.ylabel(i)
    #the first two scatter artists are bending and other, in that order
    pyplot.legend(["bending","other"])
    pyplot.show()
   


Part 2 Binary and Multiclass Classification
(a)i

break each time series and plot scatter plots

Explanation: Compared to the result from 3(c)v, breaking each time series into two pieces produces samples with higher variance.

In [None]:
#plot features of the unbroken series (left) next to the series broken in two (right)
#unpack a single call per extractor; each fea_scaplot call re-reads every data file
bending,other=fea_scaplot(instance,2)
bending_break,other_break=fea_scaplot(instance_break,2)
for i in ["min","mean","max"]:
    pyplot.subplot(1,2,1)
    for j in ["1","2","6"]:
        pyplot.scatter(x=bending.index,y=bending[i+j],color='r')
        pyplot.scatter(x=other.index,y=other[i+j],color='b')
    pyplot.title(i+"nonbreak")
    pyplot.xlabel("instance index")
    pyplot.subplot(1,2,2)
    for j in ["1","2","6"]:
        pyplot.scatter(x=bending_break.index,y=bending_break[i+j],color='r')
        pyplot.scatter(x=other_break.index,y=other_break[i+j],color='b')
    pyplot.title(i+"break")
    pyplot.xlabel("instance index")
    #the first two scatter artists of the panel are bending and other, in that order
    pyplot.legend(["bending","other"])
    pyplot.show()


Part 2 Binary and Multiclass Classification
(a)ii

The right way to perform cross-validation:

1. Break each time series in the training set into l pieces.

2. Divide the training set into k folds.

3. Fit the model on k-1 folds.

4. Compute the p-values of the parameters.

5. Eliminate features.

6. Test the model on the held-out fold.

7. Go back to step 3 with the next held-out fold.

8. Average the test error over the k folds.



In [None]:
#scan l = 1..20 and record, for each l, the selected number of features p
#and the best mean cross-validated score returned by model_selec
accuracy=[]
p=[]
for i in range(1,21):
    n_selected,best_score=model_selec(i)[:2]
    p.append(n_selected)
    accuracy.append(best_score)

In [None]:
#plot the best cross-validated score and the selected number of features against l
#model_selec scores with precision by default, so the curve is a precision, not an accuracy
l_values=range(1,len(accuracy)+1)
pyplot.subplot(2,1,1)
#plot against l itself; plotting the bare list would start the x axis at 0
pyplot.plot(l_values,accuracy)
pyplot.title("cross-validated precision")
pyplot.subplot(2,1,2)
#p is the number of features kept by RFECV, not a p-value
pyplot.title("number of features")
pyplot.plot(l_values,p)
pyplot.xlabel("l")
pyplot.tight_layout()
print("best cross-validated precision",max(accuracy))
print("best l",l_values[int(np.argmax(accuracy))])
print(p)
print(accuracy)


The best value of l is 18 and the corresponding p is 1,
i.e. (l, p) = (18, 1).

Part 2 Binary and Multiclass Classification
(a)iii

In [None]:
#build train data and label
temp=sum_instancef("../data/AReM",18)
data_train=temp.iloc[:,1:]
#label 1 for the bending folders, 0 otherwise; compare the folder name instead of the
#whole path string so the label does not depend on the path separator or the data root
data_label=[
    1 if os.path.basename(os.path.normpath(folder)) in ("bending1","bending2") else 0
    for folder in temp.iloc[:,0]
]
#model with best value of l and p
model=model_selec(18)[2]

In [None]:
#prediction on training data; labels are 0/1, so the summed absolute differences count the errors
y_pred=model.predict(data_train)
n_wrong=np.abs(y_pred-data_label).sum()
accuracy=1-n_wrong/len(y_pred)

#features kept by the recursive feature elimination
select=data_train.columns[model.support_]
#refit a statsmodels logit on the selected features (plus intercept) to get coefficient p-values
X_train_with_constant = sm.add_constant(data_train[select])
logit_model = sm.Logit(data_label, X_train_with_constant)
logit_results = logit_model.fit()
p_values = logit_results.pvalues
#confusion matrix of the training predictions, drawn as a heatmap
cm = confusion_matrix(data_label, y_pred)
sns.heatmap(cm, annot=True, cmap='Blues')
#ROC curve and its area from the predicted probability of the positive class (column 1)
y_train_pred_proba = model.predict_proba(data_train)[:, 1]
fpr, tpr, thresholds = roc_curve(data_label, y_train_pred_proba)
auc_score = auc(fpr, tpr)


In [None]:
#report the training metrics computed in the previous cell
report=[
    ("accuracy train",accuracy),
    ("selected features:\n",select),
    ("p value:\n",p_values),
    ("parameter:\n",model.estimator_.coef_),
]
for caption,value in report:
    print(caption,value)
print("Confusion Matrix:")
print(cm)
#ROC curve on the current axes
roc_ax=pyplot.gca()
roc_ax.plot(fpr, tpr)
roc_ax.set_title("ROC curve")
# Print the AUC score
print("AUC:", auc_score)

Part 2 Binary and Multiclass Classification
(a)iv 

Test the classifier on the test set

In [None]:
#build test data and label
temp=sum_instancef("../data/data_testing",18)
data_test=temp.iloc[:,1:]
#label 1 for the bending folders, 0 otherwise; compare the folder name instead of the
#whole path string so the label does not depend on the path separator or the data root
data_label_test=[
    1 if os.path.basename(os.path.normpath(folder)) in ("bending1","bending2") else 0
    for folder in temp.iloc[:,0]
]
#prediction on test data; labels are 0/1, so the summed absolute differences count the errors
y_pred_test=model.predict(data_test)
accuracy_test=1-sum(abs(y_pred_test-data_label_test))/len(y_pred_test)
print("accuracy test",accuracy_test)

Compared to the accuracy on the training data (1), the accuracy on the test data is lower (0.9473684210526316).

Part 2 Binary and Multiclass Classification
(a)v

Yes, most of the positive-class samples are at the beginning of the data; the rest of the data belongs to the negative class.

Part 2 Binary and Multiclass Classification
(a)vi

Yes, the classes are imbalanced: there are 9 positive (bending) samples and 60 negative samples.

In [None]:
#row positions of the majority (label 0) and minority (label 1) class
label_array=np.asarray(data_label)
ind_major=np.flatnonzero(label_array==0)
ind_min=np.flatnonzero(label_array==1)

majority_class=data_train.iloc[ind_major]
minority_class=data_train.iloc[ind_min]
# Upsample the minority class (sampling with replacement) to match the number of majority class samples
minority_upsampled=resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42,
)
#balanced set: all majority rows first, then the upsampled minority rows
data_train_balanced=np.vstack((majority_class, minority_upsampled))
data_label_balanced=np.concatenate(
    (np.zeros(len(majority_class)), np.ones(len(minority_upsampled)))
)
print(np.shape(data_train_balanced))
#NOTE: this refits the existing `model` object in place on the balanced data
model.fit(data_train_balanced,data_label_balanced)

In [None]:
#training accuracy on the balanced set; labels are 0/1, so the summed absolute differences count the errors
y_pred=model.predict(data_train_balanced)
n_wrong=np.abs(y_pred-data_label_balanced).sum()
accuracy=1-n_wrong/len(data_label_balanced)
print("accuracy train",accuracy)
#confusion matrix
cm = confusion_matrix(data_label_balanced, y_pred)
#ROC curve and its area from the predicted probability of the positive class (column 1)
y_train_pred_proba = model.predict_proba(data_train_balanced)[:, 1]
fpr, tpr, thresholds = roc_curve(data_label_balanced, y_train_pred_proba)
auc_score = auc(fpr, tpr)
print("confusion mattrix\n",cm)
#figure 1: confusion matrix heatmap
pyplot.figure(1)
sns.heatmap(cm, annot=True, cmap='Blues')
#figure 2: ROC curve
pyplot.figure(2)
pyplot.title("ROC curve")
pyplot.plot(fpr, tpr)
# Print the AUC score
print("AUC:", auc_score)

Part 2 Binary and Multiclass Classification
(b)

In [None]:
#scan l = 1..20 with the L1-penalized model and record, for each l,
#the selected number of features p and the best mean cross-validated score
accuracy=[]
p=[]
for i in range(1,21):
    n_selected,best_score=model_selecL1(i)[:2]
    p.append(n_selected)
    accuracy.append(best_score)

In [None]:
#plot the best cross-validated accuracy and the selected number of features against l
#model_selecL1 scores with accuracy, so the reported value is an accuracy, not a precision
l_values=range(1,len(accuracy)+1)
pyplot.subplot(2,1,1)
#plot against l itself; plotting the bare list would start the x axis at 0
pyplot.plot(l_values,accuracy)
pyplot.title("accuracy")
pyplot.subplot(2,1,2)
pyplot.title("number of features")
pyplot.plot(l_values,p)
pyplot.xlabel("l")
pyplot.tight_layout()
print("best cross-validated accuracy",max(accuracy))
print("best l",l_values[int(np.argmax(accuracy))])
print(p)
print(accuracy)

Compared with variable selection using p-values, the l1-penalized model performs better: its accuracy (0.9714) is higher than that of variable selection using p-values (0.93). The l1 penalty is also easier to implement.

Part 2 Binary and Multiclass Classification
(c)i

In [None]:
#scan l = 1..20 with the multiclass L1-penalized model and record, for each l,
#the selected number of features p and the best mean cross-validated score
accuracy=[]
p=[]
for i in range(1,21):
    n_selected,best_score=model_selecL1_multi(i)[:2]
    p.append(n_selected)
    accuracy.append(best_score)

In [None]:
#plot the best cross-validated accuracy and the selected number of features against l
#model_selecL1_multi scores with accuracy, so the reported value is an accuracy, not a precision
l_values=range(1,len(accuracy)+1)
pyplot.subplot(2,1,1)
#plot against l itself; plotting the bare list would start the x axis at 0
pyplot.plot(l_values,accuracy)
pyplot.title("accuracy")
pyplot.subplot(2,1,2)
pyplot.title("number of features")
pyplot.plot(l_values,p)
pyplot.xlabel("L")
pyplot.tight_layout()
print("best cross-validated accuracy",max(accuracy))
print("best l",l_values[int(np.argmax(accuracy))])
print(p)
print(accuracy)

choose l=1 p=29
test model on test dataset

In [None]:
#rerun the multiclass L1 selection at the chosen l=1 and keep the fitted RFECV (last return value)
model=model_selecL1_multi(1)[-1]
#number of features the selection kept
model.n_features_


In [None]:
#build test data and label
temp=sum_instancef("../data/data_testing",1)
data_test=temp.iloc[:,1:]
#a fresh encoder is fitted on the test folder paths; its codes line up with the
#training codes only if both splits contain the same activity folders -- TODO confirm
le = preprocessing.LabelEncoder()
data_label_test=le.fit_transform(temp.iloc[:,0])
#prediction on test data
y_pred_test=model.predict(data_test)
#accuracy = share of exact matches. With more than two classes the codes differ by more
#than 1, so 1-sum(abs(difference))/n is not an accuracy and can even be negative.
accuracy_test=np.mean(y_pred_test==data_label_test)
print("accuracy test",accuracy_test)
print(y_pred_test)

In [None]:
#confusion matrix of the multiclass test predictions
cm=confusion_matrix(data_label_test,y_pred_test)
#predicted probability of the class with code 1 only; kept from the binary workflow,
#no ROC curve is drawn for the multiclass problem
y_train_pred_proba=model.predict_proba(data_test)[:,1]
print("confusion mattrix\n",cm)
pyplot.figure(1)
sns.heatmap(cm,annot=True,cmap='Blues')

Part 2 Binary and Multiclass Classification
(c)ii

In [None]:
#fit Gaussian and Multinomial naive Bayes classifiers on the features selected by `model`
temp=sum_instancef("../data/AReM",1)
data_train=temp.iloc[:,1:]
le=preprocessing.LabelEncoder()
data_label=le.fit_transform(temp.iloc[:,0])
#reduce the training data to the selected features once and reuse it for both classifiers
data_train_selected=model.transform(data_train)
estimator=GaussianNB()
estimatormul=MultinomialNB()
estimator.fit(data_train_selected,data_label)
estimatormul.fit(data_train_selected,data_label)


In [None]:
#test data reduced to the features selected by `model`
data_test_selected=model.transform(data_test)

#Gaussian naive Bayes: prediction on test data
y_pred_test=estimator.predict(data_test_selected)
#accuracy = share of exact matches. With more than two classes the codes differ by more
#than 1, so 1-sum(abs(difference))/n is not an accuracy and can even be negative.
accuracy_test=np.mean(y_pred_test==data_label_test)
print("accuracy test",accuracy_test)
print(y_pred_test)
#confusion matrix
cm = confusion_matrix(data_label_test, y_pred_test)

print("confusion mattrix\n",cm)
#Multinomial naive Bayes: prediction on test data
y_pred_test=estimatormul.predict(data_test_selected)
accuracy_test=np.mean(y_pred_test==data_label_test)
print("accuracy test",accuracy_test)
print(y_pred_test)
#confusion matrix
cm1 = confusion_matrix(data_label_test, y_pred_test)

print("confusion mattrix\n",cm1)

#both confusion matrices side by side
pyplot.subplot(1,2,1)
pyplot.title("Gaussian confusion")
sns.heatmap(cm, annot=True, cmap='Blues')
pyplot.subplot(1,2,2)
pyplot.title("Multinomial confusion")
sns.heatmap(cm1, annot=True, cmap='Blues')
pyplot.tight_layout()


Part 2 Binary and Multiclass Classification
(c)iii

Report test error and confusion matrix

In [None]:
#search (l, p) for the PCA + Gaussian naive Bayes classifier and report the best pair
naive_pca_result=model_naive_pca()
best_l,best_p,best_accuracy,y_pred_best=naive_pca_result
print("best_l,best_p,best_accuracy,y_pred_best\n",*naive_pca_result)
#test error is the complement of the best test accuracy
error=1-best_accuracy
print("test error",error)
#confusion matrix of the best predictions against the test labels built in an earlier cell
cm=confusion_matrix(data_label_test,y_pred_best)
sns.heatmap(cm,annot=True,cmap='Blues')

plot the scatterplot of the classes in your training data based on the first and second principal components

In [None]:

#project the training features on the first two principal components
data_train_pca=apply_pca(data_train,2)
data_train_pca_12=data_train_pca[:,0:2]
#x, y and hue are given as vectors, so no data= argument is passed
#(NOTE(review): recent seaborn versions appear to reject a bare ndarray there -- confirm).
#A qualitative palette with legend="full" shows every class code as its own colour
#instead of a continuous colour scale.
ax=sns.scatterplot(
    x=data_train_pca_12[:,0],
    y=data_train_pca_12[:,1],
    hue=data_label,
    palette="tab10",
    legend="full",
)
ax.set_xlabel("First Principal Component")
ax.set_ylabel("Second Principal Component")
ax.set_title("Scatterplot of Classes based on PCA")
ax.legend(title="class code")
pyplot.show()

Part 2 Binary and Multiclass Classification
(c)iv

From my observation, logistic regression with the l1 penalty performs well in multiclass classification, while the naive Bayes classifier performs well in the binary problem. This is because the dataset is imbalanced in the binary problem and naive Bayes is robust to imbalanced datasets.