In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from scipy import stats


from google.colab import drive

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **DATA UPLOAD**

In [None]:
df = pd.read_csv('/content/drive/MyDrive/data/EEG-data.csv')


df_3 = df[(df["y"] == 3)]
df_3 = df_3.drop(['y', 'Unnamed: 0'], axis=1)

df_4 = df[(df["y"] == 4)]
df_4 = df_4.drop(['y', 'Unnamed: 0'], axis=1)

In [None]:
# Mean and standard deviation
df_3_mean = df_3.mean()
df_4_mean = df_4.mean()

df_3_sd = df_3.std()
df_4_sd = df_4.std()

# NORMALIZATION

In [None]:
# Step 2: Normalize the selected classes using z-score manually
df_3_normalized = (df_3 - df_3.mean()) / df_3.std()
df_4_normalized = (df_4 - df_4.mean()) / df_4.std()

In [None]:
train_ratio = 0.8
train_df_3, test_df_3 = train_test_split(df_3_normalized, train_size=train_ratio, random_state=42)

# Split df_4 into training and test sets
train_df_4, test_df_4 = train_test_split(df_4_normalized, train_size=train_ratio, random_state=42)

# Combine the training sets of df_3 and df_4
train_df_combined = pd.concat([train_df_3, train_df_4], axis=1)

# Combine the test sets of df_3 and df_4
test_df_combined = pd.concat([test_df_3, test_df_4],axis=1)



In [None]:
train_df_4['y'] = 1
train_df_3['y'] = 0

# Merge the datasets
result = [train_df_3, train_df_4]
df_combined = pd.concat(result)
df_combined

test_df_4['y'] = 1
test_df_3['y'] = 0
result_test = [test_df_3, test_df_4]


# DATA SPLIT

In [None]:

df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)#shuffle

X_train = df_combined.drop('y', axis=1)
y_train = df_combined['y']


x_test = pd.concat(result_test)
x_test = x_test.sample(frac=1, random_state=42).reset_index(drop=True) #shuffle

X_test=x_test.drop('y', axis=1)
y_test= x_test['y']

print(X_test.shape)

(40, 4094)


# TRAIN AND TEST

In [None]:
from sklearn.model_selection import GridSearchCV

'''# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  # Example range for C
    'gamma': [1,0.5,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001],  # Example range for gamma
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']  # Kernel type
}

# Create a SVC model instance
svm_model = SVC(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(svm_model, param_grid, refit=True, verbose=3, cv=5)  # cv=5 for 5-fold cross-validation

# Fit the model with data
grid_search.fit(X_train, y_train)

# Review the best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)'''

'# Define the parameter grid\nparam_grid = {\n    \'C\': [0.1, 1, 10, 100, 1000],  # Example range for C\n    \'gamma\': [1,0.5,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001],  # Example range for gamma\n    \'kernel\': [\'linear\', \'poly\', \'rbf\', \'sigmoid\']  # Kernel type\n}\n\n# Create a SVC model instance\nsvm_model = SVC(random_state=42)\n\n# Set up GridSearchCV\ngrid_search = GridSearchCV(svm_model, param_grid, refit=True, verbose=3, cv=5)  # cv=5 for 5-fold cross-validation\n\n# Fit the model with data\ngrid_search.fit(X_train, y_train)\n\n# Review the best parameters and score\nprint("Best parameters found:", grid_search.best_params_)\nprint("Best cross-validation score:", grid_search.best_score_)'

In [None]:
#Best parameters found:  OrderedDict([('C', 1000000.0), ('gamma', 0.000570349311114605), ('kernel', 'rbf')])
svm_classifier = svm.SVC(kernel='rbf', gamma = 0.0005, random_state=42)
svm_classifier.fit(X_train, y_train)

#  StratifiedKFold for cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True)

cross_val_scores = cross_val_score(svm_classifier, X_train, y_train, cv=kfold, scoring='accuracy')
print("Cross-validation scores:", cross_val_scores)

# Print the scores for each fold
for i, score in enumerate(cross_val_scores, 1):
    print(f"Fold {i}: Accuracy = {score}")

# Calculate and print the mean and standard deviation of the cross-validation scores
mean_score = np.mean(cross_val_scores)
std_dev = np.std(cross_val_scores)
print(f"Mean accuracy: {mean_score}")
print(f"Standard Deviation: {std_dev}")


y_pred = svm_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)

print("Sensitivity:", recall)

# Calculating specificity from the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
TN = conf_matrix[0][0]
FP = conf_matrix[0][1]
FN = conf_matrix[1][0]
TP = conf_matrix[1][1]
specificity = TN / (TN + FP)
print(f"Specificity: {specificity}")

Cross-validation scores: [0.6875 0.625  0.5625 0.75   0.625 ]
Fold 1: Accuracy = 0.6875
Fold 2: Accuracy = 0.625
Fold 3: Accuracy = 0.5625
Fold 4: Accuracy = 0.75
Fold 5: Accuracy = 0.625
Mean accuracy: 0.65
Standard Deviation: 0.0637377439199098
Accuracy: 0.875
Sensitivity: 0.875
[[16  4]
 [ 1 19]]
Specificity: 0.8


# OUTLIER DETECTION

#### remove Df_3_outlier

In [None]:
z_score_threshold = 2.5

df_3_normalized = train_df_3.drop(['y'], axis=1)
exceeds_threshold = np.abs(df_3_normalized) > z_score_threshold

# Calculate the number of features exceeding the threshold in each row
num_exceeding_per_row = np.sum(exceeds_threshold, axis=1)

# Total number of features
total_features = df_3_normalized.shape[1]
print(total_features)

# Calculate the 70% threshold of total features for comparison
features_threshold = 250

# Identify rows where the number of features exceeding the threshold is less than 80% of total features
rows_to_keep = num_exceeding_per_row < features_threshold

# Keep these rows, effectively removing the ones where 70% or more of the values exceed the threshold
filtered_data = df_3_normalized[rows_to_keep]

#filtered_data = pd.DataFtame(filtered_data) #convert pandas dataframe

# Output the dimensions of the original and filtered datasets for comparison
filtered_row_count = filtered_data.shape[0]

filtered_row_count

4094


76

In [None]:
z_score_threshold = 2.5

df_4_normalized = train_df_4.drop(['y'], axis=1)
exceeds_threshold = np.abs(df_4_normalized) > z_score_threshold

# Calculate the number of features exceeding the threshold in each row
num_exceeding_per_row = np.sum(exceeds_threshold, axis=1)

# Total number of features
total_features = df_4_normalized.shape[1]
print(total_features)

# Calculate the 70% threshold of total features for comparison
features_70_percent_threshold = 250

# Identify rows where the number of features exceeding the threshold is less than 80% of total features
rows_to_keep_2 = num_exceeding_per_row < features_70_percent_threshold

# Keep these rows, effectively removing the ones where 70% or more of the values exceed the threshold
filtered_data_2 = df_4_normalized[rows_to_keep_2]

#filtered_data = pd.DataFtame(filtered_data) #convert pandas dataframe

# Output the dimensions of the original and filtered datasets for comparison
filtered_row_count_2 = filtered_data_2.shape[0]

filtered_row_count_2

4094


75

In [None]:
filtered_data['y'] = 0 #DF 3
filtered_data_2['y'] = 1 #df 4


#merge the datasets
result = [filtered_data, filtered_data_2]
df_combined = pd.concat(result)


df_cleaned = df_combined.sample(frac=1, random_state=42).reset_index(drop=True) #shuffle

y_train_cleaned= df_cleaned['y']
X_train_cleaned=df_cleaned.drop('y', axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['y'] = 0 #DF 3
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['y'] = 1 #df 4


# CLEANED DATA MODEL EVALUATION

In [None]:
'''from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  # Example range for C
    'gamma': [1,0.5,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001],  # Example range for gamma
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
}

# Create a SVC model instance
svm_model = SVC(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(svm_model, param_grid, refit=True, verbose=3, cv=5)  # cv=5 for 5-fold cross-validation

# Fit the model with data
grid_search.fit(X_train_cleaned, y_train_cleaned)

# Review the best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)'''

'from sklearn.model_selection import GridSearchCV\n\n# Define the parameter grid\nparam_grid = {\n    \'C\': [0.1, 1, 10, 100, 1000],  # Example range for C\n    \'gamma\': [1,0.5,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001],  # Example range for gamma\n    \'kernel\': [\'linear\', \'poly\', \'rbf\', \'sigmoid\'],  # Kernel type\n}\n\n# Create a SVC model instance\nsvm_model = SVC(random_state=42)\n\n# Set up GridSearchCV\ngrid_search = GridSearchCV(svm_model, param_grid, refit=True, verbose=3, cv=5)  # cv=5 for 5-fold cross-validation\n\n# Fit the model with data\ngrid_search.fit(X_train_cleaned, y_train_cleaned)\n\n# Review the best parameters and score\nprint("Best parameters found:", grid_search.best_params_)\nprint("Best cross-validation score:", grid_search.best_score_)'

In [None]:
#Best parameters found:  OrderedDict([('C', 1000000.0), ('gamma', 0.000570349311114605), ('kernel', 'rbf')])
svm_classifier = svm.SVC(kernel='rbf', C= 10, gamma=0.0005, random_state=42)
svm_classifier.fit(X_train, y_train)



#  StratifiedKFold for cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True)

cross_val_scores = cross_val_score(svm_classifier, X_train, y_train, cv=kfold, scoring='accuracy')
print("Cross-validation scores:", cross_val_scores)

# Print the scores for each fold
for i, score in enumerate(cross_val_scores, 1):
    print(f"Fold {i}: Accuracy = {score}")

# Calculate and print the mean and standard deviation of the cross-validation scores
mean_score = np.mean(cross_val_scores)
std_dev = np.std(cross_val_scores)
print(f"Mean accuracy: {mean_score}")
print(f"Standard Deviation: {std_dev}")


y_pred = svm_classifier.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Sensitivity:", recall)

# Calculating specificity from the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
TN = conf_matrix[0][0]
FP = conf_matrix[0][1]
FN = conf_matrix[1][0]
TP = conf_matrix[1][1]
specificity = TN / (TN + FP)
print(f"Specificity: {specificity}")

Cross-validation scores: [0.84375 0.59375 0.625   0.875   0.4375 ]
Fold 1: Accuracy = 0.84375
Fold 2: Accuracy = 0.59375
Fold 3: Accuracy = 0.625
Fold 4: Accuracy = 0.875
Fold 5: Accuracy = 0.4375
Mean accuracy: 0.675
Standard Deviation: 0.16369751067135993
Accuracy: 0.9
Precision: 0.9040404040404042
Sensitivity: 0.9
[[17  3]
 [ 1 19]]
Specificity: 0.85


# FEATURE SELECTION

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Apply PCA
pca = PCA(n_components=0.95)  # Choose the number of components that explain at least 95% of the variance
X_pca = pca.fit_transform(X_train_cleaned)


# Transform both training and test sets using the PCA object fitted on the training set
X_train_pca = pca.transform(X_train_cleaned)
X_test_pca = pca.transform(X_test)

print(X_test.shape)

# Define the parameter grid for GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100, 1000],  # Example range for C
              'gamma': [1,0.5,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001],  # Example range for gamma
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
              }  # Example range for gamma}

# Perform grid search with cross-validation
svm_classifier = SVC()
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5)
grid_search.fit(X_train_pca, y_train_cleaned)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Best SVM classifier
best_svm = grid_search.best_estimator_

# Evaluate the model
accuracy = best_svm.score(X_test_pca, y_test)
print("Accuracy:", accuracy)

# Get the prediction results
y_pred = best_svm.predict(X_test_pca)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')


print("Accuracy:", accuracy)
print("Sensitivity:", recall)

# Calculating specificity from the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
TP = conf_matrix[0][0]
FP = conf_matrix[0][1]
FN = conf_matrix[1][0]
TP = conf_matrix[1][1]
specificity = TN / (TN + FP)
print(f"Specificity: {specificity}")
print(conf_matrix)

(40, 4094)
Best Hyperparameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy: 0.825
Accuracy: 0.825
Sensitivity: 0.825
Specificity: 0.85
[[17  3]
 [ 4 16]]


## K best


In [None]:
# apply SelectKBest
# Split the data into training and test sets
#X_train, X_test, y_train, y_test = train_test_split(normalized_data, normalized_labels, test_size=0.2, random_state=42)

# Perform feature selection
k = 50  # Choose the number of top features you want to select
selector = SelectKBest(score_func=mutual_info_classif, k=k) # try different score_func
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

#Define an SVM classifier
svm_classifier = SVC()

# Define hyperparameters to tune
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],  # Example range for gamma
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
}

# Perform grid search with cross-validation
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=folds)
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the best model
best_svm_classifier = grid_search.best_estimator_
accuracy = best_svm_classifier.score(X_test_selected, y_test)
print("Accuracy:", accuracy)

# Get the prediction results
y_pred = best_svm_classifier.predict(X_test_selected)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

# Calculating specificity from the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
TN = conf_matrix[0][0]
FP = conf_matrix[0][1]
FN = conf_matrix[1][0]
TP = conf_matrix[1][1]
specificity = TN / (TN + FP)
print(f"Specificity: {specificity}")
print(conf_matrix)

Best Hyperparameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Accuracy: 0.9
Accuracy: 0.9
Precision: 0.9
Recall: 0.9
Specificity: 0.9
[[18  2]
 [ 2 18]]
