<a href="https://colab.research.google.com/github/heroza/Keel/blob/main/pima%203x5cv%20sia%20smote.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import train_test_split

# load breast cancer dataset
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# define SVM classifier and parameter grid
svc = svm.SVC(kernel='rbf', probability=True, class_weight='balanced')
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.01, 0.1, 1, 10, 100]}

# define scoring metrics
scorers = {
    'f1_score': make_scorer(f1_score, average='macro'),
    'roc_auc_score': make_scorer(roc_auc_score),
    'g_mean': make_scorer(balanced_accuracy_score)
}

# use GridSearchCV to search for best parameter values
grid_search = GridSearchCV(svc, param_grid=param_grid, scoring=scorers, refit='f1_score', cv=5)
grid_search.fit(X_train, y_train)

# print best parameter values and corresponding scores for each metric
print("Best parameters found:", grid_search.best_params_)
print("Best F1 score found:", grid_search.best_score_)
print("Best ROC AUC score found:", grid_search.cv_results_['mean_test_roc_auc_score'][grid_search.best_index_])
print("Best G-Mean score found:", grid_search.cv_results_['mean_test_g_mean'][grid_search.best_index_])

Best parameters found: {'C': 1, 'gamma': 0.01}
Best F1 score found: 0.39917131326100763
Best ROC AUC score found: 0.5067816091954023
Best G-Mean score found: 0.5067816091954023


In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X = iris.data
y = iris.target

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# define SVM classifier and parameter grid
svc = svm.SVC(kernel='rbf', probability=True, class_weight='balanced')
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.01, 0.1, 1, 10, 100]}

# define scoring metrics
scorers = {
    'f1_score': make_scorer(f1_score, average='macro'),
    'roc_auc_score': make_scorer(roc_auc_score),
    'g_mean': make_scorer(balanced_accuracy_score)
}

# use GridSearchCV to search for best parameter values
grid_search = GridSearchCV(svc, param_grid=param_grid, scoring=scorers, refit='f1_score', cv=5)
grid_search.fit(X_train, y_train)

# print best parameter values and corresponding scores for each metric
print("Best parameters found:", grid_search.best_params_)
print("Best F1 score found:", grid_search.best_score_)
print("Best ROC AUC score found:", grid_search.cv_results_['mean_test_roc_auc_score'][grid_search.best_index_])
print("Best G-Mean score found:", grid_search.cv_results_['mean_test_g_mean'][grid_search.best_index_])


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
%load_ext autoreload
%autoreload 2

In [9]:
import myfunction as fu
import numpy as np
import pandas as pd
from typing_extensions import Counter
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [40]:
df = pd.read_csv('pima.dat', header=None)


In [41]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,positive
1,1,85,66,29,0,26.6,0.351,31,negative
2,8,183,64,0,0,23.3,0.672,32,positive
3,1,89,66,23,94,28.1,0.167,21,negative
4,0,137,40,35,168,43.1,2.288,33,positive


In [42]:
num_features=8

le = LabelEncoder()
X = df.loc[:, df.columns[0:num_features]].to_numpy()

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

y = df.loc[:, df.columns[-1]].to_numpy()
y = le.fit_transform(y)
# y_train = y_train.reshape(-1)
print(X.shape)
print(Counter(y))

(768, 8)
Counter({0: 500, 1: 268})


In [None]:
thresholds = [0.02, 0.05, 0.1, 0.3, 0.5]
ks = [3,5,7,9]
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
num_iteration=3
for k in ks:
  for threshold in thresholds:
    scores_cv_knn = []
    scores_cv_svc = []
    scores_cv_rf = []
    
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
      # print(f"Fold {fold+1}")
      X_train, X_test = X[train_index], X[test_index]
      y_train, y_test = y[train_index], y[test_index]
      # scores_all_iter_knn = []
      # scores_all_iter_svc = []
      # scores_all_iter_rf = []
      scores_all_iter = []
      for i in range(num_iteration):
        # random oversampling
        # oversample = RandomOverSampler(sampling_strategy='minority')
        # X_train_resampled, y_train_resampled = oversample.fit_resample(X, y)

        # sia smote
        sia_model = fu.train_siamese_dense(X_train, y_train, X_test, y_test, num_classes=2, num_features=num_features, epochs_siamese=100, batch_size=16)
        X_train_resampled, y_train_resampled = fu.sia_smote_balance_dense(X_train, y_train, sia_model, threshold, maxdist_from_base=0.5, k = k)

        score_clf = fu.train_and_get_score(X_train_resampled, y_train_resampled, X_test, y_test)
        # scores_all_iter_knn.append(score_knn)
        # scores_all_iter_svc.append(score_svc)
        # scores_all_iter_rf.append(score_rf)
        scores_all_iter.append(score_clf)
      # finish all iter, count mean
      mean_score = fu.mean_score(scores_all_iter)
      # print(f"Score_clf in fold:{fold}:", mean_score)
      scores_cv_knn.append(mean_score[0])
      scores_cv_svc.append(mean_score[1])
      scores_cv_rf.append(mean_score[2])

    print(f'k={k} and threshold={threshold}')
    print('printing score knn')
    fu.print_score(scores_cv_knn)
    print('printing score svc')
    fu.print_score(scores_cv_svc)
    print('printing score rf')
    fu.print_score(scores_cv_rf)

creating pairs
Counter({1.0: 426, 0.0: 426})
Counter({1.0: 106, 0.0: 106})
init base network
training siamese network...
evaluate
predict train
compute train acc
predict val
compute val acc
Loss = 0.25628432631492615, Train Accuracy = 0.8051643192488263 Test Accuracy = 0.5849056603773585
creating pairs
Counter({1.0: 426, 0.0: 426})
Counter({1.0: 106, 0.0: 106})
init base network
training siamese network...
evaluate
predict train
compute train acc
predict val
compute val acc
Loss = 0.24819093942642212, Train Accuracy = 0.82981220657277 Test Accuracy = 0.6132075471698113
creating pairs
Counter({1.0: 426, 0.0: 426})
Counter({1.0: 106, 0.0: 106})
init base network
training siamese network...
evaluate
predict train
compute train acc
predict val
compute val acc
Loss = 0.2903984487056732, Train Accuracy = 0.818075117370892 Test Accuracy = 0.6132075471698113
creating pairs
Counter({1.0: 426, 0.0: 426})
Counter({1.0: 106, 0.0: 106})
init base network
training siamese network...
evaluate
predict

In [29]:
score_df2 = pd.DataFrame(scores_cv)


In [39]:
len(score_df2[3])

KeyError: ignored

In [27]:
for score_df in score_df2:
  print("next!")
  Acc_arr = score_df[0].tolist()
  bal_acc_arr = score_df[1].tolist()
  G_Mean_arr = score_df[2].tolist()
  F_Measure_arr = score_df[3].tolist()
  Precision_arr = score_df[4].tolist()
  Sensitivity_arr = score_df[5].tolist()
  Specificity_arr = score_df[6].tolist()
  Auc_arr = score_df[7].tolist()

  print('bal_acc_arr: mean=%.3f std=%.3f' % (np.mean(bal_acc_arr)*100, np.std(bal_acc_arr)*100))
  print('G_Mean_arr: mean=%.3f std=%.3f' % (np.mean(G_Mean_arr)*100, np.std(G_Mean_arr)*100))
  print('F_Measure_arr: mean=%.3f std=%.3f' % (np.mean(F_Measure_arr)*100, np.std(F_Measure_arr)*100))
  print('Auc_arr: mean=%.3f std=%.3f' % (np.mean(Auc_arr)*100, np.std(Auc_arr)*100))
  print('Precision_arr: mean=%.3f std=%.3f' % (np.mean(Precision_arr)*100, np.std(Precision_arr)*100))
  print('Sensitivity_arr: mean=%.3f std=%.3f' % (np.mean(Sensitivity_arr)*100, np.std(Sensitivity_arr)*100))
  print('Specificity_arr: mean=%.3f std=%.3f' % (np.mean(Specificity_arr)*100, np.std(Specificity_arr)*100))
  print('Acc_arr: mean=%.3f std=%.3f' % (np.mean(Acc_arr)*100, np.std(Acc_arr)*100))

next!


TypeError: ignored

In [None]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
thresholds = [0.02, 0.05, 0.1, 0.3, 0.5]
ks = [3,5,7,9]
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for k in ks:
  for threshold in thresholds:
    scores = []
    for i in range(3):
      for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        # print(f"Fold {fold+1}")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        sia_model = fu.train_siamese_dense(X_train, y_train, X_test, y_test, num_classes=2, num_features=num_features, epochs_siamese=100, batch_size=16)

        # Initialize the SVM classifier
        clf = svm.SVC()

        X_train_resampled, y_train_resampled = fu.sia_smote_then_smote_dense(X_train, y_train, sia_model, threshold, maxdist_from_base=0.5, k = k)

        # Train the classifier on the training set
        clf.fit(X_train_resampled, y_train_resampled)

        # Predict the labels of the test set
        y_pred = clf.predict(X_test)

        # Calculate the F1 score
        score = fu.calculate_score(y_test, y_pred)
        scores.append(score)

        # Print the F1 score
        print(f"Score in i:{i}, fold:{fold}:", score)

    print(f'k={k} and threshold={threshold}')
    print('printing score')
    fu.print_score(scores)

creating pairs
Counter({1.0: 126, 0.0: 126})
Counter({1.0: 32, 0.0: 32})
init base network
training siamese network...
evaluate
predict train
compute train acc
predict val
compute val acc
Loss = 0.24843288958072662, Train Accuracy = 0.7023809523809523 Test Accuracy = 0.5
Score in i:0, fold:0: [0.5483870967741935, 0.5973856209150327, 0.587450456818008, 0.46153846153846156, 0.34285714285714286, 0.7058823529411765, 0.4888888888888889, 0.5973856209150328]
creating pairs
Counter({1.0: 128, 0.0: 128})
Counter({1.0: 30, 0.0: 30})
init base network
training siamese network...
evaluate
predict train
compute train acc
predict val
compute val acc
Loss = 0.2795407176017761, Train Accuracy = 0.65234375 Test Accuracy = 0.6
Score in i:0, fold:1: [0.6721311475409836, 0.6368055555555556, 0.6324555320336759, 0.47368421052631576, 0.4090909090909091, 0.5625, 0.7111111111111111, 0.6368055555555556]
creating pairs
Counter({1.0: 128, 0.0: 128})
Counter({1.0: 30, 0.0: 30})
init base network
training siamese n