In [33]:
%pip install scikit-learn





[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score

In [35]:
csvFile = pd.read_csv('IST_MIR.csv')
X = csvFile.iloc[:, 2:14].values
y = csvFile.iloc[:, 14].values

In [36]:
pca = PCA(n_components=0.95)
pca_X = pca.fit_transform(X)

# Define the scorers for precision, recall, F1, and AUC
scoring = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': 'roc_auc'
}

# Set up 10-fold cross-validation, repeated 10 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)

In [37]:
static_learners = ["LR", "CART", "NB", "KNN", "RF"]

precision_scores_df = pd.DataFrame(columns=static_learners)
recall_scores_df = pd.DataFrame(columns=static_learners)
f1_scores_df = pd.DataFrame(columns=static_learners)
auc_scores_df = pd.DataFrame(columns=static_learners)

In [38]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=10000)

# Perform cross-validation and get scores for each metric
cv_results = cross_validate(log_reg, pca_X, y, cv=cv, scoring=scoring)

# Calculate median scores
median_precision = np.median(cv_results['test_precision'])
median_recall = np.median(cv_results['test_recall'])
median_f1 = np.median(cv_results['test_f1'])
median_auc = np.median(cv_results['test_roc_auc'])


# Output the median scores
print(f"Median Precision over 10-fold cross-validation repeated 10 times: {median_precision:.2f}")
print(f"Median Recall over 10-fold cross-validation repeated 10 times: {median_recall:.2f}")
print(f"Median F1-score over 10-fold cross-validation repeated 10 times: {median_f1:.2f}")
print(f"Median AUC over 10-fold cross-validation repeated 10 times: {median_auc:.2f}")

Median Precision over 10-fold cross-validation repeated 10 times: 0.78
Median Recall over 10-fold cross-validation repeated 10 times: 0.60
Median F1-score over 10-fold cross-validation repeated 10 times: 0.71
Median AUC over 10-fold cross-validation repeated 10 times: 0.75


In [39]:
precision_scores = []
recall_scores = []
f1_scores = []
auc_scores = []

for i in range(0, len(cv_results['test_precision']), 10):
    subset = cv_results['test_precision'][i:i+10]
    median = np.median(subset)
    precision_scores.append(median)

for i in range(0, len(cv_results['test_recall']), 10):
    subset = cv_results['test_recall'][i:i+10]
    median = np.median(subset)
    recall_scores.append(median)

for i in range(0, len(cv_results['test_f1']), 10):
    subset = cv_results['test_f1'][i:i+10]
    median = np.median(subset)
    f1_scores.append(median)

for i in range(0, len(cv_results['test_roc_auc']), 10):
    subset = cv_results['test_roc_auc'][i:i+10]
    median = np.median(subset)
    auc_scores.append(median)

precision_scores_df['LR'] = precision_scores
recall_scores_df['LR'] = recall_scores
f1_scores_df['LR'] = f1_scores
auc_scores_df['LR'] = auc_scores

In [40]:
# CART
dtclf = DecisionTreeClassifier()

# Perform cross-validation and get scores for each metric
cv_results = cross_validate(dtclf, pca_X, y, cv=cv, scoring=scoring)

# Calculate median scores
median_precision = np.median(cv_results['test_precision'])
median_recall = np.median(cv_results['test_recall'])
median_f1 = np.median(cv_results['test_f1'])
median_auc = np.median(cv_results['test_roc_auc'])

# Output the median scores
print(f"Median Precision over 10-fold cross-validation repeated 10 times: {median_precision:.2f}")
print(f"Median Recall over 10-fold cross-validation repeated 10 times: {median_recall:.2f}")
print(f"Median F1-score over 10-fold cross-validation repeated 10 times: {median_f1:.2f}")
print(f"Median AUC over 10-fold cross-validation repeated 10 times: {median_auc:.2f}")

Median Precision over 10-fold cross-validation repeated 10 times: 0.70
Median Recall over 10-fold cross-validation repeated 10 times: 0.70
Median F1-score over 10-fold cross-validation repeated 10 times: 0.71
Median AUC over 10-fold cross-validation repeated 10 times: 0.68


In [41]:
precision_scores = []
recall_scores = []
f1_scores = []
auc_scores = []

for i in range(0, len(cv_results['test_precision']), 10):
    subset = cv_results['test_precision'][i:i+10]
    median = np.median(subset)
    precision_scores.append(median)

for i in range(0, len(cv_results['test_recall']), 10):
    subset = cv_results['test_recall'][i:i+10]
    median = np.median(subset)
    recall_scores.append(median)

for i in range(0, len(cv_results['test_f1']), 10):
    subset = cv_results['test_f1'][i:i+10]
    median = np.median(subset)
    f1_scores.append(median)

for i in range(0, len(cv_results['test_roc_auc']), 10):
    subset = cv_results['test_roc_auc'][i:i+10]
    median = np.median(subset)
    auc_scores.append(median)

precision_scores_df['CART'] = precision_scores
recall_scores_df['CART'] = recall_scores
f1_scores_df['CART'] = f1_scores
auc_scores_df['CART'] = auc_scores

In [42]:
# Naive Bayes
gnb = GaussianNB()

# Perform cross-validation and get scores for each metric
cv_results = cross_validate(gnb, pca_X, y, cv=cv, scoring=scoring)

# Calculate median scores
median_precision = np.median(cv_results['test_precision'])
median_recall = np.median(cv_results['test_recall'])
median_f1 = np.median(cv_results['test_f1'])
median_auc = np.median(cv_results['test_roc_auc'])

precision_scores.append(median_precision)
recall_scores.append(median_recall)
f1_scores.append(median_f1)
auc_scores.append(median_auc)

# Output the median scores
print(f"Median Precision over 10-fold cross-validation repeated 10 times: {median_precision:.2f}")
print(f"Median Recall over 10-fold cross-validation repeated 10 times: {median_recall:.2f}")
print(f"Median F1-score over 10-fold cross-validation repeated 10 times: {median_f1:.2f}")
print(f"Median AUC over 10-fold cross-validation repeated 10 times: {median_auc:.2f}")

Median Precision over 10-fold cross-validation repeated 10 times: 0.83
Median Recall over 10-fold cross-validation repeated 10 times: 0.33
Median F1-score over 10-fold cross-validation repeated 10 times: 0.50
Median AUC over 10-fold cross-validation repeated 10 times: 0.71


In [43]:
precision_scores = []
recall_scores = []
f1_scores = []
auc_scores = []

for i in range(0, len(cv_results['test_precision']), 10):
    subset = cv_results['test_precision'][i:i+10]
    median = np.median(subset)
    precision_scores.append(median)

for i in range(0, len(cv_results['test_recall']), 10):
    subset = cv_results['test_recall'][i:i+10]
    median = np.median(subset)
    recall_scores.append(median)

for i in range(0, len(cv_results['test_f1']), 10):
    subset = cv_results['test_f1'][i:i+10]
    median = np.median(subset)
    f1_scores.append(median)

for i in range(0, len(cv_results['test_roc_auc']), 10):
    subset = cv_results['test_roc_auc'][i:i+10]
    median = np.median(subset)
    auc_scores.append(median)

precision_scores_df['NB'] = precision_scores
recall_scores_df['NB'] = recall_scores
f1_scores_df['NB'] = f1_scores
auc_scores_df['NB'] = auc_scores

In [44]:
# KNN
knnclf = KNeighborsClassifier(n_neighbors=11)

# Perform cross-validation and get scores for each metric
cv_results = cross_validate(knnclf, pca_X, y, cv=cv, scoring=scoring)

# Calculate median scores
median_precision = np.median(cv_results['test_precision'])
median_recall = np.median(cv_results['test_recall'])
median_f1 = np.median(cv_results['test_f1'])
median_auc = np.median(cv_results['test_roc_auc'])

precision_scores.append(median_precision)
recall_scores.append(median_recall)
f1_scores.append(median_f1)
auc_scores.append(median_auc)

# Output the median scores
print(f"Median Precision over 10-fold cross-validation repeated 10 times: {median_precision:.2f}")
print(f"Median Recall over 10-fold cross-validation repeated 10 times: {median_recall:.2f}")
print(f"Median F1-score over 10-fold cross-validation repeated 10 times: {median_f1:.2f}")
print(f"Median AUC over 10-fold cross-validation repeated 10 times: {median_auc:.2f}")

Median Precision over 10-fold cross-validation repeated 10 times: 0.68
Median Recall over 10-fold cross-validation repeated 10 times: 0.67
Median F1-score over 10-fold cross-validation repeated 10 times: 0.67
Median AUC over 10-fold cross-validation repeated 10 times: 0.71


In [45]:
precision_scores = []
recall_scores = []
f1_scores = []
auc_scores = []

for i in range(0, len(cv_results['test_precision']), 10):
    subset = cv_results['test_precision'][i:i+10]
    median = np.median(subset)
    precision_scores.append(median)

for i in range(0, len(cv_results['test_recall']), 10):
    subset = cv_results['test_recall'][i:i+10]
    median = np.median(subset)
    recall_scores.append(median)

for i in range(0, len(cv_results['test_f1']), 10):
    subset = cv_results['test_f1'][i:i+10]
    median = np.median(subset)
    f1_scores.append(median)

for i in range(0, len(cv_results['test_roc_auc']), 10):
    subset = cv_results['test_roc_auc'][i:i+10]
    median = np.median(subset)
    auc_scores.append(median)

precision_scores_df['KNN'] = precision_scores
recall_scores_df['KNN'] = recall_scores
f1_scores_df['KNN'] = f1_scores
auc_scores_df['KNN'] = auc_scores

In [46]:
# Random Forest
rfclf = RandomForestClassifier()

# Perform cross-validation and get scores for each metric
cv_results = cross_validate(rfclf, pca_X, y, cv=cv, scoring=scoring)

# Calculate median scores
median_precision = np.median(cv_results['test_precision'])
median_recall = np.median(cv_results['test_recall'])
median_f1 = np.median(cv_results['test_f1'])
median_auc = np.median(cv_results['test_roc_auc'])

precision_scores.append(median_precision)
recall_scores.append(median_recall)
f1_scores.append(median_f1)
auc_scores.append(median_auc)

# Output the median scores
print(f"Median Precision over 10-fold cross-validation repeated 10 times: {median_precision:.2f}")
print(f"Median Recall over 10-fold cross-validation repeated 10 times: {median_recall:.2f}")
print(f"Median F1-score over 10-fold cross-validation repeated 10 times: {median_f1:.2f}")
print(f"Median AUC over 10-fold cross-validation repeated 10 times: {median_auc:.2f}")


Median Precision over 10-fold cross-validation repeated 10 times: 0.69
Median Recall over 10-fold cross-validation repeated 10 times: 0.70
Median F1-score over 10-fold cross-validation repeated 10 times: 0.70
Median AUC over 10-fold cross-validation repeated 10 times: 0.70


In [47]:
precision_scores = []
recall_scores = []
f1_scores = []
auc_scores = []

for i in range(0, len(cv_results['test_precision']), 10):
    subset = cv_results['test_precision'][i:i+10]
    median = np.median(subset)
    precision_scores.append(median)

for i in range(0, len(cv_results['test_recall']), 10):
    subset = cv_results['test_recall'][i:i+10]
    median = np.median(subset)
    recall_scores.append(median)

for i in range(0, len(cv_results['test_f1']), 10):
    subset = cv_results['test_f1'][i:i+10]
    median = np.median(subset)
    f1_scores.append(median)

for i in range(0, len(cv_results['test_roc_auc']), 10):
    subset = cv_results['test_roc_auc'][i:i+10]
    median = np.median(subset)
    auc_scores.append(median)

precision_scores_df['RF'] = precision_scores
recall_scores_df['RF'] = recall_scores
f1_scores_df['RF'] = f1_scores
auc_scores_df['RF'] = auc_scores

In [48]:
precision_scores_df.to_csv('precision_ml.csv', index=False)
recall_scores_df.to_csv('recall_ml.csv', index=False)
f1_scores_df.to_csv('f1_ml.csv', index=False)
auc_scores_df.to_csv('auc_ml.csv', index=False)