<a href="https://colab.research.google.com/github/kdemertzis/DUTh/blob/main/Tegos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Load your binary dataset from a CSV file (replace 'your_dataset.csv' with your file)
df = pd.read_csv('tegos_p2.csv')

# Assuming the target column is named 'target', and features are in other columns
X = df.drop('CVAc2', axis=1)
y = df['CVAc2']

# List of classifiers to compare
classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Support Vector Machine', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB())
]

results = []

# Perform 10-fold cross-validation for each classifier
for clf_name, clf in classifiers:
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    y_pred = cross_val_predict(clf, X, y, cv=kf)

    # Calculate metrics
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    results.append({
        'Classifier': clf_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'True Positive (TP)': tp,
        'True Negative (TN)': tn,
        'False Positive (FP)': fp,
        'False Negative (FN)': fn
    })

# Create a DataFrame to store the results
results_df = pd.DataFrame(results)

# Save the results to a CSV file
results_df.to_csv('classifier_comparison_results.csv', index=False)

# Print the results to the console
print(results_df)


               Classifier  Accuracy  Precision    Recall  F1-Score  \
0           Decision Tree  0.760430   0.307087  0.136602  0.189091   
1           Random Forest  0.769024   0.324645  0.119965  0.175192   
2  Support Vector Machine  0.793912   0.000000  0.000000  0.000000   
3     Logistic Regression  0.795524   0.000000  0.000000  0.000000   
4             Naive Bayes  0.758460   0.346667  0.204904  0.257567   

   True Positive (TP)  True Negative (TN)  False Positive (FP)  \
0                 156                4091                  352   
1                 137                4158                  285   
2                   0                4434                    9   
3                   0                4443                    0   
4                 234                4002                  441   

   False Negative (FN)  
0                  986  
1                 1005  
2                 1142  
3                 1142  
4                  908  


  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Load your binary dataset from a CSV file (replace 'your_dataset.csv' with your file)
df = pd.read_csv('tegos_p2.csv')

# Assuming the target column is named 'target', and features are in other columns
X = df.drop('CVAc2', axis=1)
y = df['CVAc2']

# List of classifiers to compare
classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Support Vector Machine', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB())
]

# Create a new DataFrame to store the results
results_df = pd.DataFrame()

# Perform 10-fold cross-validation for each classifier
for clf_name, clf in classifiers:
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    y_pred = cross_val_predict(clf, X, y, cv=kf)

    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    # Add a new row for each instance with classifier results
    instance_results = pd.DataFrame({
        'Classifier': clf_name,
        'Predicted': y_pred,
        'True': y,
        'True Positive (TP)': np.where((y == 1) & (y_pred == 1), 'TP', ''),
        'True Negative (TN)': np.where((y == 0) & (y_pred == 0), 'TN', ''),
        'False Positive (FP)': np.where((y == 0) & (y_pred == 1), 'FP', ''),
        'False Negative (FN)': np.where((y == 1) & (y_pred == 0), 'FN', '')
    })

    results_df = pd.concat([results_df, instance_results], ignore_index=True)

# Save the results to a CSV file
results_df.to_csv('classifier_comparison_results_with_instances.csv', index=False)

# Print the results to the console
print(results_df)


          Classifier  Predicted  True True Positive (TP) True Negative (TN)  \
0      Decision Tree          0     0                                    TN   
1      Decision Tree          0     0                                    TN   
2      Decision Tree          0     0                                    TN   
3      Decision Tree          0     1                                         
4      Decision Tree          0     1                                         
...              ...        ...   ...                ...                ...   
27920    Naive Bayes          0     0                                    TN   
27921    Naive Bayes          0     0                                    TN   
27922    Naive Bayes          0     0                                    TN   
27923    Naive Bayes          0     0                                    TN   
27924    Naive Bayes          0     0                                    TN   

      False Positive (FP) False Negative (FN)  
0  

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Load your binary dataset from a CSV file (replace 'your_dataset.csv' with your file)
df = pd.read_csv('tegos_p2.csv')

# Assuming the target column is named 'target', and features are in other columns
X = df.drop('CVAc2', axis=1)
y = df['CVAc2']

# List of classifiers to compare
classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Support Vector Machine', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB())
]

# Create a new DataFrame to store the results
results_df = pd.DataFrame()

# Perform 10-fold cross-validation for each classifier
for clf_name, clf in classifiers:
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    y_pred = cross_val_predict(clf, X, y, cv=kf)

    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    # Create a DataFrame with instance-level results for this classifier
    instance_results = pd.DataFrame({
        'Classifier': clf_name,
        'Predicted': y_pred,
        'True': y,
        'Result': np.where((y == 1) & (y_pred == 1), 'TP',
                    np.where((y == 0) & (y_pred == 0), 'TN',
                    np.where((y == 0) & (y_pred == 1), 'FP', 'FN')))
    })

    results_df = pd.concat([results_df, instance_results], ignore_index=True)

# Save the results to a CSV file
results_df.to_csv('classifier_comparison_results_with_dataset_instances.csv', index=False)

# Print the results to the console
print(results_df)


          Classifier  Predicted  True Result
0      Decision Tree          0     0     TN
1      Decision Tree          0     0     TN
2      Decision Tree          0     0     TN
3      Decision Tree          0     1     FN
4      Decision Tree          0     1     FN
...              ...        ...   ...    ...
27920    Naive Bayes          0     0     TN
27921    Naive Bayes          0     0     TN
27922    Naive Bayes          0     0     TN
27923    Naive Bayes          0     0     TN
27924    Naive Bayes          0     0     TN

[27925 rows x 4 columns]


In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Create a DataFrame with the dataset
data = {
    'LICA1': [0, 30],
    'RICA1': [0, 0],
    'LIMT1': [0.5, 0.9],
    'RIMT1': [0.6, 0.8],
    'CVAc2': [0, 0],
    'target': [0, 0]  # The target values will be overwritten later
}
df = pd.DataFrame(data)

# Define the target values (0 for the first row and 1 for the second row)
df['target'] = [0, 1]

# List of classifiers to compare
classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Support Vector Machine', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB())
]

# Create a new DataFrame to store the results
results_df = pd.DataFrame()

# Perform classification for each classifier
for clf_name, clf in classifiers:
    # Fit the classifier
    clf.fit(df.drop('target', axis=1), df['target'])

    # Make predictions
    y_pred = clf.predict(df.drop('target', axis=1))

    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(df['target'], y_pred).ravel()

    # Create a DataFrame with instance-level results for this classifier
    instance_results = pd.DataFrame({
        'Classifier': [clf_name] * len(df),
        'LICA1': df['LICA1'],
        'RICA1': df['RICA1'],
        'LIMT1': df['LIMT1'],
        'RIMT1': df['RIMT1'],
        'CVAc2': df['CVAc2'],
        'Predicted': np.where((df['target'] == 1) & (y_pred == 1), 'TP',
                    np.where((df['target'] == 0) & (y_pred == 0), 'TN',
                    np.where((df['target'] == 0) & (y_pred == 1), 'FP', 'FN')))
    })

    results_df = pd.concat([results_df, instance_results], ignore_index=True)

# Save the results to a CSV file
results_df.to_csv('classifier_comparison_results_with_dataset_instances.csv', index=False)

# Print the results to the console
print(results_df)


               Classifier  LICA1  RICA1  LIMT1  RIMT1  CVAc2 Predicted
0           Decision Tree      0      0    0.5    0.6      0        TN
1           Decision Tree     30      0    0.9    0.8      0        TP
2           Random Forest      0      0    0.5    0.6      0        TN
3           Random Forest     30      0    0.9    0.8      0        TP
4  Support Vector Machine      0      0    0.5    0.6      0        TN
5  Support Vector Machine     30      0    0.9    0.8      0        TP
6     Logistic Regression      0      0    0.5    0.6      0        TN
7     Logistic Regression     30      0    0.9    0.8      0        TP
8             Naive Bayes      0      0    0.5    0.6      0        TN
9             Naive Bayes     30      0    0.9    0.8      0        TP


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Load your binary dataset from a CSV file (replace 'your_dataset.csv' with your file)
df = pd.read_csv('tegos_p2.csv')

# List of classifiers to compare
classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Support Vector Machine', SVC()),
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB())
]

# Create a new DataFrame to store the results
results_df = pd.DataFrame()

# Perform classification for each classifier
for clf_name, clf in classifiers:
    # Fit the classifier
    clf.fit(df.drop('CVAc2', axis=1), df['CVAc2'])

    # Make predictions
    y_pred = clf.predict(df.drop('CVAc2', axis=1))

    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(df['CVAc2'], y_pred).ravel()

    # Create a DataFrame with instance-level results for this classifier
    instance_results = pd.DataFrame({
        'Classifier': [clf_name] * len(df),
        'LICA1': df['LICA1'],
        'RICA1': df['RICA1'],
        'LIMT1': df['LIMT1'],
        'RIMT1': df['RIMT1'],
        'CVAc2': df['CVAc2'],
        'Predicted': np.where((df['CVAc2'] == 1) & (y_pred == 1), 'TP',
                    np.where((df['CVAc2'] == 0) & (y_pred == 0), 'TN',
                    np.where((df['CVAc2'] == 0) & (y_pred == 1), 'FP', 'FN')))
    })

    results_df = pd.concat([results_df, instance_results], ignore_index=True)

# Save the results to a CSV file
results_df.to_csv('classifier_w.csv', index=False)

# Print the results to the console
print(results_df)


          Classifier      LICA1      RICA1  LIMT1  RIMT1  CVAc2 Predicted
0      Decision Tree   0.000000   0.000000    0.5    0.6      0        TN
1      Decision Tree  30.000000   0.000000    0.9    0.8      0        TN
2      Decision Tree  20.000000   0.000000    1.1    1.3      0        TN
3      Decision Tree   0.000000   0.000000    0.6    0.6      1        FN
4      Decision Tree   0.000000   0.000000    0.8    0.7      1        FN
...              ...        ...        ...    ...    ...    ...       ...
27920    Naive Bayes   0.000000   0.000000    0.6    0.7      0        TN
27921    Naive Bayes   0.000000   0.000000    0.8    0.7      0        TN
27922    Naive Bayes  14.335554  14.304752    0.5    0.8      0        TN
27923    Naive Bayes   0.000000   0.000000    0.8    0.9      0        TN
27924    Naive Bayes   0.000000   0.000000    0.6    0.5      0        TN

[27925 rows x 7 columns]


In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Load your binary dataset from a CSV file (replace 'your_dataset.csv' with your file)
df = pd.read_csv('tegos_p2.csv')

# List of classifiers to compare
classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Support Vector Machine', SVC(probability=True)),  # Note: Enable probability for AUC
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Naive Bayes', GaussianNB()),
]

# Create a list to store classifier results as dictionaries
results_list = []

# Perform classification for each classifier
for clf_name, clf in classifiers:
    # Fit the classifier
    clf.fit(df.drop('CVAc2', axis=1), df['CVAc2'])

    # Make predictions
    y_pred = clf.predict(df.drop('CVAc2', axis=1))
    y_prob = clf.predict_proba(df.drop('CVAc2', axis=1))[:, 1]  # Probability of class 1 for AUC

    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(df['CVAc2'], y_pred).ravel()

    # Calculate classification metrics
    accuracy = accuracy_score(df['CVAc2'], y_pred)
    precision = precision_score(df['CVAc2'], y_pred)
    recall = recall_score(df['CVAc2'], y_pred)
    f_score = f1_score(df['CVAc2'], y_pred)
    auc = roc_auc_score(df['CVAc2'], y_prob)

    # Create a dictionary with classifier results
    classifier_results = {
        'Classifier': clf_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f_score,
        'AUC': auc,
        'True Positive (TP)': tp,
        'True Negative (TN)': tn,
        'False Positive (FP)': fp,
        'False Negative (FN)': fn,
    }

    results_list.append(classifier_results)

    # Print the confusion matrix and metrics for the current classifier
    print(f'Classifier: {clf_name}')
    print('Confusion Matrix:')
    print(confusion_matrix(df['CVAc2'], y_pred))
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f_score}')
    print(f'AUC: {auc}')
    print('\n')

# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(results_list)

# Save the results to a CSV file
results_df.to_csv('classifier_comparison_results_with_metrics.csv', index=False)

# Print the results DataFrame to the console
print(results_df)


Classifier: Decision Tree
Confusion Matrix:
[[4420   23]
 [ 728  414]]
Accuracy: 0.8655326768128917
Precision: 0.9473684210526315
Recall: 0.36252189141856395
F1-Score: 0.5243825205826473
AUC: 0.8540595746156906


Classifier: Random Forest
Confusion Matrix:
[[4389   54]
 [ 697  445]]
Accuracy: 0.8655326768128917
Precision: 0.8917835671342685
Recall: 0.38966725043782835
F1-Score: 0.5423522242535039
AUC: 0.837141839048654


Classifier: Support Vector Machine
Confusion Matrix:
[[4443    0]
 [1142    0]]
Accuracy: 0.7955237242614145
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
AUC: 0.5379551769386346


Classifier: Logistic Regression
Confusion Matrix:
[[4443    0]
 [1142    0]]
Accuracy: 0.7955237242614145
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
AUC: 0.5597673666008003


Classifier: Naive Bayes
Confusion Matrix:
[[4009  434]
 [ 907  235]]
Accuracy: 0.7598925693822739
Precision: 0.351270553064275
Recall: 0.20577933450087565
F1-Score: 0.25952512424075097
AUC: 0.5824938420223


               Cla

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
