In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras import layers
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler



In [2]:
# Input locations, relative to the notebook's working directory.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas a backslash-separated path is only a valid path on Windows.
train_file = "data/training_data.tsv"
hypotheticals_file = "data/hypothetical_data.tsv"

# Both files are tab-separated; a literal '?' marks a missing value and is
# parsed as NaN.
train_dataset = pd.read_csv(train_file, na_values='?', sep="\t")
hypotheticals_dataset = pd.read_csv(hypotheticals_file, na_values='?', sep="\t")

# Work on copies so the frames as loaded stay available untouched.
train_df = train_dataset.copy()
hypo_df = hypotheticals_dataset.copy()

In [3]:
# Preparing the data
# Column layout used by this notebook: identifier columns first, then the
# feature columns, then the label columns at the end.
N_ID_COLUMNS = 2      # leading columns to drop (ID, name)
N_LABEL_COLUMNS = 15  # trailing columns that hold the labels

# Slice from the frames as loaded rather than from train_df / hypo_df
# themselves: re-running this cell then always gives the same result instead
# of stripping two more columns on every execution.
train_df = train_dataset.iloc[:, N_ID_COLUMNS:].copy()
hypo_df = hypotheticals_dataset.iloc[:, N_ID_COLUMNS:].copy()

# Split the data into features and labels
X = train_df.iloc[:, :-N_LABEL_COLUMNS]  # Features
y = train_df.iloc[:, -N_LABEL_COLUMNS:]  # Labels

# Apply the same feature/label split to hypo_df, which is used for predictions
X_hypo = hypo_df.iloc[:, :-N_LABEL_COLUMNS]  # Features
y_hypo = hypo_df.iloc[:, -N_LABEL_COLUMNS:]  # Labels

# Create a KFold cross-validator with 10 folds; shuffling with a fixed seed
# keeps the fold assignment reproducible between runs.
kf = KFold(n_splits=10, random_state=0, shuffle=True)


NEW

In [4]:
# Cross-validated evaluation of an MLP on the multi-label task, followed by a
# final fit on the whole training set for use on the hypothetical data.

# Initialize lists to store evaluation metrics for each fold
accuracy_scores = []
f1_scores = []

# Scaler fitted on the full training set. It feeds the final model and the
# hypothetical data only; the per-fold evaluation below fits its own scaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_hypo_scaled = scaler.transform(X_hypo)

for train_index, test_index in kf.split(X):
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    # Standardize inside the fold: the scaler is fitted on the training part
    # only, so no statistics from the validation part leak into training, and
    # the classifier really is trained on scaled features.
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X.iloc[train_index])
    X_val = fold_scaler.transform(X.iloc[test_index])

    # Example: Using MLP as the classifier
    fold_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=0)
    fold_classifier.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred = fold_classifier.predict(X_val)

    # Evaluate the model. With multi-label targets accuracy_score is the
    # subset accuracy: a sample counts only if every label is correct.
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='micro')  # Micro-F1 is suitable for multi-label tasks

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

# Calculate the average evaluation metrics over all folds
avg_accuracy = np.mean(accuracy_scores)
avg_f1 = np.mean(f1_scores)

print("Average Accuracy:", avg_accuracy)
print("Average F1-score:", avg_f1)

# Final model: refit on every training sample, using the same scaling that
# produced X_hypo_scaled, so that later predictions on the hypothetical data
# see features on the scale the model was trained on.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=0)
mlp_history = mlp_classifier.fit(X_scaled, y)  # fit() returns the estimator itself


In [None]:
# Score the hypothetical proteins with the MLP held in mlp_classifier.
# Precondition: mlp_classifier must have been trained on features standardized
# the same way as X_hypo_scaled, otherwise the probabilities are meaningless.
mlp_predictions = mlp_classifier.predict_proba(X_hypo_scaled)

# Wrap the probabilities in a DataFrame, one column per training label column
mlp_predictions_df = pd.DataFrame(data=mlp_predictions, columns=y.columns)

# Print a header line followed by the prediction table
print("MLP Predictions for New Proteins:", mlp_predictions_df, sep="\n")


In [None]:
# NOTE(review): disabled scaffold. LossHistory only initialises two empty
# lists; no visible cell ever appends to `losses` or `val_losses`, and the
# class defines no method through which a model could report its loss.
# # Define a custom callback to store training history
# class LossHistory:
#     def __init__(self, model_name):
#         self.model_name = model_name
#         self.losses = []
#         self.val_losses = []

# training_histories = []
# model_names = ['Logistic Regression', 'Random Forest', 'MLP Neural Network']

Logistic Regression

In [None]:
# NOTE(review): disabled cell. cross_val_predict fits clones of the estimator,
# so `logreg_model` itself would still be unfitted after this cell runs.
# Unlike the Random Forest and MLP cells, no LossHistory is created here and
# nothing is appended to `training_histories`.
# # Create the Logistic Regression model with One-vs-Rest strategy
# logreg_model = OneVsRestClassifier(LogisticRegression(max_iter=1000))

# # Perform cross-validation and make predictions
# y_pred_logreg = cross_val_predict(logreg_model, X_scaled, y, cv=kf)

# # Evaluate the model
# accuracy_logreg = accuracy_score(y, y_pred_logreg)
# precision_logreg = precision_score(y, y_pred_logreg, average='micro')
# recall_logreg = recall_score(y, y_pred_logreg, average='micro')
# f1_score_logreg = f1_score(y, y_pred_logreg, average='micro')

# print("Logistic Regression Results:")
# print(f"Accuracy: {accuracy_logreg}")
# print(f"Precision: {precision_logreg}")
# print(f"Recall: {recall_logreg}")
# print(f"F1-score: {f1_score_logreg}")


In [None]:
# NOTE(review): disabled cell. `logreg_loss_history` is not assigned in any
# visible cell, so enabling this as-is would raise NameError.
# # Plot the training process for the Logistic Regression model
# plt.figure(figsize=(8, 6))
# epochs = range(1, len(logreg_loss_history.losses) + 1)
# plt.plot(epochs, logreg_loss_history.losses, label='Training Loss')
# plt.plot(epochs, logreg_loss_history.val_losses, label='Validation Loss')
# plt.title('Training Process of Logistic Regression')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.show()

Random Forest

In [None]:
# NOTE(review): disabled cell. `rf_loss_history` is created and appended to
# `training_histories`, but nothing in this cell records a loss into it.
# This model is cross-validated on the unscaled `X`, whereas the Logistic
# Regression and MLP cells use `X_scaled`.
# # Create the Random Forest model
# rf_model = RandomForestClassifier(n_estimators=100, random_state=0)

# # Perform cross-validation and make predictions
# rf_loss_history = LossHistory('Random Forest')
# y_pred_rf = cross_val_predict(rf_model, X, y, cv=kf)

# # Evaluate the model
# accuracy_rf = accuracy_score(y, y_pred_rf)
# precision_rf = precision_score(y, y_pred_rf, average='micro')
# recall_rf = recall_score(y, y_pred_rf, average='micro')
# f1_score_rf = f1_score(y, y_pred_rf, average='micro')

# training_histories.append(rf_loss_history)

# print("Random Forest Results:")
# print(f"Accuracy: {accuracy_rf}")
# print(f"Precision: {precision_rf}")
# print(f"Recall: {recall_rf}")
# print(f"F1-score: {f1_score_rf}")


In [None]:
# NOTE(review): disabled cell. No visible cell populates
# `rf_loss_history.losses` / `.val_losses`, so `epochs` would be an empty
# range and both curves would have no points.
# # Plot the training process for the Random Forest model
# plt.figure(figsize=(8, 6))
# epochs = range(1, len(rf_loss_history.losses) + 1)
# plt.plot(epochs, rf_loss_history.losses, label='Training Loss')
# plt.plot(epochs, rf_loss_history.val_losses, label='Validation Loss')
# plt.title('Training Process of Random Forest')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.show()

Multi-layer Perceptron

In [None]:
# NOTE(review): disabled cell. cross_val_predict fits clones of the estimator,
# so `mlp_model` itself would still be unfitted after this cell runs.
# `mlp_loss_history` is created and appended to `training_histories`, but
# nothing in this cell records a loss into it.
# # Create the MLP model
# mlp_model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=0)

# # Perform cross-validation and make predictions
# mlp_loss_history = LossHistory('MLP Neural Network')
# y_pred_mlp = cross_val_predict(mlp_model, X_scaled, y, cv=kf)

# # Evaluate the model
# accuracy_mlp = accuracy_score(y, y_pred_mlp)
# precision_mlp = precision_score(y, y_pred_mlp, average='micro')
# recall_mlp = recall_score(y, y_pred_mlp, average='micro')
# f1_score_mlp = f1_score(y, y_pred_mlp, average='micro')

# training_histories.append(mlp_loss_history)

# print("MLP Neural Network Results:")
# print(f"Accuracy: {accuracy_mlp}")
# print(f"Precision: {precision_mlp}")
# print(f"Recall: {recall_mlp}")
# print(f"F1-score: {f1_score_mlp}")


In [None]:
# NOTE(review): disabled cell. No visible cell populates
# `mlp_loss_history.losses` / `.val_losses`, so `epochs` would be an empty
# range and both curves would have no points.
# # Plot the training process for the MLP model
# plt.figure(figsize=(8, 6))
# epochs = range(1, len(mlp_loss_history.losses) + 1)
# plt.plot(epochs, mlp_loss_history.losses, label='Training Loss')
# plt.plot(epochs, mlp_loss_history.val_losses, label='Validation Loss')
# plt.title('Training Process of MLP Neural Network')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.show()

In [None]:
# NOTE(review): disabled cell. pd.melt is called without var_name/value_name,
# so by default its output columns are named 'variable' and 'value'; the
# barplot below asks for 'Metric' and 'Value', which would not exist.
# Passing var_name='Metric', value_name='Value' to pd.melt would line them up.
# # Plot the evaluation metrics
# eval_df = pd.DataFrame({
#     'Model': model_names,
#     'Accuracy': [accuracy_logreg, accuracy_rf, accuracy_mlp],
#     'Precision': [precision_logreg, precision_rf, precision_mlp],
#     'Recall': [recall_logreg, recall_rf, recall_mlp],
#     'F1-score': [f1_score_logreg, f1_score_rf, f1_score_mlp]
# })

# plt.figure(figsize=(10, 6))
# sns.barplot(x='Model', y='Value', hue='Metric', data=pd.melt(eval_df, id_vars='Model'), palette='muted')
# plt.title('Model Evaluation Metrics')
# plt.xlabel('Model')
# plt.ylabel('Value')
# plt.legend(title='Metric', loc='upper right')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

In [None]:
# NOTE(review): disabled cell. `model_names` has three entries, but the
# visible cells append only two histories to `training_histories` (Random
# Forest and MLP; the Logistic Regression cell appends none). Indexing by the
# position in `model_names` would therefore pair names with the wrong history
# and raise IndexError on the third iteration.
# # Plot the training process for each model
# for idx, model_name in enumerate(model_names):
#     plt.figure(figsize=(8, 6))
#     epochs = range(1, len(training_histories[idx].losses) + 1)
#     plt.plot(epochs, training_histories[idx].losses, label='Training Loss')
#     plt.plot(epochs, training_histories[idx].val_losses, label='Validation Loss')
#     plt.title(f'Training Process of {model_name}')
#     plt.xlabel('Epoch')
#     plt.ylabel('Loss')
#     plt.legend()
#     plt.grid(True)
#     plt.tight_layout()
#     plt.show()

In [None]:
# NOTE(review): disabled cell. In the visible cells `logreg_model`, `rf_model`
# and `mlp_model` are only ever passed to cross_val_predict, which fits clones;
# none of them is fitted directly, so predict_proba would fail until each is
# fitted (e.g. on the full training set) first.
# NOTE(review): for multi-output targets RandomForestClassifier.predict_proba
# presumably returns a list with one array per label rather than a single 2-D
# array - confirm before building `rf_hypo_prob_df` with columns=y.columns.
# # Predict probabilities for the hypothetical set using each model
# logreg_hypo_probabilities = logreg_model.predict_proba(X_hypo_scaled)
# rf_hypo_probabilities = rf_model.predict_proba(X_hypo)
# mlp_hypo_probabilities = mlp_model.predict_proba(X_hypo_scaled)

# # Convert the probabilities into DataFrames for easier analysis
# logreg_hypo_prob_df = pd.DataFrame(logreg_hypo_probabilities, columns=y.columns)
# rf_hypo_prob_df = pd.DataFrame(rf_hypo_probabilities, columns=y.columns)
# mlp_hypo_prob_df = pd.DataFrame(mlp_hypo_probabilities, columns=y.columns)

# # Display the DataFrames
# print("Predicted Probabilities for the Hypothetical Set - Logistic Regression:")
# print(logreg_hypo_prob_df)

# print("Predicted Probabilities for the Hypothetical Set - Random Forest:")
# print(rf_hypo_prob_df)

# print("Predicted Probabilities for the Hypothetical Set - MLP Neural Network:")
# print(mlp_hypo_prob_df)