In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler




In [None]:
# Input files, relative to the notebook's working directory. Forward slashes
# are used because they are accepted on Windows, macOS and Linux alike, whereas
# a backslash is only treated as a path separator on Windows.
train_file = "data/training_data.tsv"
hypotheticals_file = "data/hypothetical_data.tsv"

# Both files are tab-separated; a literal '?' is parsed as a missing value (NaN).
train_dataset = pd.read_csv(train_file, na_values='?', sep="\t")
hypotheticals_dataset = pd.read_csv(hypotheticals_file, na_values='?', sep="\t")

# Column layout applied to both frames: leading identifier columns, then the
# features, then the trailing label columns.
# NOTE(review): assumes the hypothetical file has exactly the same column
# layout as the training file -- confirm against the data.
N_ID_COLUMNS = 2       # ID, name
N_LABEL_COLUMNS = 15   # trailing label columns

# Preparing the data: drop the identifier columns. Copies are taken so the raw
# train_dataset / hypotheticals_dataset frames are never modified.
train_df = train_dataset.iloc[:, N_ID_COLUMNS:].copy()
hypo_df = hypotheticals_dataset.iloc[:, N_ID_COLUMNS:].copy()

# Split the training data into features and labels
X = train_df.iloc[:, :-N_LABEL_COLUMNS]  # Features
y = train_df.iloc[:, -N_LABEL_COLUMNS:]  # Labels

# Same split for the hypothetical proteins; y_hypo is only used downstream for
# its column names when labelling the prediction tables.
X_hypo = hypo_df.iloc[:, :-N_LABEL_COLUMNS]  # Features
y_hypo = hypo_df.iloc[:, -N_LABEL_COLUMNS:]  # Labels


Random Forest

In [None]:
# Random forest with a fixed seed so repeated runs give the same forest.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)

# 10-fold cross-validation using the estimator's default score; the per-fold
# scores are averaged into a single figure.
rf_cv_scores = cross_val_score(rf_model, X, y, cv=10)
rf_accuracy = rf_cv_scores.mean()

print(rf_accuracy)

In [None]:
# Train on the complete training set, then predict the hypothetical proteins
# (fit() returns the fitted estimator, so the two calls can be chained).
rf_predictions = rf_model.fit(X, y).predict(X_hypo)

# Wrap the raw prediction array in a DataFrame, one column per label
rf_predictions_df = pd.DataFrame(rf_predictions, columns=y_hypo.columns)

# Show the predicted labels for the hypothetical proteins
print("RF Predictions for Hypothetical Proteins:")
print(rf_predictions_df)

# Persist the predictions as CSV (without the row index)
rf_file = "rf_predictions.csv"
rf_predictions_df.to_csv(rf_file, index=False)


Histogram Gradient Boosting Classifier

In [None]:
# The histogram gradient boosting classifier is wrapped in
# MultiOutputClassifier so that it can be trained against all label columns.
gb_base = HistGradientBoostingClassifier(random_state=0)
gb_model = MultiOutputClassifier(gb_base)

# 10-fold cross-validation using the estimator's default score; the per-fold
# scores are averaged into a single figure.
gb_cv_scores = cross_val_score(gb_model, X, y, cv=10)
gb_accuracy = gb_cv_scores.mean()

print(gb_accuracy)

In [None]:
# Train on the complete training set, then predict the hypothetical proteins
# (fit() returns the fitted estimator, so the two calls can be chained).
gb_predictions = gb_model.fit(X, y).predict(X_hypo)

# Wrap the raw prediction array in a DataFrame, one column per label
gb_predictions_df = pd.DataFrame(gb_predictions, columns=y_hypo.columns)

# Show the predicted labels for the hypothetical proteins
print("GB Predictions for Hypothetical Proteins:")
print(gb_predictions_df)

# Persist the predictions as CSV (without the row index)
gb_file = "gb_predictions.csv"
gb_predictions_df.to_csv(gb_file, index=False)

(ISSUE) MLP

In [None]:
# Scaler fitted on the whole training set. X_scaled / X_hypo_scaled are kept
# for the final fit-and-predict step on the hypothetical proteins; they are
# deliberately NOT used for cross-validation (see below).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_hypo_scaled = scaler.transform(X_hypo)

mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=0)

# Cross-validate scaler + MLP together as one pipeline on the raw features.
# Scaling X once up front and cross-validating on X_scaled would let every
# held-out fold contribute to the scaler's mean/variance (data leakage) and
# bias the score. With a pipeline the scaler is re-fitted on the training part
# of each fold only. cross_val_score fits clones of the estimator, so
# mlp_model itself is still unfitted afterwards.
mlp_cv_pipeline = make_pipeline(StandardScaler(), mlp_model)
mlp_accuracy = np.mean(cross_val_score(mlp_cv_pipeline, X, y, cv=10))

print(mlp_accuracy)

In [None]:
# Train on the standardized features. The hypothetical proteins are predicted
# from X_hypo_scaled, so the model has to be fitted in the same scaled feature
# space; fitting on the raw X would make training and prediction inputs
# inconsistent and the predictions meaningless.
mlp_model.fit(X_scaled, y)

# Make predictions on the new data
mlp_predictions = mlp_model.predict(X_hypo_scaled)

# Convert the predictions to a DataFrame, one column per label
mlp_predictions_df = pd.DataFrame(mlp_predictions, columns=y_hypo.columns)

# Display the predictions for new proteins
print("MLP Predictions for Hypothetical Proteins:")
print(mlp_predictions_df)

# Save the DataFrame to the CSV file (without the row index)
mlp_file = "mlp_predictions.csv"
mlp_predictions_df.to_csv(mlp_file, index=False)