## SMOTE

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

datos = pd.read_csv('file.csv')

# 'PRED' is the target column; every other column is used as a feature.
y = datos['PRED']
X = datos.drop(columns=['PRED'])

# Oversample with SMOTE on the complete dataset, i.e. before any
# train/test split is made.
# NOTE(review): the model-assessment cell reads 'file_results.csv' and only
# then splits it, so synthetic samples can end up in the test split and
# inflate the reported test metrics - confirm this is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Re-attach the resampled target to the resampled features.
balanced_data = pd.DataFrame(X_resampled, columns=X.columns).assign(PRED=y_resampled)

# Persist the balanced table without the index column.
balanced_data.to_csv('file_results.csv', index=False)

## Molecular descriptor computation

In [None]:
import pandas as pd
from propy import PyPro
from propy.GetProteinFromUniprot import GetProteinSequence
from propy.GetProteinFromUniprot import GetProteinSequenceFromTxt as gpst
from Bio import SeqIO

# Base name shared by the input FASTA file and the output CSV file.
desc_var = 'file'

# One row of descriptor values per FASTA record.
paac_list = []

# The context manager closes the FASTA handle once parsing is done
# (the handle was previously left open).
with open(f'{desc_var}.fasta', 'r') as container:
    for record in SeqIO.parse(container, "fasta"):
        DesObject = PyPro.GetProDes(record.seq)

        # 'lamda' (sic) is the keyword name used in the GetPAAC call.
        calc = DesObject.GetPAAC(lamda=5, weight=0.05)
        # Alternative descriptor set:
        # calc = DesObject.GetDPComp()

        paac_list.append(list(calc.values()))

# Create a pandas DataFrame from the results.
df = pd.DataFrame(paac_list)

# Name the columns Desc_1 .. Desc_N, where N is the number of descriptors.
column_names = [f"Desc_{i + 1}" for i in range(len(df.columns))]
df.columns = column_names

# Write the DataFrame to a CSV file with the custom column names.
df.to_csv(f'{desc_var}.csv', index=False, header=True)

## Model assessments

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import StackingClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import svm
from sklearn.linear_model import LogisticRegression

import joblib

datos = pd.read_csv('file_results.csv')

# 'PRED' is the target column; every other column is used as a feature.
y = datos['PRED']
X = datos.drop(columns=['PRED'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# Fit the classifier on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Alternative estimators (disabled):
# model1 = XGBClassifier(random_state=42)
# model2 = lgb.LGBMClassifier(random_state=42)
# model = StackingClassifier(estimators=[('rf', model1), ('ann', model2)]).fit(X_train, y_train)

# Persist the fitted model to disk.
joblib.dump(model, 'model.pkl')

# 10-fold cross-validation on the training split only.
scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']
scores = cross_validate(
    model, X_train, y_train, scoring=scoring, cv=10, return_train_score=True
)

# Mean validation-fold score per metric, in the same order as `scoring`.
accuracy, F1_score, precision, sensitivity_recall = (
    scores[f'test_{metric}'].mean() for metric in scoring
)

for label, value in (
    ("Accuracy:", accuracy),
    ("F1_score:", F1_score),
    ("Precision:", precision),
    ("Sensitivity/Recall:", sensitivity_recall),
):
    print(label, value)


In [None]:
# Predict labels for the held-out test data.
y_pred = model.predict(X_test)

# Compute the metrics on the test data; F1, precision and recall are
# macro-averaged.
accuracy_test = accuracy_score(y_test, y_pred)
f1_score_test, precision_test, recall_test = (
    metric(y_test, y_pred, average='macro')
    for metric in (f1_score, precision_score, recall_score)
)

# Print the test-set metrics.
for label, value in (
    ("Accuracy (Testing):", accuracy_test),
    ("F1_score (Testing):", f1_score_test),
    ("Precision (Testing):", precision_test),
    ("Sensitivity/Recall (Testing):", recall_test),
):
    print(label, value)

In [None]:
# Perform cross-validation predictions
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_predict

# Class probabilities for every training row, predicted with 10-fold
# cross-validation.
y_train_probs = cross_val_predict(model, X_train, y_train, cv=10, method='predict_proba')

# roc_auc_score takes the full (n_samples, n_classes) matrix only for
# multi-class targets. For a two-class target it needs a 1-D vector of scores
# for the class with the greater label (the second predict_proba column);
# passing both columns raises ValueError.
if y_train_probs.shape[1] == 2:
    y_train_scores = y_train_probs[:, 1]
else:
    y_train_scores = y_train_probs

# Cross-validated AUC on the training split (one-vs-rest when multi-class;
# `multi_class` is ignored for binary targets).
auc_train = roc_auc_score(y_train, y_train_scores, multi_class='ovr')
print("Training AUC:", auc_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Predict class probabilities for the test set.
y_probs = model.predict_proba(X_test)

# roc_auc_score takes the full (n_samples, n_classes) matrix only for
# multi-class targets. For a two-class target it needs a 1-D vector of scores
# for the class with the greater label (the second predict_proba column);
# passing both columns raises ValueError.
if y_probs.shape[1] == 2:
    y_scores = y_probs[:, 1]
else:
    y_scores = y_probs

# Compute the AUC (one-vs-rest when multi-class; `multi_class` is ignored for
# binary targets).
auc = roc_auc_score(y_test, y_scores, multi_class='ovr')

print("Testing AUC:", auc)