# In [None]:
# Sólo código necesario

#Librerías y paquetes necesarios para el algoritmo.
import utils
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, ConfusionMatrixDisplay, confusion_matrix

# Load the database.
wdf = pd.read_csv('corrected_FINAL_DDBB.csv', header="infer")

# Outlier removal, followed by checks for duplicated and missing values.
# Only the outliers are actually removed; duplicates and missing values are
# merely computed for inspection.

# Conservation scores of the rows labelled 1.
x1 = list(wdf.loc[wdf["label"] == 1, "residue_conserv"])
l = list(x1)

# Sorted scores with the nine lowest values trimmed off. Not used further in
# the code shown here; kept from the exploratory analysis.
new_l = sorted(l)[9:]

# Remove the outliers from the original data: rows labelled 1 whose
# conservation score is at or below the hard-coded cut-off.
# NOTE(review): 0.6197 presumably corresponds to the nine values trimmed
# above -- confirm before changing the dataset.
wdf = wdf.drop(wdf.loc[(wdf["label"] == 1) & (wdf["residue_conserv"] <= 0.6197)].index)

# Duplicated mutations and per-column missing-value counts are stored in
# variables instead of being left as bare expressions: a bare expression is
# discarded by a script and may be echoed by a notebook, whereas an assignment
# keeps the result available for inspection without displaying anything.
duplicated_mutations = wdf[wdf["mutation"].duplicated()]
missing_value_counts = wdf.isnull().sum()

# Boolean masks over the last column of the table: value 0 is flagged as
# 'benign' (y_be) and value 1 as 'pathogenic' (y_pa).
data_matrix = wdf.values
label_column = data_matrix[:, -1]
y_be = label_column == 0
y_pa = label_column == 1

# Features are every column from the third one up to (not including) the last;
# the last column is the integer target.
X = data_matrix[:, 2:-1]
y = label_column.astype('int')

# Split the data into a training set and a test set (25 % held out for
# testing, fixed seed). The benign samples of the training set are
# oversampled further below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1012,
)

# Column names given to the training features once they are wrapped in a
# dataframe.
feature_columns = [
    'initial_aa',
    'final_aa',
    'topological_domain',
    'functional_domain',
    'd_size',
    'd_hf',
    'd_vol',
    'd_msa',
    'd_charge',
    'd_pol',
    'd_aro',
    'residue_conserv',
    'secondary_str',
    'pLDDT',
    'str_pos',
    'MTR',
]

# Dataframes for the training features and the training labels.
X_train_dfos = pd.DataFrame(X_train, columns=feature_columns)
y_train_dfos = pd.DataFrame(y_train, columns=['label'])

##################### Oversampling ###########################
# Put features and labels side by side so rows can be filtered by label.
dfos = pd.concat([X_train_dfos, y_train_dfos], axis=1)

# Rows labelled 1 are treated as the majority class, rows labelled 0 as the
# minority class.
df_majority = dfos[dfos["label"] == 1]
df_minority = dfos[dfos["label"] == 0]

# Resample the label-0 rows with replacement until they amount to half the
# number of label-1 rows; the seed makes the draw reproducible.
minority_target_size = len(df_majority) // 2
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=minority_target_size,
    random_state=0,
)

# Training table made of the label-1 rows followed by the resampled label-0
# rows.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Oversampled training arrays for the machine learning models: every column
# but the last holds features, the last one holds the integer label.
upsampled_values = df_upsampled.values
X_train_os = upsampled_values[:, :-1]
y_train_os = upsampled_values[:, -1].astype("int")

# Encode the data: the oversampled training set and the test set go through
# the project's categorical encoding helper, which returns four objects.
encoded_train_test = utils.categorical_encoding(X_train_os, X_test)
X_train_enc, X_test_enc, X_train_df, X_test_df = encoded_train_test
X_train_df.columns

# The 'ensemble' model combines the LR1, SVC and RF algorithms. Before it is
# built, the project's feature-selection helper is run on the encoded data
# with n = 45.
selected_train_test = utils.select_features(
    X_train_enc,
    X_train_df,
    y_train_os,
    X_test_enc,
    n=45,
)
X_train_fs, X_test_fs, featEn, posEn = selected_train_test



# Soft-voting ensemble of three classifiers, preceded by feature scaling.
#
# Logistic regression member. The former `multi_class="ovr"` argument is no
# longer passed: it is deprecated since scikit-learn 1.5 and scheduled for
# removal, and for the binary 0/1 labels used here the default setting fits
# the same single binary model.
logistic_member = LogisticRegression(
    solver="saga",
    penalty="l2",
    max_iter=10000,
    class_weight={0: 3, 1: 2},
    C=2.91,
    random_state=8,
)

# Linear support vector member; `probability=True` is required so that it can
# take part in soft voting through `predict_proba`.
# NOTE(review): per the scikit-learn documentation, `degree` and `gamma` only
# apply to non-linear kernels, so they should have no effect here -- confirm.
svc_member = SVC(
    kernel="linear",
    class_weight={0: 1, 1: 1},
    probability=True,
    decision_function_shape="ovr",
    degree=2,
    gamma="auto",
    C=1,
    random_state=45,
)

# Shallow random forest member.
forest_member = RandomForestClassifier(
    max_depth=3,
    criterion="log_loss",
    max_features="log2",
    oob_score=False,
    min_samples_split=2,  # min = 5
    class_weight={0: 3, 1: 1},
    random_state=45,
)

# The members vote with weights 1 (logistic), 0.5 (SVC) and 1.75 (RF), in the
# same order as the estimator list.
pipeline_ensemble_soft = Pipeline([
    ("scaler", StandardScaler()),
    ("Ensemble_soft", VotingClassifier(
        estimators=[
            ("logistic", logistic_member),
            ("SVC", svc_member),
            ("RF", forest_member),
        ],
        voting="soft",
        weights=[1, 0.5, 1.75],
    )),
])

# Fit scaler and ensemble on the selected features of the oversampled
# training set.
pipeline_ensemble_soft.fit(X_train_fs, y_train_os)


#print("AUC-ROC VALUES:")
#print(utils.get_roc_auc_score(pipeline_ensemble_soft, X_train_fs, y_train_os, 
                                  #X_test_fs, y_test, print_table=True))

# Ask the user for the mutation to analyse. Surrounding whitespace is removed
# so that an accidental space does not end up inside the mutation name, and an
# empty answer is rejected right away instead of being passed on to the
# descriptor generation.
mut = input("Enter your mutation:").strip()
if not mut:
    raise ValueError("No mutation was entered.")

# The rest of the code works on dataframes, so the mutation is wrapped in a
# single-row dataframe with the "Mutationppt" column. It is built directly
# rather than by concatenating onto an empty frame.
challenge = pd.DataFrame({"Mutationppt": [mut]})

# Generate the descriptors: the project's KCNQ2_DDBB_generation helper is
# given the dataframe holding the entered mutation.
ch_df = utils.KCNQ2_DDBB_generation(challenge)

# Possible duplicated values are removed. This has little effect for now,
# because the input only accepts one mutation at a time, but it will be useful
# once several mutations can be entered.

# Rows whose mutation name already appeared earlier in the table.
duplicate_mask = ch_df["Mutationppt"].duplicated()
ch_df[duplicate_mask]

# Keep a single row per mutation name and count how many rows were dropped.
ch_df_clean = ch_df.drop_duplicates(subset=["Mutationppt"])
rest = len(ch_df) - len(ch_df_clean)

# Names of all the variants that remain after deduplication.
variants_names = list(ch_df_clean["Mutationppt"])

# Adapt the table to what the algorithm requires, then convert it into a
# NumPy array.
ch_df_clean = utils.preprocessing_ch(ch_df_clean)
X_ch = ch_df_clean.to_numpy()

# Encode the query together with the oversampled training set. This
# reassigns X_train_enc and X_train_df.
encoded_train_query = utils.categorical_encoding(X_train_os, X_ch)
X_train_enc, X_ch_enc, X_train_df, X_ch_df = encoded_train_query

# Apply the same feature selection as for the ensemble algorithm (n = 45).
# This reassigns X_train_fs.
selected_train_query = utils.select_features(
    X_train_enc,
    X_train_df,
    y_train_os,
    X_ch_enc,
    n=45,
)
X_train_fs, X_ch_enc_fs, feat_pred, pos_pred = selected_train_query

# Make the prediction with the MLb-KCNQ2 algorithm (the fitted ensemble).
KCNQ2e_y_ch_p = pipeline_ensemble_soft.predict(X_ch_enc_fs)

# Class probabilities behind each prediction.
KCNQ2eprob = pipeline_ensemble_soft.predict_proba(X_ch_enc_fs)


def _success_rate_label(probability_pct):
    """Return the success-rate band for a probability given in percent.

    Bands: up to 60 -> "VERY LOW", up to 70 -> "LOW", up to 80 -> "MODERATE",
    up to 90 -> "HIGH", anything above 90 -> "VERY HIGH". Upper bounds are
    inclusive.
    """
    bands = ((60, "VERY LOW"), (70, "LOW"), (80, "MODERATE"), (90, "HIGH"))
    for upper_bound, label in bands:
        if probability_pct <= upper_bound:
            return label
    return "VERY HIGH"


# Print the result of the prediction. Predictions are reported row by row:
# comparing the whole prediction array to 0 inside an `if` only works while
# the array holds exactly one element, so iterating keeps the report working
# when several mutations are analysed at once.
for predicted_label, class_probabilities in zip(KCNQ2e_y_ch_p, KCNQ2eprob):
    if predicted_label == 0:
        print("Benign mutation")
        # Column 0 of the probabilities is used for the benign class.
        confidence_pct = class_probabilities[0] * 100
    else:
        print("Pathogenic mutation")
        # Column 1 of the probabilities is used for the pathogenic class.
        confidence_pct = class_probabilities[1] * 100
    print("Success rate: " + _success_rate_label(confidence_pct))
    

