# SVM con *scorings*
Prueba de SVM con los scorings calculados con:
- 111 ligandos
    - 27 Activos
- 402 Conformaciones de la porteína CDK2
    - Conformaciones experimentales
- 3 progrmas de acoplamiento molecular

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pylab

Carga del DataFrame Principal:

In [401]:
data_dir = './B_DATOS'
df_cdk2_labels = pd.read_json(F"{data_dir}/TABLA_MTDATA_CDK2_402_crys_LIGS_INFO_LABELS_AUC_docking.json")
df_cdk2_labels.columns

Index(['Title', 'Date', 'Entities', 'ChainID', 'Resolution', 'Identity',
       'Coverage', 'NumGaps', 'GapLen', 'GapPos', 'NumLigs', 'NameLigs',
       'Inhib', 'Inhib_mass', 'Labels_conf', 'AUC_vrd_16x', 'AUC_vrd_8x',
       'AUC_vina', 'AUC_ad4_LC', 'AUC_ad4_LE'],
      dtype='object')

Carga de los dataframes con los mejores scores de las proteínas para cada programa de *docking*.

In [197]:
# Autodock 4. LE (lowest energy), LC (largest Cluster)
df_ad4_results_LE = pd.read_csv(F'./B_DATOS/vs_docking_crys_ensemble_AD4_LE.csv', index_col=0)
df_ad4_results_LC = pd.read_csv(F'./B_DATOS/vs_docking_crys_ensemble_AD4_LC.csv', index_col=0)
# Autodock Vina
df_vina_results = pd.read_csv(F'./B_DATOS/vs_docking_crys_ensemble_VINA.csv', index_col=0)
# Vinardo scoring 16x
df_vinardo_results = pd.read_csv(F'./B_DATOS/vs_docking_crys_ensemble_VINARDO.csv', index_col=0)
# vinardo scorings 8x
df_vinardo_results = pd.read_csv(F'./B_DATOS/vs_docking_crys_ensemble_VINARDO_ex8.csv', index_col=0)

## Vinardo

# SVM Lineal con Vinardo 16x

In [230]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, make_scorer

In [413]:
# Primer intento para vinardo a 16x
X = df_vinardo_results.drop(["ActiveInactive"], axis = 1)
y = (df_vinardo_results["ActiveInactive"] == "Active").astype(np.float32)

print(F'Dimensiones de X: {X.shape}')
print(F'Dimensiones de y: {y.shape}')

Dimensiones de X: (111, 402)
Dimensiones de y: (111,)


In [414]:
# Cambio en la versión del dataset X
# best_vinardo_confs = df_cdk2_labels[df_cdk2_labels.AUC_vrd_16x >= 0.68].index
best_vinardo_confs = df_cdk2_labels[df_cdk2_labels.Labels_conf != 'active'].index
X = X[best_vinardo_confs]

print(F'Dimensiones de X: {X.shape}')

Dimensiones de X: (111, 288)


## Pipeline

In [434]:
# Se establece el pipeline para el gridSearch
SVCpipe = Pipeline([
        ("scaler", StandardScaler()),
        ("SVC", LinearSVC(loss = 'hinge'))
    ])

parametros = {'SVC__C': [10**i for i in range(-2, 5)]}
print("C:", parametros["SVC__C"])
linearSVC = GridSearchCV(SVCpipe, parametros, 
                         scoring = 'roc_auc',
                         cv = 5, return_train_score = True)

# Split del dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)


C: [0.01, 0.1, 1, 10, 100, 1000, 10000]


In [429]:
# Entrenamiento con el grid
linearSVC.fit(X_DUD, y_DUD)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('SVC',
                                        LinearSVC(C=1.0, class_weight=None,
                                                  dual=True, fit_intercept=True,
                                                  intercept_scaling=1,
                                                  loss='hinge', max_iter=1000,
                                                  multi_class='ovr',
                                                  penalty='l2',
                                                  random_state=None, tol=0.0001,
                                                  verbose=0))],
            

In [430]:
# DEfinimos el rango de parámetros para el parámtro C
bestSVC = linearSVC.best_estimator_
bestSVC.score(X_train, y_train)

0.8068181818181818



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('SVC',
                 LinearSVC(C=0.1, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='hinge', max_iter=1000, multi_class='ovr',
                           penalty='l2', random_state=None, tol=0.0001,
                           verbose=0))],
         verbose=False)

In [433]:
bestSVC.score(X_test, y_test)

0.782608695652174

In [419]:
# Split de los sets de entrenamiento y prueba
df_vrd_DUD2006 = pd.read_csv(F'./B_DATOS/vs_docking_DUD2006_vs_402_crys_vinardo_8x.csv', index_col=0)
y_DUD = pd.Series(df_vrd_DUD2006.Actividad == "Active", dtype = int)
X_DUD = df_vrd_DUD2006.drop(["Actividad"], axis = 1)
X_DUD = X_DUD[best_vinardo_confs]

In [420]:
bestSVC.score(X_DUD, y_DUD)

0.8909599254426841

In [421]:
pred = bestSVC.predict(X_DUD)
print("Vinardo 8x:", roc_auc_score( y_true = y_DUD, y_score = pred ))

Vinardo 8x: 0.6888594235508412


In [368]:
# Primer intento para vinardo a 16x
X = df_vinardo_results.drop(["ActiveInactive"], axis = 1)
y = (df_vinardo_results["ActiveInactive"] == "Active").astype(np.float32)

print(F'Dimensiones de X: {X.shape}')
print(F'Dimensiones de y: {y.shape}')

Dimensiones de X: (111, 402)
Dimensiones de y: (111,)


## SVM lineal con las mejores conformaciones
Seleccionamos las conformaciones con un AUC en vinardo mayor a 0.75

In [20]:
best_vinardo_confs = df_cdk2_labels[df_cdk2_labels.AUC_vrd_16x >= 0.8].index
best_vinardo_confs.shape

df = df_vinardo_results[["ActiveInactive"] + best_vinardo_confs] 
X = df.iloc[:, 1:]
y = (df["ActiveInactive"] == "Active").astype(np.float32)
X.shape

KeyError: "None of [Index(['ActiveInactive1h01', 'ActiveInactive1h07', 'ActiveInactive1ke5',\n       'ActiveInactive1ke6', 'ActiveInactive1ogu', 'ActiveInactive1oiq',\n       'ActiveInactive1pxm', 'ActiveInactive1urw', 'ActiveInactive1vyz',\n       'ActiveInactive2btr', 'ActiveInactive2c4g', 'ActiveInactive2c6o',\n       'ActiveInactive2iw6', 'ActiveInactive2iw8', 'ActiveInactive2iw9',\n       'ActiveInactive2r3p', 'ActiveInactive2r64', 'ActiveInactive2vv9',\n       'ActiveInactive3dog', 'ActiveInactive3ig7', 'ActiveInactive3lfn',\n       'ActiveInactive3ns9', 'ActiveInactive3qu0', 'ActiveInactive3r9h',\n       'ActiveInactive3rk9', 'ActiveInactive3s0o', 'ActiveInactive3sqq',\n       'ActiveInactive3unj', 'ActiveInactive3unk', 'ActiveInactive4bzd',\n       'ActiveInactive4cfn', 'ActiveInactive4eor', 'ActiveInactive4fkj',\n       'ActiveInactive4fkq', 'ActiveInactive5if1', 'ActiveInactive5nev',\n       'ActiveInactive6guf'],\n      dtype='object')] are in the [columns]"

In [None]:
# Se establece el pipeline para el gridSearch
SVCpipe = Pipeline([
        ("scaler", StandardScaler()),
        ("SVC", LinearSVC(loss = 'hinge', max_iter = 10000))
    ])

parametros = {'SVC__C': [10**i for i in range(-2, 5)]}
print("C:", parametros["SVC__C"])

# Grid par CV
linearSVC = GridSearchCV(SVCpipe, parametros, cv = 5, return_train_score = True)

# split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)
linearSVC.fit(X_train, y_train)

## Intento con Vina (peor desempeño que vinardo)

In [None]:
# Primer intento para vinardo
df = df_vina_results
X = df.iloc[:, 1:]
y = (df["ActiveInactive"] == "Active").astype(np.float32)

# Se establece el pipeline para el gridSearch
SVCpipe = Pipeline([
        ("scaler", StandardScaler()),
        ("SVC", LinearSVC(loss = 'hinge', max_iter = 10000))
    ])

parametros = {'SVC__C': [10**i for i in range(-2, 5)]}
print("C:", parametros["SVC__C"])

# Grid par CV
linearSVC = GridSearchCV(SVCpipe, parametros, cv = 5, return_train_score = True)

# split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)
linearSVC.fit(X_train, y_train)

In [None]:
# Mejor modelo para vina
bestSVC_vina = linearSVC.best_estimator_
# Seleccionamos el mejor modelo
linearSVC.best_params_

In [None]:
# Entrenando el mejor modelo nuevamente
bestSVC_vina.fit(X_train, y_train)

In [None]:
# Exactitud Train
bestSVC_vina.score(X_train, y_train)

In [None]:
# Exactitud Test
bestSVC.score(X_test, y_test)