<a href="https://colab.research.google.com/github/kmenesesrivera/codonusagebias/blob/main/3-Ajuste_de_Hiperparametros.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Ajuste de Hiperparametros

##Lectura de Datos

In [None]:
import urllib.request
import zipfile, urllib.request, shutil
from __future__ import absolute_import, division, print_function, unicode_literals


url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip'
filename = 'codon_usage.csv.zip'

In [None]:
with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
    with zipfile.ZipFile(filename) as zf:
        zf.extractall()

In [None]:
!unzip codon_usage.csv

Archive:  codon_usage.csv
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
Archive:  codon_usage.csv.zip
replace codon_usage.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: codon_usage.csv         
replace __MACOSX/._codon_usage.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: __MACOSX/._codon_usage.csv  


In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
labelencoder = LabelEncoder()

In [100]:
#Lectura del dataset
file_name = "codon_usage.csv"
dataset =  pd.read_csv('https://drive.google.com/uc?export=download&id=1Z4v43cvTwp920NyOdboDKP7_ytC_0tBC')

# Se observa que hay datos str que no permiten manipular los demás como numéricos.
dataset[['UUU', 'UUC']] = dataset[['UUU', 'UUC']].apply(pd.to_numeric, errors='coerce')
null_UUU = dataset['UUU'].isna().sum()
null_UUC= dataset['UUC'].isna().sum()
print ("Cantidad de datos nulos en codon UUU ",null_UUU)
print ("Cantidad de datos nuls en codon UUC ",null_UUC)


Cantidad de datos nulos en codon UUU  2
Cantidad de datos nuls en codon UUC  1


  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
dataset.describe

<bound method NDFrame.describe of 0      Kingdom  DNAtype  SpeciesID  ...      UAA      UAG      UGA
0      Kingdom  DNAtype  SpeciesID  ...      UAA      UAG      UGA
1          vrl        0     100217  ...  0.00251    5e-04        0
2          vrl        0     100220  ...  0.00271  0.00068        0
3          vrl        0     100755  ...  0.00391        0  0.00144
4          vrl        0     100880  ...  0.00261  0.00157        0
...        ...      ...        ...  ...      ...      ...      ...
13024      pri        0       9601  ...  0.00091  0.00091  0.00638
13025      pri        1       9601  ...  0.00242  0.00097  0.01887
13026      pri        1       9602  ...  0.00356  0.00119  0.02017
13027      pri        0       9606  ...  0.00099  0.00079  0.00156
13028      pri        1       9606  ...  0.00156  0.00114  0.02161

[13029 rows x 69 columns]>

In [None]:
def preprocess_dataset(dataset, save_metadata=True):
  
  preprocessed_dataset = dataset.copy()

  #Determinamos aquellas variables que tengan un porcentaje de registros únicos por cada variable mayor al valor de 0.7
  threshold = 0.7

  unique_percentages = dataset.nunique() / len(dataset)

  criteria = unique_percentages > threshold

  columns_to_filter = unique_percentages[criteria].keys()
  
  # Está columna contiene datos descriptivos, por tanto será transformada a str.

  preprocessed_dataset['SpeciesName'] = preprocessed_dataset['SpeciesName'].astype(str)
  preprocessed_dataset['SpeciesName']
 
  #Eliminamos las variables seleccionadas en el paso anterior
  preprocessed_dataset.drop(columns_to_filter, axis=1, inplace=True)
 
  #Eliminamos los registros duplicados
  preprocessed_dataset.drop_duplicates(keep="first", inplace=True)

  #Determinamos que variables son del tipo númerico y cuales son categóricas
  numeric_columns = list()
  categorical_columns = list()
  dictionary_of_columns_with_index_to_categorical = dict()
  dictionary_of_columns_with_categorical_to_index = dict()

  for column in preprocessed_dataset:
    #Determinamos si la variable es numérica o no
    if pd.api.types.is_numeric_dtype(preprocessed_dataset[column]):
      numeric_columns.append(column)
    else:
      #Modificamos el tipo de dato de la variable mediante "astype"
      preprocessed_dataset[column] = preprocessed_dataset[column].astype("category")

      #Verificamos si el tipo de dato de la variable fue transformado a categórico correctamente
      if not pd.api.types.is_categorical_dtype(preprocessed_dataset[column]):
        raise Exception("La columna {} no se transformó correctamente a categórica".format(column))

      dictionary_of_columns_with_index_to_categorical[column] = dict()
      dictionary_of_columns_with_categorical_to_index[column] = dict()
      
      #Indexamos los valores (categorías), sin tomar en consideración los nulos, de la variable y guardamos esa información en los diccionarios
      for index, category in enumerate(preprocessed_dataset[column].cat.categories):
        dictionary_of_columns_with_index_to_categorical[column][index] = category
        dictionary_of_columns_with_categorical_to_index[column][category] = index
      
      categorical_columns.append(column)
  
  #Reemplazamos los nulos con la mediana sólo de aquellas variables numéricas
    median_of_numeric_columns = preprocessed_dataset[numeric_columns].median()
    preprocessed_dataset[numeric_columns] = preprocessed_dataset[numeric_columns].fillna(median_of_numeric_columns)

  #Transformamos a números los valores (categorías) de las variables categóricas sin considerar los nulos
  preprocessed_dataset.replace(dictionary_of_columns_with_categorical_to_index, inplace=True)

  #Determinamos aquellas variables que tengan un porcentaje de registros nulos por cada variable mayor al valor de 0.7
  threshold = 0.7

  null_percentages = preprocessed_dataset[categorical_columns].isna().sum() / len(preprocessed_dataset)

  criteria = null_percentages > threshold

  columns_to_filter = null_percentages[criteria].keys()

  #Eliminamos las variables seleccionadas en el paso anterior
  preprocessed_dataset.drop(columns_to_filter, axis=1, inplace=True)

  #Eliminamos los registros duplicados
  preprocessed_dataset.drop_duplicates(keep="first", inplace=True)
 
  return preprocessed_dataset

In [76]:
preprocessed_dataset = preprocess_dataset(dataset)

In [77]:
X1 = preprocessed_dataset.drop("Kingdom", axis=1)
X2 = preprocessed_dataset.drop("DNAtype", axis=1)
Y1 = preprocessed_dataset["Kingdom"]  
Y2 = preprocessed_dataset["DNAtype"] 

In [78]:
#Mediante el método "train_test_split" usaremos el 20% de la data para probar el modelo. El parámetro "random state" nos sirve para
#poder replicar la misma separación
x1_train, x1_test, y1_train, y1_test = train_test_split(X1, Y1, test_size=0.2, random_state=70)

In [79]:
#Mediante el método "train_test_split" usaremos el 20% de la data para probar el modelo. El parámetro "random state" nos sirve para
#poder replicar la misma separación
x2_train, x2_test, y2_train, y2_test = train_test_split(X2, Y2, test_size=0.2, random_state=70)

##Eliminas algunas caracteristicas 
Basandonos en el colab anterior decidimos usar solo 5 de las caracteristicas 

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif

In [116]:
filtrado = SelectKBest(mutual_info_classif, k=5).fit(X1, Y1)
X_new = filtrado.transform(x1_train)
X_new[:5]

array([[4247., 3000., 2830., 3652., 1432.],
       [4263., 4322., 2221., 3183., 2905.],
       [ 123.,  467., 1439.,  940.,  916.],
       [3585., 5129., 1948., 3523., 2630.],
       [ 151., 1115., 1498., 1610.,  482.]])

In [117]:
Xt_new = filtrado.transform(x1_test )
Xt_new[:5]

array([[ 266.,  883., 1363., 1572.,  759.],
       [3960., 3499., 2236., 3477., 1840.],
       [3546., 3139., 1588., 1734., 1960.],
       [4001., 4489., 2640., 3026., 2429.],
       [4611., 3609., 2967., 3879., 3011.]])

##Entrenamos cada modelo

In [111]:
from sklearn.model_selection import GridSearchCV

###Decision Tree

Decission Tree Base

In [121]:
dt = DecisionTreeClassifier()
dt.fit(X_new, y1_train)
print('Max Depth: ', dt.tree_.max_depth)
print('Score: ',dt.score(Xt_new,y1_test))
print('F1: ', f1_score(y1_test, dt.predict(Xt_new), average='weighted'))

Max Depth:  26
Score:  0.6760292420161601
F1:  0.6769358937906165


Busqueda por Grilla

In [123]:
params = {"max_depth": [10,11,12,13,14,15,16,17, None], "max_features": [3,4,5], "min_samples_leaf": [3,4,5,6,7], "criterion": ["gini", "entropy"]}
tree = DecisionTreeClassifier()
dt_cv = GridSearchCV(tree, params, cv=5)
dt_cv.fit(X_new, y1_train)
print("Mejores Parametros: {}".format(dt_cv.best_params_))
print("Mejor Score {}".format(dt_cv.best_score_))



Mejores Parametros: {'criterion': 'gini', 'max_depth': 10, 'max_features': 5, 'min_samples_leaf': 7}
Mejor Score 0.7042534979012361


Reentrenamos como los parametros encontrados

In [124]:
dt_final = DecisionTreeClassifier(max_depth=11, max_features=5, min_samples_leaf=7, criterion='gini')
dt_final.fit(X_new, y1_train)
print('Score: ',dt_final.score(Xt_new,y1_test))
print('F1: ', f1_score(y1_test, dt_final.predict(Xt_new), average='weighted'))

Score:  0.7079646017699115
F1:  0.6927558820382347


###Random Forest

Random Forest Base

In [125]:
rt = RandomForestClassifier(40, n_jobs=-1, oob_score=True)
rt.fit(X_new, y1_train)
print('Score: ',dt.score(Xt_new,y1_test))
print('F1: ', f1_score(y1_test, rt.predict(Xt_new), average='weighted'))

Score:  0.6760292420161601
F1:  0.7364526077184702


Busqueda por Grilla

In [127]:
params = {"n_estimators": [40,50,100,150,200], "max_depth": [10,11,12,13,14,15,16,17], "max_features": [3,4,5], "min_samples_leaf": [3,4,5,6,7]}
forest = RandomForestClassifier(oob_score=True)
rf_cv = GridSearchCV(forest, params, cv=5)
rf_cv.fit(X_new, y1_train)
print("Mejores Parametros: {}".format(rf_cv.best_params_))
print("Mejor Score {}".format(rf_cv.best_score_))



Mejores Parametros: {'max_depth': 10, 'max_features': 3, 'min_samples_leaf': 3, 'n_estimators': 40}
Mejor Score 0.7440845505330588


In [129]:
rf_final = RandomForestClassifier(oob_score=True, max_depth=16, max_features=4, min_samples_leaf=3, n_estimators=150)
rf_final.fit(X_new, y1_train)
print('Score: ',rf_final.score(Xt_new,y1_test))
print('F1: ', f1_score(y1_test, rf_final.predict(Xt_new), average='weighted'))

Score:  0.749134282416314
F1:  0.7324586841528481


###KNN

KNN BASE

In [131]:
knn = KNeighborsClassifier(4)
knn.fit(X_new, y1_train)
print('Score: ',knn.score(Xt_new,y1_test))
print('F1: ', f1_score(y1_test, knn.predict(Xt_new), average='weighted'))

Score:  0.7222008464794152
F1:  0.7131797116942324


In [132]:
params = {"n_neighbors": [3,4,5, 6,8,10], "weights": ['uniform', 'distance'], "metric": ['euclidean', 'manhattan']}
forest = KNeighborsClassifier()
knn_cv = GridSearchCV(forest, params, cv=5)
knn_cv.fit(X_new, y1_train)
print("Mejores Parametros: {}".format(knn_cv.best_params_))
print("Mejor Score {}".format(knn_cv.best_score_))



Mejores Parametros: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Mejor Score 0.7170496384163372


In [134]:
knn_final = KNeighborsClassifier(n_neighbors=4, metric='euclidean',weights='distance')
knn_final.fit(X_new, y1_train)
print('Score: ',knn_final.score(Xt_new,y1_test))
print('F1: ', f1_score(y1_test, knn_final.predict(Xt_new), average='weighted'))

Score:  0.7345132743362832
F1:  0.7259008097900985


###XGBoost

#### Y1

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

#Requiere de 65 características según la selección de características 
param_grid = {
 'max_depth':range(9,10),
 'subsample': [0.7,0.9]
}

xgb_Model = XGBClassifier(learning_rate=0.1, n_estimators=80, min_child_weight=1, subsample= 0.7, nthread=-1, n_jobs=-1,scoring='f1')

xgb_Grid = GridSearchCV (estimator= xgb_Model, param_grid = param_grid )
xgb_Grid.fit(x1_train,y1_train)

print ("optimal max_depth",xgb_Grid.best_estimator_.max_depth) 
print ("optimal xgb_Grid.best_score_",xgb_Grid.best_score_)
  



In [None]:
xgb_Grid.score(x1_test, y1_test)

#### Y2

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

#Requiere de 65 características según la selección de características 

param_grid = {
 'max_depth':range(9,10)
}

xgb_Model2 = XGBClassifier(learning_rate=0.01, n_estimators=80, min_child_weight=2, gamma=0, colsample_bytree=0.8, subsample= 0.7, objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27, n_jobs=-1,scoring='roc_auc')
xgb_Grid2 = GridSearchCV (estimator= xgb_Model2, param_grid = param_grid )
 
xgb_Grid2.fit(x2_train,y2_train)
 
print ("optimal max_depth",xgb_Grid2.best_estimator_.max_depth)
print ("optimal xgb_Grid2.best_score_",xgb_Grid2.best_score_)
 

In [None]:
xgb_Grid2.score(x2_test, y2_test)

### Ligth GBM

In [None]:
import lightgbm as lgb 

#### Y1

In [None]:
#Declaración de parámetros para el modelo


params = {'boosting_type': 'gbdt',
          'max_depth' : 10,
          'nthread': -1, 
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample_freq': 1,
          'min_split_gain': 0.5,
          'min_child_weight': 2,
          'min_child_samples': 6,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'}

#Declaración de parámetros para GridSearchv

gridParams = {
    'learning_rate': [0.1],
    'n_estimators': [91],
    'num_leaves': [19],
    'objective' : ['binary'],
    'random_state' : [30], 
    'colsample_bytree' : [0.65],
    'subsample' : [1],
    'reg_alpha' : [1],
    'reg_lambda' : [1.2],
    }

#Requiere de 64 características según la selección de características 

#Declaración de Modelo
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = -1, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])

mdl.get_params().keys()
#Ejecuciòn de GridSearchv
grid = GridSearchCV(mdl, gridParams,
                    verbose=0,
                    cv=4,
                    n_jobs=-1)
#Entrenamiento
grid.fit(x1_train, y1_train)

In [None]:
#Scores
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(x1_test, y1_test))

#### Y2

In [None]:
#Declaración de parámetros para el modelo
params2 = {'boosting_type': 'gbdt',
          'max_depth' : 6,
          'objective': 'binary',
          'nthread': -1,
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 2,
          'min_child_samples': 6,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'}
          
#Declaración de parámetros para GridSearchv

gridParams2 = {
    'learning_rate': [0.1],
    'n_estimators': [90],
    'num_leaves': [16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [30],
    'colsample_bytree' : [0.65],
    'subsample' : [0.9],
    'reg_alpha' : [1],
    'reg_lambda' : [1],
    'max_depth' : [8]
    }

#Requiere de 64 características según la selección de características 

#Declaración de Modelo
mdl2 = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = -1, 
          silent = True,
          max_depth = params2['max_depth'],
          max_bin = params2['max_bin'],
          subsample_for_bin = params2['subsample_for_bin'],
          subsample = params2['subsample'],
          subsample_freq = params2['subsample_freq'],
          min_split_gain = params2['min_split_gain'],
          min_child_weight = params2['min_child_weight'],
          min_child_samples = params2['min_child_samples'],
          scale_pos_weight = params2['scale_pos_weight'])

mdl2.get_params().keys()

#Ejecuciòn de GridSearchv
grid2 = GridSearchCV(mdl2, gridParams2,
                    verbose=0,
                    cv=4,
                    n_jobs=-1)

#Entrenamiento
grid2.fit(x2_train, y2_train)

#Scores
print(grid2.best_params_)
print(grid2.best_score_) 
print(grid2.score(x2_test, y2_test))
