### Import libs

In [None]:
!pip install -U keras_tuner

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
import pandas as pd
import numpy as np
%matplotlib inline
plt.style.use("ggplot")

In [None]:
import gensim
from sklearn.cluster import KMeans

### Util Functions

In [None]:
# Helper that draws the confusion matrix as an annotated heat map.
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=None, cmap=plt.cm.Blues):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels, forwarded to ``confusion_matrix``.
        y_pred: Predicted labels, forwarded to ``confusion_matrix``.
        classes: Tick labels used on both axes (one per matrix row/column).
        normalize: Forwarded to ``confusion_matrix``; ``None`` keeps raw
            counts. Any truthy value switches the cell annotations to
            two-decimal formatting.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        The matplotlib ``Axes`` that holds the plot.
    """
    title = 'confusion matrix'
    cm = confusion_matrix(y_true, y_pred, normalize=normalize)

    print("Se imprime la matriz de Confusión")

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='Etiqueta Verdadera',
           xlabel='Etiqueta predicha')

    # Rotate the x tick labels so long class names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Normalised matrices hold floats; raw matrices hold integer counts.
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum are drawn in the dark end of the default
    # colormap, so they get light text; the remaining cells get dark text.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
def plot_roc(y_test, y_score):
    """Plot the micro-averaged ROC curve and show it.

    Args:
        y_test: Array of true binary indicators; flattened with ``ravel()``.
        y_score: Array of scores with the same layout as ``y_test``;
            flattened with ``ravel()``.
    """
    # Micro-average: pool every entry of both arrays into one binary problem.
    micro_fpr, micro_tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
    micro_auc = auc(micro_fpr, micro_tpr)

    line_width = 2
    plt.figure()
    plt.plot(micro_fpr, micro_tpr, color='darkorange', lw=line_width,
             label='ROC curve (area = %0.2f)' % micro_auc)
    # Diagonal reference line (equal true- and false-positive rates).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

### Load data

In [None]:
# Feature matrix for the corpus, one row per document
# (presumably doc2vec embeddings; confirm against the notebook that wrote it).
X_corpus = np.load(
    "/content/drive/MyDrive/UNAL/Maestria/MD/Proyecto/X_corpus.npy"
)
# Disabled: the split metadata frame is not used in this notebook.
#jd_df = pd.read_csv("/content/drive/MyDrive/UNAL/Maestria/MD/Proyecto/split_data.csv", index_col=0)

In [None]:
# Reload the persisted Doc2Vec model.
fname = '/content/drive/MyDrive/UNAL/Maestria/MD/Proyecto/doc2vec_model'
model_d2v = gensim.models.doc2vec.Doc2Vec.load(fname)

# Assign every document to one of 14 K-Means clusters; the seed is fixed so
# that cluster ids stay stable between runs.
labels = KMeans(n_clusters=14, random_state=42).fit_predict(X_corpus)

In [None]:
import plotly.graph_objects as go

# Number of documents assigned to each K-Means cluster (computed once).
cluster_sizes = pd.Series(labels).value_counts()

fig = go.Figure(
    data=[go.Bar(x=cluster_sizes.index, y=cluster_sizes)],
    layout_title_text="Documentos por cluster",
)
fig.show()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Topic tags per cluster: entry i lists the categories given to cluster id i.
new_label_list = [
    ["minimo vital", "seguridad social", "debido proceso"],
    ["debido proceso", "providencias judiciales"],
    ["desplazamiento", "reparacion integral"],
    ["debido proceso", "acto administrativo"],
    ["libertad de expresion"],
    ["seguridad social", "servicios de salud"],
    ["seguridad social", "estabilidad laboral"],
    ["servicio publico"],
    ["comunidades indigenas"],
    ["establecimiento penitenciario"],
    ["minimo vital", "seguridad social", "pension"],
    ["minimo vital", "seguro"],
    ["educacion", "spe"],
    ["desplazamiento", "vivienda digna", "debido proceso"],
]

# Multi-hot encoding of the tag lists: one row per cluster, one column per tag.
enc = MultiLabelBinarizer()
encode_label_list = enc.fit_transform(new_label_list)

In [None]:
from collections import Counter

# Multi-hot target matrix: each document takes the encoded tag row of its
# cluster. Cast to float to keep the dtype of the original preallocated array.
target = encode_label_list[labels].astype(float)

# Count how many documents carry each tag.
counter = Counter()
for cluster_id in labels:
    counter.update(new_label_list[cluster_id])

In [None]:
# Bar chart with the number of documents per tag.
category_names = list(counter.keys())
category_counts = list(counter.values())

fig = go.Figure(
    data=[go.Bar(x=category_names, y=category_counts)],
    layout_title_text="Categorias de documentos",
)
fig.show()

### Classification

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the documents for testing, stratified on the target rows.
# The seed is fixed so the split (and every score computed from it) is
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X_corpus, target, test_size=0.2, stratify=target, random_state=42
)

### Cross-validation and hyperparameter optimization

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB

In [None]:
import keras_tuner as kt
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import tree
from sklearn.metrics.pairwise import cosine_similarity, chi2_kernel, laplacian_kernel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

def build_model(hp):
  """Build the scikit-learn pipeline described by the sampled hyperparameters.

  The search space is hierarchical:

  * ``ensamble_model`` chooses between a random forest, a boosting or
    bagging wrapper around a base model, or the bare base model ("NA").
  * For every choice except ``random_forest``, ``model_type`` selects the
    base estimator (ridge, Gaussian naive Bayes, decision tree, SVC or k-NN)
    together with its own hyperparameters.

  Args:
    hp: KerasTuner hyperparameter container used to declare and read values.

  Returns:
    A single-step ``Pipeline`` whose ``'model'`` step is a
    ``OneVsRestClassifier`` wrapping the selected estimator.
  """

  ensamble_model = hp.Choice('ensamble_model', ['random_forest', 'boosting', "bagging", "NA"])
  # Random forest: its hyperparameters are declared only under this choice,
  # and the function returns early without touching `model_type`.
  with hp.conditional_scope("ensamble_model", ["random_forest"]):
    if ensamble_model == 'random_forest':
      return Pipeline([('model',  OneVsRestClassifier(ensemble.RandomForestClassifier(
          random_state=42, n_estimators=hp.Int('n_estimators', 10, 50, step=10),
          max_depth=hp.Int('max_depth', 3, 10),
          criterion = hp.Choice('criterion', ['entropy', 'gini'])
      ))
      ), ]) 

  # Base estimator, used directly ("NA") or wrapped by boosting/bagging below.
  with hp.conditional_scope("ensamble_model", ['boosting', "bagging", "NA"]):
    model_type = hp.Choice('model_type', ['ridge', "gaussian", "dtree", "SVC", "knn"])
    if model_type == 'ridge':
      with hp.conditional_scope("model_type", ["ridge"]):
        model = linear_model.RidgeClassifier(
            alpha=hp.Float('alpha', 1e-3, 1, sampling='log'))
      
    elif model_type == 'dtree':
      with hp.conditional_scope("model_type", ["dtree"]):
        model = tree.DecisionTreeClassifier(random_state=42,
            max_depth=hp.Int('max_depth', 3, 10),
            criterion = hp.Choice('criterion', ['entropy', 'gini']))
    elif model_type == 'SVC':
      with hp.conditional_scope("model_type", ["SVC"]):
        kernel = hp.Choice('kernel', ['linear', 'rbf', 'cosine', 'chi2'])
        if kernel == 'chi2':
          # NOTE(review): the 'chi2' choice is wired to laplacian_kernel, not
          # to the imported chi2_kernel; confirm this substitution is intended.
          model = SVC(kernel=laplacian_kernel, random_state=42)
        elif kernel == 'cosine':
          # Callable kernel: SVC receives the pairwise similarity function.
          model = SVC(kernel=cosine_similarity, random_state=42)
        else:
          # 'linear' and 'rbf' are passed to SVC by name.
          model = SVC(kernel=kernel, random_state=42)
    elif model_type == 'knn':
      with hp.conditional_scope("model_type", ["knn"]):
        model = KNeighborsClassifier(n_neighbors=hp.Int('n_neighbors', 5, 30, step=5))
    else:
      # Remaining choice: model_type == "gaussian".
      model = GaussianNB()
    
  # Optional ensemble wrapper around the base estimator chosen above.
  with hp.conditional_scope("ensamble_model", ['boosting', "bagging"]):
    n_estimators = hp.Int('n_estimators', 10, 50, step=10)
    if ensamble_model == 'boosting':
        # NOTE(review): presumably AdaBoost needs a base estimator whose fit
        # accepts sample_weight; verify that every model_type (e.g. knn) works here.
        return  Pipeline([('model',  OneVsRestClassifier(ensemble.AdaBoostClassifier(model,
                      n_estimators=n_estimators, random_state=42, algorithm='SAMME'))), ])  

    elif ensamble_model == 'bagging':
      return  Pipeline([('model',  OneVsRestClassifier(ensemble.BaggingClassifier(model,
                      n_estimators=n_estimators, random_state=42))), ])  
      
  # ensamble_model == "NA": use the base estimator without an ensemble wrapper.
  return  Pipeline([('model',  OneVsRestClassifier(model)), ])   

# Bayesian optimisation over the build_model search space, maximising the
# cross-validated score for at most 120 trials.
search_oracle = kt.oracles.BayesianOptimization(
    objective=kt.Objective('score', 'max'),
    max_trials=120,
)
# Each candidate is scored with micro-averaged F1.
micro_f1_scorer = metrics.make_scorer(metrics.f1_score, average="micro")

tuner = kt.tuners.SklearnTuner(
    oracle=search_oracle,
    hypermodel=build_model,
    scoring=micro_f1_scorer,
    cv=model_selection.KFold(n_splits=5),
    directory='/content/drive/MyDrive/UNAL/Maestria/MD/Proyecto/tunner/',
    project_name='classification',
)

INFO:tensorflow:Reloading Oracle from existing project /content/drive/MyDrive/UNAL/Maestria/MD/Proyecto/tunner/classification/oracle.json
INFO:tensorflow:Reloading Tuner from /content/drive/MyDrive/UNAL/Maestria/MD/Proyecto/tunner/classification/tuner0.json


In [None]:
# Run the hyperparameter search on the training split (no sample weighting).
tuner.search(X_train, y_train, sample_weight=None)

INFO:tensorflow:Oracle triggered exit


In [None]:
# Take the top-ranked pipeline from the search and predict the held-out split.
top_models = tuner.get_best_models(num_models=1)
best_model = top_models[0]
y_predict = best_model.predict(X_test)

## Results

In [None]:
# Print the ten best trials with their hyperparameters and scores.
tuner.results_summary(num_trials=10)

Results summary
Results in /content/drive/MyDrive/UNAL/Maestria/MD/Proyecto/tunner/classification
Showing 10 best trials
Objective(name='score', direction='max')
Trial summary
Hyperparameters:
ensamble_model: NA
model_type: SVC
kernel: rbf
Score: 0.9255011149634248
Trial summary
Hyperparameters:
ensamble_model: boosting
model_type: gaussian
n_estimators: 30
Score: 0.8980057865221955
Trial summary
Hyperparameters:
ensamble_model: boosting
model_type: knn
n_estimators: 30
Score: 0.8980057865221955
Trial summary
Hyperparameters:
ensamble_model: boosting
model_type: gaussian
n_estimators: 40
Score: 0.8956418432995218
Trial summary
Hyperparameters:
ensamble_model: boosting
model_type: knn
n_estimators: 40
Score: 0.8956418432995218
Trial summary
Hyperparameters:
ensamble_model: boosting
model_type: knn
n_estimators: 20
Score: 0.8951271203682143
Trial summary
Hyperparameters:
ensamble_model: NA
model_type: SVC
kernel: cosine
Score: 0.8946678484535155
Trial summary
Hyperparameters:
ensamble_mo

In [None]:
best_model = tuner.get_best_models(num_models=1)[0]
# `y_predict` is passed to classification_report, which needs hard label
# predictions, so use predict() rather than predict_proba().
y_predict = best_model.predict(X_test)

In [None]:
# Continuous per-label scores (not hard predictions). Not every estimator in
# the search space exposes predict_proba, so fall back to decision_function.
if hasattr(best_model, "predict_proba"):
    y_probas = best_model.predict_proba(X_test)
else:
    y_probas = best_model.decision_function(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-tag precision, recall and F1 on the held-out split.
report_text = classification_report(
    y_test,
    y_predict,
    target_names=enc.classes_,
)
print(report_text)

                               precision    recall  f1-score   support

          acto administrativo       1.00      0.97      0.98        31
        comunidades indigenas       1.00      0.96      0.98        25
               debido proceso       0.95      0.91      0.93       247
               desplazamiento       1.00      0.93      0.96        68
                    educacion       1.00      0.86      0.93        44
          estabilidad laboral       1.00      0.92      0.96        51
establecimiento penitenciario       1.00      1.00      1.00        20
        libertad de expresion       1.00      0.80      0.89        15
                 minimo vital       0.96      0.94      0.95       303
                      pension       0.96      0.99      0.97       176
      providencias judiciales       0.98      0.95      0.97        66
          reparacion integral       1.00      0.91      0.95        32
             seguridad social       0.97      0.99      0.98       432
     

  _warn_prf(average, modifier, msg_start, len(result))


## References

https://colab.research.google.com/drive/1j_4UQTT0Lib8ueAU5zXECxesCj_ofjw7#scrollTo=UIhmFA6nSbPo
