In [None]:
!pip install transformers==2.5.1
!pip install simpletransformers==0.20.3
!pip install seqeval==0.0.12
!pip install torch==1.4+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html --no-cache-dir
!rm -rf .git
!rm -rf apex
!git clone https://github.com/NVIDIA/apex
!git checkout 494f8ab3fc1b0b26949a3bcbb2bcac78008d48c1
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Mount Google Drive at /content/drive; the paths used by the following cells
# (working directory and data folder) live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# IPython `cd` magic (not Python syntax): switch the working directory to the
# notebook folder on the mounted Drive. The relative 'results/...' paths written
# by the experiment cell are resolved against this directory.
# NOTE(review): the project modules imported in the next cell (CsvReader,
# HelperFunctions, ...) are presumably found via this directory -- confirm.
cd /content/drive/My\ Drive/Colab\ Notebooks

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import torch
import copy
import sklearn
import csv
import tensorflow as tf
import datetime, os
from sklearn.metrics import label_ranking_average_precision_score, label_ranking_loss, multilabel_confusion_matrix,classification_report, coverage_error, hamming_loss

from CsvReader import ReadCsv
from HelperFunctions import ShuffleData, replace_all, ReplaceCategoriesWithIndex,ReplaceCategoriesWithIndexOneHot, OneHotEncodingForCategories, TransformDataIntoDataframe, getMetricsMulti, CalculateWeights
from TrainEvalModel import TrainModelForMultiLabel, EvalFromMultiLabelModel
from NonMLMethods import LogisticRegressionMulti

# Experiment driver: repeatedly trains and evaluates a RoBERTa model (on the
# shuffled "combined" multi-label data) and an XLNet model (on the
# "not-combined" single-label data), appending ranking metrics to CSV files and
# reporting confusion-matrix metrics for decision thresholds 0.1 .. 0.9.

folderPath = '/content/drive/My Drive/Colab Notebooks Pascal/bert-etiki/etiki-data'
data, categories, tendencies = ReadCsv(folderPath, 'etikidata.csv','companies.csv', 'categories.csv','references.csv','tendencies.csv', 'topics.csv')

# Result sub-directories, relative to 'results/' in the current working directory.
_COMBINED_SUBDIR = 'categories/no-cached-weighted-combined/'
_NOT_COMBINED_SUBDIR = 'categories/no-cached-weighted-not-combined/'

# Create the metric directories up front so a missing folder cannot abort the
# script with FileNotFoundError only after a full training run has finished.
for _subdir in (_COMBINED_SUBDIR, _NOT_COMBINED_SUBDIR):
    os.makedirs('results/' + _subdir, exist_ok=True)


def _append_metrics_row(results_subdir, algo_tag, row):
    """Append ``row`` to ``results/<results_subdir><algo_tag>-metrics.csv``."""
    with open('results/' + results_subdir + algo_tag + '-metrics.csv', 'a', newline='') as f:
        csv.writer(f).writerow(row)


def _sweep_thresholds(true_labels, model_outputs, algo_tag, results_subdir):
    """Binarize ``model_outputs`` at thresholds 0.1 .. 0.9 and report each one.

    For every threshold, an output strictly greater than the threshold becomes
    1 and anything else becomes 0. The per-label confusion matrices are then
    handed to ``getMetricsMulti`` together with ``algo_tag`` and a
    ``'<results_subdir><threshold> threshold'`` label.
    """
    # assumes model_outputs is a rectangular (samples x labels) score array -- TODO confirm
    scores = np.asarray(model_outputs)
    for step in range(1, 10):
        threshold = 0.1 * step
        predictions = (scores > threshold).astype(int)
        cm = multilabel_confusion_matrix(true_labels, predictions)
        getMetricsMulti(cm, algo_tag, results_subdir + '{0:3.1f}'.format(threshold) + ' threshold')


# k and i are not read inside the body; the nested loops only repeat the whole
# experiment 2 x 5 = 10 times, each time with a freshly shuffled multi-label split.
for k in range(1,3):
    for i in range(2,7):
        # ---- multi-label ("combined") data: shuffled 70/30 train/test split ----
        rawData = data[:,[13,4,13]]

        multiLabelData = OneHotEncodingForCategories(ReplaceCategoriesWithIndex(categories,rawData,True))
        np.random.shuffle(multiLabelData)

        split_at = int(len(multiLabelData)*0.7)
        trainData = multiLabelData[:split_at]
        testData = multiLabelData[split_at:]

        train_multi_df = TransformDataIntoDataframe(trainData)
        eval_multi_df = TransformDataIntoDataframe(testData)

        # ---- single-label ("not-combined") data ----
        # A fresh fancy-index copy is taken on purpose: the statements below are
        # kept in their original order and with their original copy/view
        # arguments because the project helpers may modify their inputs.
        # NOTE(review): unlike the multi-label data, this 70/30 split is taken
        # in file order without shuffling -- confirm that is intended.
        rawSingleLabelData = data[:,[13,4,13]]

        singleLabelTrainData = np.array(OneHotEncodingForCategories(ReplaceCategoriesWithIndexOneHot(categories,np.array(rawSingleLabelData[:int(len(rawSingleLabelData)*0.7)]))))
        singleLabelTestData = OneHotEncodingForCategories(ReplaceCategoriesWithIndex(categories,rawSingleLabelData[int(len(rawSingleLabelData)*0.7):],True))

        train_single_df = TransformDataIntoDataframe(singleLabelTrainData)
        eval_single_df = TransformDataIntoDataframe(singleLabelTestData)

        eval_multi_labels = np.array(eval_multi_df["label"].tolist())
        eval_single_labels = np.array(eval_single_df["label"].tolist())

        # ---- class weights from the label distribution of the full data set ----
        distribution = np.array(OneHotEncodingForCategories(ReplaceCategoriesWithIndexOneHot(categories,np.array(rawSingleLabelData))))
        distribution_df = TransformDataIntoDataframe(distribution)

        weightCalc = [np.array(x) for x in distribution_df["label"]]

        # Index of the largest entry of each label vector is used as its class id.
        y_integers = np.argmax(weightCalc, axis=1)

        # Keyword arguments: newer scikit-learn releases no longer accept these
        # parameters positionally, while older releases accept the keywords too.
        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=np.unique(y_integers),
            y=y_integers,
        ).tolist()

        #------------------------------RoBERTa------------------------------
        algo = 'roberta'

        args = {'reprocess_input_data': True,
                'overwrite_output_dir': True,
                'num_train_epochs': 6,
                'silent':True,
                'use_cached_eval_features': False,
                }
        model = TrainModelForMultiLabel(algo, 'roberta-base',train_multi_df,5,args,class_weights)
        # Evaluate the model
        result, model_outputs, wrong_predictions = EvalFromMultiLabelModel(model, eval_multi_df)

        lrap = label_ranking_average_precision_score(eval_multi_labels, model_outputs)
        lrl = label_ranking_loss(eval_multi_labels, model_outputs)
        ce = coverage_error(eval_multi_labels, model_outputs)

        _append_metrics_row(_COMBINED_SUBDIR, algo+'1', [lrap,lrl,ce])
        _sweep_thresholds(eval_multi_labels, model_outputs, algo+'1', _COMBINED_SUBDIR)

        #----------------------------XLNet------------------------------
        algo = 'xlnet'
        args = {'reprocess_input_data': True,
                'overwrite_output_dir': True,
                'num_train_epochs': 4,
                'silent':True,
                'use_cached_eval_features': False,
                }
        model = TrainModelForMultiLabel(algo, 'xlnet-base-cased',train_single_df,5,args,class_weights)
        # Evaluate the model
        result, model_outputs, wrong_predictions = EvalFromMultiLabelModel(model, eval_single_df)

        lrap = label_ranking_average_precision_score(eval_single_labels, model_outputs)
        lrl = label_ranking_loss(eval_single_labels, model_outputs)
        ce = coverage_error(eval_single_labels, model_outputs)

        _append_metrics_row(_NOT_COMBINED_SUBDIR, algo+'1', [lrap,lrl,ce])
        _sweep_thresholds(eval_single_labels, model_outputs, algo+'1', _NOT_COMBINED_SUBDIR)