In [0]:
# Mount Google Drive so later cells can reach files stored under it.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [0]:
!pip install transformers==2.5.1
!pip install simpletransformers==0.20.3
!pip install seqeval==0.0.12
!pip install torch==1.4+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html --no-cache-dir
!rm -rf .git
!rm -rf apex
!git clone https://github.com/NVIDIA/apex
!git checkout 494f8ab3fc1b0b26949a3bcbb2bcac78008d48c1
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [0]:
#This is the path where the files named CsvReader, HelperFunctions etc. are saved
cd /content/drive/My\ Drive/Colab\ Notebooks

In [0]:
import numpy as np
import pandas as pd
import torch
import copy
import sklearn
import csv
from sklearn.metrics import label_ranking_average_precision_score, label_ranking_loss, multilabel_confusion_matrix,classification_report, coverage_error, hamming_loss

from CsvReader import ReadCsv
from HelperFunctions import ShuffleData, replace_all, ReplaceCategoriesWithIndex,ReplaceCategoriesWithIndexOneHot, OneHotEncodingForCategories, TransformDataIntoDataframe, getMetricsMulti, CalculateWeights
from TrainEvalModel import TrainModelForMultiLabel, EvalFromMultiLabelModel
from NonMLMethods import LogisticRegressionMulti

# Folder containing the CSV files that ReadCsv loads.
folderPath = '/content/drive/My Drive/Colab Notebooks Pascal/bert-etiki/etiki-data'
data, categories, tendencies = ReadCsv(folderPath, 'etikidata.csv','companies.csv', 'categories.csv','references.csv','tendencies.csv', 'topics.csv')

# Fraction of the shuffled rows used for training; the rest is the test split.
TRAIN_FRACTION = 0.7
# Label-count argument passed to TrainModelForMultiLabel for every transformer.
NUM_LABELS = 5
# (algo name, pretrained checkpoint) pairs, trained and evaluated in this order.
TRANSFORMER_MODELS = [
  ('xlnet', 'xlnet-base-cased'),
  ('bert', 'bert-base-cased'),
  ('roberta', 'roberta-base'),
  ('distilbert', 'distilbert-base-cased'),
]


def ReportMultiLabelMetrics(algo, epochs, true_labels, model_outputs):
  """Write ranking metrics and per-threshold confusion metrics for one model.

  Appends one row (LRAP, label ranking loss, coverage error) to
  'results/categories/<epochs> epoch/<algo>-metrics.csv', then binarises
  ``model_outputs`` at thresholds 0.1 .. 0.9 and hands each multilabel
  confusion matrix to ``getMetricsMulti``.

  Args:
    algo: Name used in the output file / folder names.
    epochs: Epoch count used in the output folder name.
    true_labels: Ground-truth indicator rows matching ``model_outputs``
      row for row.
    model_outputs: Per-label scores, one row per sample.
  """
  true_labels = np.array(true_labels)

  lrap = label_ranking_average_precision_score(true_labels, model_outputs)
  lrl = label_ranking_loss(true_labels, model_outputs)
  ce = coverage_error(true_labels, model_outputs)

  with open('results/categories/'+str(epochs)+' epoch/'+algo+'-metrics.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([lrap,lrl,ce])

  for j in range(1,10):
    threshold = 0.1*j
    # A label counts as predicted when its score is strictly above the threshold.
    out = np.array([[1 if el > threshold else 0 for el in output] for output in model_outputs])
    # Score against the labels that belong to these outputs. The original code
    # referenced an undefined name (eval_multi_labels) here and raised NameError.
    cm = multilabel_confusion_matrix(true_labels, out)
    getMetricsMulti(cm, algo, 'categories/'+str(epochs)+' epoch/'+'{0:3.1f}'.format(threshold)+' threshold')


# k only repeats the whole experiment (two passes); every pass appends to the
# same result files.
for k in range(1,3):
  # i is the number of training epochs for the transformer models.
  for i in range(2,7):
    simpleTransformersData = data[:,[13,4,13]]
    baselineData = ReplaceCategoriesWithIndex(categories, data[:,[13,4,13]],False)

    # The two datasets are shuffled by separate calls, so (presumably) their
    # train/test splits do not contain the same rows -- confirm in ShuffleData.
    baselineData = ShuffleData(baselineData)
    simpleTransformersData = ShuffleData(simpleTransformersData)

    transformerSplit = int(len(simpleTransformersData)*TRAIN_FRACTION)
    singleLabelTrainData = np.array(OneHotEncodingForCategories(ReplaceCategoriesWithIndexOneHot(categories,np.array(simpleTransformersData[:transformerSplit]))))
    multiLabelTestData = OneHotEncodingForCategories(ReplaceCategoriesWithIndex(categories,simpleTransformersData[transformerSplit:],True))

    train_single_df = TransformDataIntoDataframe(singleLabelTrainData)
    eval_multi_df = TransformDataIntoDataframe(multiLabelTestData)

    eval_labels = np.array(eval_multi_df["label"].tolist())

    #------------------------Transformer models------------------------
    for algo, checkpoint in TRANSFORMER_MODELS:
      # Build a fresh dict per model so no state is shared between runs.
      args = {'reprocess_input_data': True,
              'overwrite_output_dir': True,
              'num_train_epochs': i,
              'silent':True,
              'use_cached_eval_features': True,
              }
      model = TrainModelForMultiLabel(algo, checkpoint,train_single_df,NUM_LABELS,args)
      # Evaluate the model
      result, model_outputs, wrong_predictions = EvalFromMultiLabelModel(model, eval_multi_df)

      ReportMultiLabelMetrics(algo, i, eval_labels, model_outputs)

    #-----------------------Logistic Regression-----------------------
    algo = 'LogisticRegression'

    baselineSplit = int(len(baselineData)*TRAIN_FRACTION)
    baselineTrainData = ReplaceCategoriesWithIndexOneHot(categories,baselineData[:baselineSplit])
    baselineTestData = ReplaceCategoriesWithIndex(categories,baselineData[baselineSplit:],True)

    model_outputs, label_test = LogisticRegressionMulti(baselineTrainData, baselineTestData)

    # The baseline has its own test split, so it is scored against the labels
    # returned with its predictions, not the transformer eval labels.
    # The epoch count i does not enter the baseline call; it only selects the
    # output folder.
    ReportMultiLabelMetrics(algo, i, label_test, model_outputs)