In [0]:
!pip install tiny-tokenizer

In [0]:
!pip install flair

In [0]:
!pip install allennlp



In [0]:
import numpy as np
import pandas as pd
from flair.embeddings import Sentence
from flair.embeddings import FlairEmbeddings, BertEmbeddings, ELMoEmbeddings
from flair.embeddings import DocumentPoolEmbeddings
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from google.colab import files
from datetime import datetime

In [0]:
bert_embedding = BertEmbeddings()
bert_train_document_embeddings = DocumentPoolEmbeddings([bert_embedding])
bert_test_document_embeddings = DocumentPoolEmbeddings([bert_embedding])

In [0]:
#requires allennlp
elmo_embedding = ELMoEmbeddings()
elmo_train_document_embeddings = DocumentPoolEmbeddings([elmo_embedding])
elmo_test_document_embeddings = DocumentPoolEmbeddings([elmo_embedding])

In [0]:
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_train_document_embeddings = DocumentPoolEmbeddings([flair_embedding_forward])
flair_test_document_embeddings = DocumentPoolEmbeddings([flair_embedding_forward])

In [0]:
uploaded = files.upload()

In [0]:
data = pd.read_csv("finalDataset.csv")

In [0]:
# BERT

for count in range (30):
  start_time = datetime.now()
  train_sentences = []
  train_labels = []
  test_sentences = []
  test_labels = []
  for row in data.itertuples():
    count_id = int(row.no)
    sentence = Sentence(row.text)
    label = row.label
    if count_id == count:
      test_sentences.append(sentence)
      test_labels.append(label)
    else:
      train_sentences.append(sentence)
      train_labels.append(label)

  #Training Embeddings:
  train_embeddings = []
  # since processing requires some memory we provide sentences into the document embedder in batches (small parts)
  for n in range(0, 5000, 250):  
    sents = train_sentences[n:n+250]
    bert_train_document_embeddings.embed(sents)
    train_embeddings += [np.array(sentence.get_embedding().detach()) for sentence in sents]

  #Test Embeddings:
  test_embeddings = []
  for n in range(0, 5000, 250):  
    sents = test_sentences[n:n+250]
    bert_test_document_embeddings.embed(sents)
    test_embeddings += [np.array(sentence.get_embedding().detach()) for sentence in sents]

  clf_b = DecisionTreeClassifier(max_depth=5, random_state=0)
  clf_b.fit(train_embeddings[:5000], train_labels[:5000])

  predicted_labels = clf_b.predict(test_embeddings[0:5000])
  end_time = datetime.now()
  
  print('Bert - round', count+1, ': Accuracy =',
        accuracy_score(predicted_labels[0:len(test_labels)], test_labels[0:len(test_labels)]),
        '\t F1 =', f1_score(predicted_labels[0:len(test_labels)], test_labels[0:len(test_labels)],
                           average='weighted',labels=np.unique(predicted_labels)),
         '\t Time =', end_time-start_time)

print('**********************************************************************')

# ELMo

for count in range (30):
  start_time = datetime.now()
  train_sentences = []
  train_labels = []
  test_sentences = []
  test_labels = []
  for row in data.itertuples():
    count_id = int(row.no)
    sentence = Sentence(row.text)
    label = row.label
    if count_id == count:
      test_sentences.append(sentence)
      test_labels.append(label)
    else:
      train_sentences.append(sentence)
      train_labels.append(label)

  #Training Embeddings:
  train_embeddings = []
  # since processing requires some memory we provide sentences into the document embedder in batches (small parts)
  for n in range(0, 5000, 250):  
    sents = train_sentences[n:n+250]
    elmo_train_document_embeddings.embed(sents)
    train_embeddings += [np.array(sentence.get_embedding().detach()) for sentence in sents]

  #Test Embeddings:
  test_embeddings = []
  for n in range(0, 5000, 250):  
    sents = test_sentences[n:n+250]
    elmo_test_document_embeddings.embed(sents)
    test_embeddings += [np.array(sentence.get_embedding().detach()) for sentence in sents]

  clf_f = DecisionTreeClassifier(max_depth=5, random_state=0)
  clf_f.fit(train_embeddings[:5000], train_labels[:5000])

  predicted_labels = clf_f.predict(test_embeddings[0:5000])
  end_time = datetime.now()
  print('ELMo - round', count+1, ': Accuracy =',
        accuracy_score(predicted_labels[0:len(test_labels)], test_labels[0:len(test_labels)]),
        '\t F1 =', f1_score(predicted_labels[0:len(test_labels)], test_labels[0:len(test_labels)],
                           average='weighted',labels=np.unique(predicted_labels)),
         '\t Time =', end_time-start_time)
  print('**********************************************************************')

  # Flair

for count in range (30):
  start_time = datetime.now()
  train_sentences = []
  train_labels = []
  test_sentences = []
  test_labels = []
  for row in data.itertuples():
    count_id = int(row.no)
    sentence = Sentence(row.text)
    label = row.label
    if count_id == count:
      test_sentences.append(sentence)
      test_labels.append(label)
    else:
      train_sentences.append(sentence)
      train_labels.append(label)

  #Training Embeddings:
  train_embeddings = []
  # since processing requires some memory we provide sentences into the document embedder in batches (small parts)
  for n in range(0, 5000, 250):  
    sents = train_sentences[n:n+250]
    flair_train_document_embeddings.embed(sents)
    train_embeddings += [np.array(sentence.get_embedding().detach()) for sentence in sents]

  #Test Embeddings:
  test_embeddings = []
  for n in range(0, 5000, 250):  
    sents = test_sentences[n:n+250]
    flair_test_document_embeddings.embed(sents)
    test_embeddings += [np.array(sentence.get_embedding().detach()) for sentence in sents]

  clf_f = DecisionTreeClassifier(max_depth=5, random_state=0)
  clf_f.fit(train_embeddings[:5000], train_labels[:5000])

  predicted_labels = clf_f.predict(test_embeddings[0:5000])
  end_time = datetime.now()
  print('Flair - round', count+1, ': Accuracy =',
        accuracy_score(predicted_labels[0:len(test_labels)], test_labels[0:len(test_labels)]),
        '\t F1 =', f1_score(predicted_labels[0:len(test_labels)], test_labels[0:len(test_labels)],
                           average='weighted',labels=np.unique(predicted_labels)),
         '\t Time =', end_time-start_time)
  
  print('**********************************************************************')


Bert - round 1 : Accuracy = 0.19161676646706588 	 F1 = 0.24987725698029228 	 Time = 0:11:09.161743
Bert - round 2 : Accuracy = 0.17964071856287425 	 F1 = 0.2422916732495663 	 Time = 0:10:58.541371
Bert - round 3 : Accuracy = 0.17365269461077845 	 F1 = 0.2272430346919666 	 Time = 0:10:48.293570
Bert - round 4 : Accuracy = 0.15568862275449102 	 F1 = 0.20711688807646048 	 Time = 0:10:56.013347
Bert - round 5 : Accuracy = 0.16766467065868262 	 F1 = 0.21699559067843097 	 Time = 0:10:49.275651
Bert - round 6 : Accuracy = 0.19760479041916168 	 F1 = 0.24300271506270563 	 Time = 0:10:43.144606
Bert - round 7 : Accuracy = 0.2275449101796407 	 F1 = 0.28556666428477734 	 Time = 0:10:45.909573
Bert - round 8 : Accuracy = 0.20359281437125748 	 F1 = 0.2500518025532513 	 Time = 0:10:41.883465
Bert - round 9 : Accuracy = 0.17365269461077845 	 F1 = 0.23147999900377503 	 Time = 0:10:43.765307
Bert - round 10 : Accuracy = 0.23952095808383234 	 F1 = 0.33411393241391785 	 Time = 0:10:42.660516
Bert - round 