In [1]:
# Train number of different models from Flair framework.
# With different sized trainin data
# save predictions of each model to file

# Notice - 1st run may take long as model weights are downloaded

In [2]:
# Dataset
# https://github.com/t-davidson/hate-speech-and-offensive-language

# Paper
# https://aaai.org/ocs/index.php/ICWSM/ICWSM17/paper/view/15665

# Their code
# https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/src/Automated%20Hate%20Speech%20Detection%20and%20the%20Problem%20of%20Offensive%20Language%20Python%203.6.ipynb

In [3]:
# Code based on https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md

In [5]:
import pandas as pd
import random
import os
import numpy as np
import torch

import nltk
import string
import re
from nltk.stem.porter import *
import time

# Display whole text of dataframe field and don't cut it
pd.set_option('display.max_colwidth', -1)

In [6]:
print(f'torch version: {torch.__version__}')

torch version: 1.3.1


In [7]:
dataset = 'hatespeech'

current = os.getcwd()
basefolder = current + '/dataset_'+ dataset+'/'
datafolder = basefolder + 'data/'  # for example /dataset_businessnews/data/
print(basefolder)

infolder =  basefolder + 'input/'
outfolder = basefolder + 'output/'

/home/max/git/modelcompare/dataset_hatespeech/


In [8]:
from flair.data import Sentence
from flair.data_fetcher import NLPTaskDataFetcher

from flair.embeddings import WordEmbeddings, StackedEmbeddings, DocumentRNNEmbeddings
from flair.embeddings import DocumentPoolEmbeddings
from flair.embeddings import FlairEmbeddings, BertEmbeddings, ELMoEmbeddings
from flair.embeddings import BytePairEmbeddings

from flair.embeddings import OpenAIGPTEmbeddings
#from flair.embeddings import OpenAIGPT2Embeddings

# New DocumentRNNEmbeddings, deprecates DocumentLSTMembeddings
# from flair.embeddings import #DocumentLSTMEmbeddings

from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

# new ones: GPT-1 and GPT-2
# https://github.com/flairNLP/flair/tree/master/resources/docs/embeddings
# https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
SEED = 1
# REPEATABILITY
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything() # also called here

# TEXT PREPROCESS
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        if punct in x: # comparison makes faster
            x = x.replace(punct, f' {punct} ')
    return x

quotes = ['″', '′', '"'] # apostrophe "'"
def mark_quotes(x):
    x = str(x)
    for quote in quotes:
        if quote in x: # comparison makes faster
            x = x.replace(quote, f'quote')
    return x

def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    
    mention_regex = '@[\w\-]+'
    
    #add #, mention, e.g. &#8120     
    mention_regex2  =   '&#[0-9]*' 
    
    
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, ' URL ', parsed_text)
    parsed_text = re.sub(mention_regex, ' MENTION', parsed_text)
    parsed_text = re.sub(mention_regex2, ' MENTION', parsed_text)    
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    # *needed to be removed or outputs a list of letters
    #tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    tweet = " ".join(re.split("[^a-zA-Z.,!?]", tweet.lower())).strip()  
    #tweet = " ".join(re.split(r'\s+', tweet.lower())).strip()
    return tweet.split()

In [10]:
def loadData():
    train = pd.read_csv(basefolder+'input/train.csv',sep='\t', header = None)
    dev = pd.read_csv(basefolder+'input/dev.csv'  ,sep='\t', header = None)
    test = pd.read_csv(basefolder+'input/test.csv'  ,sep='\t', header = None)    
    train.columns = ['id','label','text']
    dev.columns  = ['id','label', 'text']
    test.columns  = ['id','label', 'text']   
    return train, dev, test

In [11]:
def preprosess_hatespeech(df):
    df.text = df.text.apply(lambda x: preprocess(x)) #URL, @mention etc
    df["text"] = df["text"].apply(lambda x: clean_text(x))
    df["text"] = df["text"].apply(lambda x: mark_quotes(x))
    #df.text = df.text.apply(lambda x: basic_tokenize(x))
    return df

In [12]:
# Turn label from digit into Fasttext format __label__.  "1" into "__label__1"
def toFasttext(df):
    df['label'] = '__label__' + df['label'].astype(str)
    return df

In [13]:
data_folder = infolder

In [14]:
# randomize

# Load data
train, dev, test = loadData()

train = train.iloc[np.random.permutation(len(train))]


#Test smaller
trainsize = len(train)

'''SET TRAINSIZE HERE'''
# 100, 200, 500, 1k, 3k, 7k, 18k
trainsize = 100
train = train[0:trainsize]

print(len(train))
print(len(dev))
print(len(test))

100
3000
3000


### Preprocess

In [15]:
train = preprosess_hatespeech(train)
dev = preprosess_hatespeech(dev)
test = preprosess_hatespeech(test)

In [16]:
train = toFasttext(train)
dev = toFasttext(dev)
test = toFasttext(test)

In [17]:
train.head()

Unnamed: 0,id,label,text
15131,18202,__label__1,RT MENTION : MENTION That has redneck written all over it lol . Drunks can use gravity to get down MENTION ;
4106,3986,__label__2,"MENTION her face ugly to me , & amp ; her nudes were trash ."
3365,4483,__label__1,MENTION come draw and paint with me niggah !
6004,12574,__label__1,Krakin this bitch ipem at 12 . . . WHO SIPPIN WIT ME . . . ELEMENTS URL
11261,10768,__label__1,I hate when bitches quote been thinking quote smh


In [18]:
# Id in start of line is not Fasttext format: remove id
train.drop(['id'], axis=1, inplace=True)
dev.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

# Write to Flairs input csv files
train.to_csv(basefolder+'input/flair_train.csv',sep='\t', index = False, header = False)
dev.to_csv(basefolder+'input/flair_dev.csv'  ,sep='\t', index = False, header = False)
test.to_csv(basefolder+'input/flair_test.csv',sep='\t', index = False, header = False)

### Flair

### Stacked embeddings


In [19]:
# Embeddings
# https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md

# 'multi-forward', multi-lang English, German, French, Italian, Dutch, Polish, 
#        Mix of corpora (Web, Wikipedia, Subtitles, News)

# 'mix-forward'English,   Forward LM embeddings over mixed corpus (Web, Wikipedia, Subtitles)

`StackedEmbeddings` are currently a `WordEmbeddings` class, so they cannot directly be used to classify 
documents. They can only be used for sequence labeling.

However, you can put a stack of word embeddings into one of the `DocumentEmbeddings` classes such as `DocumentPoolEmbeddings` or `DocumentLSTMEmbeddings`. This way, you are specifying how to aggregate word embeddings for text classification

So `DocumentPoolEmbeddings` will simply average them, while `DocumentLSTMEmbeddings` will train an LSTM over them.

 https://github.com/zalandoresearch/flair/issues/414
 
 *update depracated: DocumentLSTMEmbeddings. (The functionality of this class is moved to 'DocumentRNNEmbeddings')

In [20]:
# OR average 
# document_embeddings = DocumentPoolEmbeddings(word_embeddings)

#### Model

In [21]:
# https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f

# https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

In [22]:
#data_folder = infolder
data_folder

'/home/max/git/modelcompare/dataset_hatespeech/input/'

In [23]:
corpus = NLPTaskDataFetcher.load_classification_corpus(data_folder, test_file='flair_test.csv', dev_file='flair_dev.csv', train_file='flair_train.csv')

2020-06-23 11:58:35,988 Reading data from /home/max/git/modelcompare/dataset_hatespeech/input
2020-06-23 11:58:35,989 Train: /home/max/git/modelcompare/dataset_hatespeech/input/flair_train.csv
2020-06-23 11:58:35,991 Dev: /home/max/git/modelcompare/dataset_hatespeech/input/flair_dev.csv
2020-06-23 11:58:35,992 Test: /home/max/git/modelcompare/dataset_hatespeech/input/flair_test.csv


  """Entry point for launching an IPython kernel.
  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


In [24]:
print(corpus)
#print(len(corpus.train))

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()
#print(label_dict)

Corpus: 100 train + 3000 dev + 3000 test sentences
2020-06-23 11:58:51,988 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 79724.46it/s]

2020-06-23 11:58:52,005 [b'1', b'2', b'0']





In [25]:
### Individual model only, skip when in batch

In [26]:
# classifier = TextClassifier(stacked_embeddings, label_dictionary=corpus.make_label_dictionary(),
#                            multi_label=False)

In [27]:
# Glove alone 0.902

In [28]:
# Winners: Fasttext: en-crawl, 0.9129 solo

# word embeddings pooled:glove+fasttext 'test_score': 0.8892,

# glove, fasttext:en-crawl, flair-multi-forward, elmo('original') f1-score 0.9155
# ELMO(original) test_score': 0.9002 (f-score 0.9145 )
# flair-news-forward WITHOUT TOKENIZATION, 'test_score': 0.8526
# flair-news-forward: 'test_score': 0.8741
# BERT only + RNN 256, bidir=True, 'test_score': 0.904,
# Tweet + Bert same, not very great, used 128 rnn?
#.
# 0.8911 flair + BPE


##############################

#NEW BEST adding preprocess puncts+quotes, glove, en-twitter, en-crawl:   test_score': 0.9242,
# no stemming


# BEST glove, en-twitter, en-crawl, flair-multi-forward:  f1-score 0.9129
# add stemming f-score 0.7808   Breaking down, very bad score



# ---------------------

# Added Preprosessing, no stemming
# 'test_score': 0.9072 glove, en-twitter , flair-multi-forward

# Add to this, en-crawl Fastext  # f1-score 0.9129 BEST

# Adding Kominov embedding: 'test_score': 0.8868 -> weaker considerably, by 0.03


# ----------------------------
# Compare single embeddings
# en-twitter          'test_score': 0.8943
# en-crawl, Fasttext 'test_score': 0.9064,
# flai-multi-forward: 'test_score': 0.7738  WEAK! WHY? Also slow


# ----------------------------

# Glove, en-twitter, en-crawl: 'test_score': 0.8865,  (20 epoch)
# Glove, en-twitter, en-crawl: same + 2-way FLAIR  'test_score': 0.8854 - weaker, but 15 epoch

# bert only, no glove: slow, 'test_score': 0.7329
# Flair only, no glove: 'test_score': 0.8158  (0.06 lower than with Glove)

# Glove, Flair-multi 2ways, Bert-base-cased : OOM

# test_score': 0.8763, Glove + doc-embedding multi 

# test_score': 0.8731, Glove + doc-embedding news


In [29]:
# trainer.train('./', max_epochs=10)

### Predict

In [30]:
def saveResults(savelist, name='all_default'):
    import shelve
    # file to be used
    filename= name+'.shlf'
    shelf = shelve.open(outfolder+filename)
    #shelf = shelve.open("all_flair.shlf")
    # serializing
    #shelf["all_flair"] = all_flair
    shelf[name] = savelist
    shelf.close() # you must close the shelve file!!!

In [31]:
def loadResults(name='all_default'):
    import shelve
    filename= name+'.shlf'
    shelf = shelve.open(outfolder+filename) 
    new = shelf[name]
    shelf.close()
    return new

In [32]:
''' Train with each embedding in the list, predict, add results to the list

Parameters: word_embeddings,    eg   WordEmbeddings('glove')
            modelname and modeldesc - text to be added in results
            savelist : list where results are appended. Can be empty or already including results
            epohcs: epochs to run

'''


def train_and_predict(embeddings, modelname, modeldesc, savelist, epochs=15, batch_size=32):
    
    print(modelname)
    start = time.time()
    
# PREPARE
    document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=512,
                                           bidirectional = False,
                                           rnn_type='LSTM', 
                                           reproject_words=True, reproject_words_dimension=256                                                    
                                           )

    seed_everything(SEED)
    corpus = NLPTaskDataFetcher.load_classification_corpus(data_folder, test_file='flair_test.csv', dev_file='flair_dev.csv', train_file='flair_train.csv')

    seed_everything(SEED)
    label_dict = corpus.make_label_dictionary()

    seed_everything(SEED)
    classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(),
                            multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    
# TRAINING
    seed_everything(SEED)
    trainer.train('./', 
              learning_rate=0.1,
              mini_batch_size=batch_size, # 32  # BERT OOM even with 16 batch -> need 8. others run on 32 or even more
              anneal_factor=0.5,
              patience=5,     
              max_epochs=epochs,  #15
              ) #max_epochs=150

    duration_train = time.time()-start
    
# PREDICT
    start_pred = time.time()

    # turn text into Flairs "Sentence object"
    test['flair_sentence'] = test['text'].apply(lambda x: Sentence(x))

    # discard output, result is put into object itself
    _ = test['flair_sentence'].apply(lambda x: classifier.predict(x))

    # sentence.labels returns a list containing flairs Label object that includes a dict. 
    # dig the values for predicted label + confidence from within the dict
    # the 'value' returns a str, cast it to int
    test['yhat'] = test['flair_sentence'].apply(lambda x: int(x.labels[0].to_dict()['value']))

    test['confidence'] = test['flair_sentence'].apply(lambda x: x.labels[0].to_dict()['confidence'])

    results = pd.DataFrame(test[['yhat', 'confidence']])
    results.columns=['label','confidence']
    results.head()
    
# ADD RESULTS TO LIST

    duration_predict = time.time() - start_pred
    #print(f'Duration {duration:.2f} s')

    savelist.append({'model': modelname,
                'labels': results['label'],
                'confidence': results['confidence'],
                'traintime': duration_train,
                'predtime3k': duration_predict,
                'modeldesc': modeldesc
               }
              )

### function mode


In [33]:
savelist = [] # list to save results
modeldesc = '512LSTM_15epoch_non-bi'
#BATCH_SIZE=32
EPOCHS = 15

#### To use ELMoEmbeddings, please first install with "pip install allennlp"

In [34]:
total_time = time.time()

word_embeddings = [ WordEmbeddings('glove'),              ]
modelname = 'glove'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS)

word_embeddings = [ WordEmbeddings('en-crawl'),                 ]
modelname = 'fasttext web-crawl'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS)

word_embeddings = [ WordEmbeddings('en'),                 ]
modelname = 'fasttext news/wiki'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS)

word_embeddings = [ WordEmbeddings('en-twitter'),                 ]
modelname = 'en-twitter'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS)

word_embeddings = [ ELMoEmbeddings('original')              ]
modelname = 'elmo'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS)

print(time.time() - total_time)

# 1289 sec on 100 train (was 2x elmo)


# 2403 sec on 18k
# 2684 s   18 k

glove
2020-06-23 11:59:33,094 Reading data from /home/max/git/modelcompare/dataset_hatespeech/input
2020-06-23 11:59:33,095 Train: /home/max/git/modelcompare/dataset_hatespeech/input/flair_train.csv
2020-06-23 11:59:33,096 Dev: /home/max/git/modelcompare/dataset_hatespeech/input/flair_dev.csv
2020-06-23 11:59:33,097 Test: /home/max/git/modelcompare/dataset_hatespeech/input/flair_test.csv


  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-06-23 11:59:34,817 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 168988.88it/s]

2020-06-23 11:59:34,821 [b'1', b'2', b'0']
2020-06-23 11:59:34,821 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 96799.08it/s]

2020-06-23 11:59:34,825 [b'1', b'2', b'0']
2020-06-23 11:59:34,829 ----------------------------------------------------------------------------------------------------
2020-06-23 11:59:34,831 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=256, bias=True)
    (rnn): LSTM(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2020-06-23 11:59:34,831 ----------------------------------------------------------------------------------------------------
2020-06-23 11:59:34,832 Corpus: "Corpus: 100 train + 3000 dev + 3000 test sentences"
2020-06-23 11:59:34,833 ----------------------------------------------------------------------------------------------------
2020-06-23 11:59:34,834 Para




2020-06-23 11:59:35,096 epoch 1 - iter 1/4 - loss 0.98181307 - samples/sec: 458.72
2020-06-23 11:59:35,166 epoch 1 - iter 2/4 - loss 0.95769475 - samples/sec: 503.09
2020-06-23 11:59:35,183 epoch 1 - iter 3/4 - loss 0.90562998 - samples/sec: 3110.85
2020-06-23 11:59:35,194 ----------------------------------------------------------------------------------------------------
2020-06-23 11:59:35,195 EPOCH 1 done: loss 0.9056 - lr 0.1000
2020-06-23 11:59:39,585 DEV : loss 0.7174628973007202 - score 0.7667
2020-06-23 11:59:39,718 BAD EPOCHS (no improvement): 0


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-06-23 11:59:42,639 ----------------------------------------------------------------------------------------------------
2020-06-23 11:59:42,683 epoch 2 - iter 0/4 - loss 0.70180404 - samples/sec: 769.43


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-06-23 11:59:42,728 epoch 2 - iter 1/4 - loss 0.77091384 - samples/sec: 889.32
2020-06-23 11:59:42,775 epoch 2 - iter 2/4 - loss 0.76606268 - samples/sec: 801.06
2020-06-23 11:59:42,792 epoch 2 - iter 3/4 - loss 0.67171406 - samples/sec: 3102.44
2020-06-23 11:59:42,798 ----------------------------------------------------------------------------------------------------
2020-06-23 11:59:42,799 EPOCH 2 done: loss 0.6717 - lr 0.1000
2020-06-23 11:59:44,594 DEV : loss 0.6841868162155151 - score 0.7667
2020-06-23 11:59:44,726 BAD EPOCHS (no improvement): 1
2020-06-23 11:59:47,907 ----------------------------------------------------------------------------------------------------
2020-06-23 11:59:47,951 epoch 3 - iter 0/4 - loss 0.82007998 - samples/sec: 757.97
2020-06-23 11:59:47,998 epoch 3 - iter 1/4 - loss 0.72963786 - samples/sec: 849.77
2020-06-23 11:59:48,047 epoch 3 - iter 2/4 - loss 0.68627578 - samples/sec: 810.01
2020-06-23 11:59:48,063 epoch 3 - iter 3/4 - loss 0.77841528 - sa

2020-06-23 12:00:18,943 DEV : loss 0.6587111353874207 - score 0.7673
2020-06-23 12:00:19,079 BAD EPOCHS (no improvement): 1
2020-06-23 12:00:22,376 ----------------------------------------------------------------------------------------------------
2020-06-23 12:00:22,417 epoch 13 - iter 0/4 - loss 0.62206751 - samples/sec: 834.50
2020-06-23 12:00:22,470 epoch 13 - iter 1/4 - loss 0.61146039 - samples/sec: 782.17
2020-06-23 12:00:22,522 epoch 13 - iter 2/4 - loss 0.62978804 - samples/sec: 801.99
2020-06-23 12:00:22,539 epoch 13 - iter 3/4 - loss 0.70366386 - samples/sec: 3076.91
2020-06-23 12:00:22,550 ----------------------------------------------------------------------------------------------------
2020-06-23 12:00:22,551 EPOCH 13 done: loss 0.7037 - lr 0.0500
2020-06-23 12:00:24,551 DEV : loss 0.7141181230545044 - score 0.7213
2020-06-23 12:00:24,683 BAD EPOCHS (no improvement): 2
2020-06-23 12:00:24,685 ------------------------------------------------------------------------------

  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-06-23 12:00:45,396 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 162759.18it/s]

2020-06-23 12:00:45,399 [b'1', b'2', b'0']
2020-06-23 12:00:45,400 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 176527.95it/s]

2020-06-23 12:00:45,404 [b'1', b'2', b'0']
2020-06-23 12:00:45,406 ----------------------------------------------------------------------------------------------------
2020-06-23 12:00:45,408 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('en-crawl')
    )
    (word_reprojection_map): Linear(in_features=300, out_features=256, bias=True)
    (rnn): LSTM(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2020-06-23 12:00:45,408 ----------------------------------------------------------------------------------------------------
2020-06-23 12:00:45,409 Corpus: "Corpus: 100 train + 3000 dev + 3000 test sentences"
2020-06-23 12:00:45,410 ----------------------------------------------------------------------------------------------------
2020-06-23 12:00:45,410 P




2020-06-23 12:00:45,625 epoch 1 - iter 2/4 - loss 1.05136242 - samples/sec: 532.06
2020-06-23 12:00:45,640 epoch 1 - iter 3/4 - loss 1.00603278 - samples/sec: 3371.88
2020-06-23 12:00:45,646 ----------------------------------------------------------------------------------------------------
2020-06-23 12:00:45,646 EPOCH 1 done: loss 1.0060 - lr 0.1000
2020-06-23 12:00:49,757 DEV : loss 0.8139700889587402 - score 0.7667
2020-06-23 12:00:49,895 BAD EPOCHS (no improvement): 0


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-06-23 12:01:00,538 ----------------------------------------------------------------------------------------------------
2020-06-23 12:01:00,584 epoch 2 - iter 0/4 - loss 0.79311574 - samples/sec: 725.07
2020-06-23 12:01:00,623 epoch 2 - iter 1/4 - loss 0.82000515 - samples/sec: 979.06
2020-06-23 12:01:00,675 epoch 2 - iter 2/4 - loss 0.80578156 - samples/sec: 796.47
2020-06-23 12:01:00,694 epoch 2 - iter 3/4 - loss 0.72391286 - samples/sec: 2958.68
2020-06-23 12:01:00,700 ----------------------------------------------------------------------------------------------------
2020-06-23 12:01:00,702 EPOCH 2 done: loss 0.7239 - lr 0.1000
2020-06-23 12:01:02,619 DEV : loss 0.6994582414627075 - score 0.7667
2020-06-23 12:01:02,765 BAD EPOCHS (no improvement): 1
2020-06-23 12:01:13,604 ----------------------------------------------------------------------------------------------------
2020-06-23 12:01:13,665 epoch 3 - iter 0/4 - loss 0.79923809 - samples/sec: 546.64
2020-06-23 12:01:13,707

2020-06-23 12:03:12,194 ----------------------------------------------------------------------------------------------------
2020-06-23 12:03:12,195 EPOCH 12 done: loss 0.6108 - lr 0.0500
2020-06-23 12:03:14,097 DEV : loss 0.6591921448707581 - score 0.7667
2020-06-23 12:03:14,236 BAD EPOCHS (no improvement): 5
2020-06-23 12:03:25,160 ----------------------------------------------------------------------------------------------------
2020-06-23 12:03:25,201 epoch 13 - iter 0/4 - loss 0.63435638 - samples/sec: 811.54
2020-06-23 12:03:25,262 epoch 13 - iter 1/4 - loss 0.61320490 - samples/sec: 650.55
2020-06-23 12:03:25,320 epoch 13 - iter 2/4 - loss 0.63303649 - samples/sec: 692.31
2020-06-23 12:03:25,335 epoch 13 - iter 3/4 - loss 0.70154411 - samples/sec: 3585.07
2020-06-23 12:03:25,341 ----------------------------------------------------------------------------------------------------
2020-06-23 12:03:25,342 EPOCH 13 done: loss 0.7015 - lr 0.0500
2020-06-23 12:03:27,537 DEV : loss 0.6

  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-06-23 12:04:31,926 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 89910.05it/s]

2020-06-23 12:04:31,931 [b'1', b'2', b'0']
2020-06-23 12:04:31,932 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 78855.12it/s]

2020-06-23 12:04:31,937 [b'1', b'2', b'0']
2020-06-23 12:04:31,939 ----------------------------------------------------------------------------------------------------
2020-06-23 12:04:31,940 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('en')
    )
    (word_reprojection_map): Linear(in_features=300, out_features=256, bias=True)
    (rnn): LSTM(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2020-06-23 12:04:31,941 ----------------------------------------------------------------------------------------------------
2020-06-23 12:04:31,941 Corpus: "Corpus: 100 train + 3000 dev + 3000 test sentences"
2020-06-23 12:04:31,942 ----------------------------------------------------------------------------------------------------
2020-06-23 12:04:31,942 Paramet




2020-06-23 12:04:32,150 epoch 1 - iter 2/4 - loss 1.03748167 - samples/sec: 484.59
2020-06-23 12:04:32,166 epoch 1 - iter 3/4 - loss 1.00551321 - samples/sec: 3251.08
2020-06-23 12:04:32,172 ----------------------------------------------------------------------------------------------------
2020-06-23 12:04:32,172 EPOCH 1 done: loss 1.0055 - lr 0.1000
2020-06-23 12:04:35,456 DEV : loss 0.8411567211151123 - score 0.7667
2020-06-23 12:04:35,601 BAD EPOCHS (no improvement): 0


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-06-23 12:04:46,609 ----------------------------------------------------------------------------------------------------
2020-06-23 12:04:46,651 epoch 2 - iter 0/4 - loss 0.82936764 - samples/sec: 790.17
2020-06-23 12:04:46,697 epoch 2 - iter 1/4 - loss 0.84591985 - samples/sec: 907.07
2020-06-23 12:04:46,737 epoch 2 - iter 2/4 - loss 0.83511021 - samples/sec: 978.25
2020-06-23 12:04:46,755 epoch 2 - iter 3/4 - loss 0.75467433 - samples/sec: 3250.22
2020-06-23 12:04:46,761 ----------------------------------------------------------------------------------------------------
2020-06-23 12:04:46,762 EPOCH 2 done: loss 0.7547 - lr 0.1000
2020-06-23 12:04:48,973 DEV : loss 0.7208600044250488 - score 0.7667
2020-06-23 12:04:49,117 BAD EPOCHS (no improvement): 1
2020-06-23 12:05:00,112 ----------------------------------------------------------------------------------------------------
2020-06-23 12:05:00,164 epoch 3 - iter 0/4 - loss 0.81361187 - samples/sec: 638.82
2020-06-23 12:05:00,222

2020-06-23 12:06:59,392 ----------------------------------------------------------------------------------------------------
2020-06-23 12:06:59,393 EPOCH 12 done: loss 0.6757 - lr 0.0500
2020-06-23 12:07:01,329 DEV : loss 0.6699227094650269 - score 0.7667
2020-06-23 12:07:01,473 BAD EPOCHS (no improvement): 5
2020-06-23 12:07:12,329 ----------------------------------------------------------------------------------------------------
2020-06-23 12:07:12,369 epoch 13 - iter 0/4 - loss 0.67127281 - samples/sec: 829.26
2020-06-23 12:07:12,414 epoch 13 - iter 1/4 - loss 0.65598273 - samples/sec: 845.23
2020-06-23 12:07:12,463 epoch 13 - iter 2/4 - loss 0.66497407 - samples/sec: 778.49
2020-06-23 12:07:12,483 epoch 13 - iter 3/4 - loss 0.74663119 - samples/sec: 2883.61
2020-06-23 12:07:12,491 ----------------------------------------------------------------------------------------------------
2020-06-23 12:07:12,492 EPOCH 13 done: loss 0.7466 - lr 0.0500
2020-06-23 12:07:14,701 DEV : loss 0.6

  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-06-23 12:08:19,878 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 166111.05it/s]

2020-06-23 12:08:19,881 [b'1', b'2', b'0']
2020-06-23 12:08:19,882 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 140795.70it/s]

2020-06-23 12:08:19,885 [b'1', b'2', b'0']
2020-06-23 12:08:19,888 ----------------------------------------------------------------------------------------------------
2020-06-23 12:08:19,889 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('en-twitter')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=256, bias=True)
    (rnn): LSTM(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2020-06-23 12:08:19,889 ----------------------------------------------------------------------------------------------------
2020-06-23 12:08:19,889 Corpus: "Corpus: 100 train + 3000 dev + 3000 test sentences"
2020-06-23 12:08:19,891 ----------------------------------------------------------------------------------------------------
2020-06-23 12:08:19,891




2020-06-23 12:08:20,106 epoch 1 - iter 2/4 - loss 0.85201408 - samples/sec: 563.93
2020-06-23 12:08:20,123 epoch 1 - iter 3/4 - loss 0.84898101 - samples/sec: 3002.57
2020-06-23 12:08:20,130 ----------------------------------------------------------------------------------------------------
2020-06-23 12:08:20,131 EPOCH 1 done: loss 0.8490 - lr 0.1000
2020-06-23 12:08:23,423 DEV : loss 0.7212163805961609 - score 0.7667
2020-06-23 12:08:23,570 BAD EPOCHS (no improvement): 0


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-06-23 12:08:33,701 ----------------------------------------------------------------------------------------------------
2020-06-23 12:08:33,761 epoch 2 - iter 0/4 - loss 0.67330611 - samples/sec: 554.64
2020-06-23 12:08:33,806 epoch 2 - iter 1/4 - loss 0.71372572 - samples/sec: 954.63
2020-06-23 12:08:33,854 epoch 2 - iter 2/4 - loss 0.71139894 - samples/sec: 770.93
2020-06-23 12:08:33,877 epoch 2 - iter 3/4 - loss 0.62098975 - samples/sec: 2951.79
2020-06-23 12:08:33,883 ----------------------------------------------------------------------------------------------------
2020-06-23 12:08:33,884 EPOCH 2 done: loss 0.6210 - lr 0.1000
2020-06-23 12:08:35,968 DEV : loss 0.6901137232780457 - score 0.7667
2020-06-23 12:08:36,116 BAD EPOCHS (no improvement): 1
2020-06-23 12:08:46,092 ----------------------------------------------------------------------------------------------------
2020-06-23 12:08:46,150 epoch 3 - iter 0/4 - loss 0.89927065 - samples/sec: 565.48
2020-06-23 12:08:46,205

2020-06-23 12:10:08,095 EPOCH 12 done: loss 0.5459 - lr 0.1000
2020-06-23 12:10:10,034 DEV : loss 0.6181123852729797 - score 0.7693
2020-06-23 12:10:10,180 BAD EPOCHS (no improvement): 2
2020-06-23 12:10:10,181 ----------------------------------------------------------------------------------------------------
2020-06-23 12:10:10,217 epoch 13 - iter 0/4 - loss 0.59038860 - samples/sec: 930.22
2020-06-23 12:10:10,265 epoch 13 - iter 1/4 - loss 0.56778693 - samples/sec: 767.01
2020-06-23 12:10:10,315 epoch 13 - iter 2/4 - loss 0.58584915 - samples/sec: 836.27
2020-06-23 12:10:10,336 epoch 13 - iter 3/4 - loss 0.73433064 - samples/sec: 3136.73
2020-06-23 12:10:10,343 ----------------------------------------------------------------------------------------------------
2020-06-23 12:10:10,343 EPOCH 13 done: loss 0.7343 - lr 0.1000
2020-06-23 12:10:12,357 DEV : loss 0.7417793869972229 - score 0.6743
2020-06-23 12:10:12,505 BAD EPOCHS (no improvement): 3
2020-06-23 12:10:12,507 ---------------

  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-06-23 12:11:11,670 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 219138.14it/s]

2020-06-23 12:11:11,673 [b'1', b'2', b'0']
2020-06-23 12:11:11,673 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 255750.24it/s]

2020-06-23 12:11:11,675 [b'1', b'2', b'0']
2020-06-23 12:11:11,678 ----------------------------------------------------------------------------------------------------
2020-06-23 12:11:11,679 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): ELMoEmbeddings(model=0-elmo-original)
    )
    (word_reprojection_map): Linear(in_features=3072, out_features=256, bias=True)
    (rnn): LSTM(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2020-06-23 12:11:11,680 ----------------------------------------------------------------------------------------------------
2020-06-23 12:11:11,682 Corpus: "Corpus: 100 train + 3000 dev + 3000 test sentences"
2020-06-23 12:11:11,682 ----------------------------------------------------------------------------------------------------
2020-06-23 12




2020-06-23 12:11:11,945 epoch 1 - iter 0/4 - loss 1.20414293 - samples/sec: 129.35
2020-06-23 12:11:12,243 epoch 1 - iter 1/4 - loss 0.90223247 - samples/sec: 112.48
2020-06-23 12:11:12,467 epoch 1 - iter 2/4 - loss 0.88235418 - samples/sec: 146.83
2020-06-23 12:11:12,535 epoch 1 - iter 3/4 - loss 0.85789797 - samples/sec: 520.09
2020-06-23 12:11:12,546 ----------------------------------------------------------------------------------------------------
2020-06-23 12:11:12,547 EPOCH 1 done: loss 0.8579 - lr 0.1000
2020-06-23 12:11:31,969 DEV : loss 0.6743201017379761 - score 0.7667
2020-06-23 12:11:32,118 BAD EPOCHS (no improvement): 0


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-06-23 12:11:33,207 ----------------------------------------------------------------------------------------------------
2020-06-23 12:11:33,268 epoch 2 - iter 0/4 - loss 0.66289943 - samples/sec: 545.88
2020-06-23 12:11:33,315 epoch 2 - iter 1/4 - loss 0.71299613 - samples/sec: 795.10
2020-06-23 12:11:33,371 epoch 2 - iter 2/4 - loss 0.71684919 - samples/sec: 684.60
2020-06-23 12:11:33,392 epoch 2 - iter 3/4 - loss 0.59646435 - samples/sec: 3026.13
2020-06-23 12:11:33,398 ----------------------------------------------------------------------------------------------------
2020-06-23 12:11:33,399 EPOCH 2 done: loss 0.5965 - lr 0.1000
2020-06-23 12:11:35,585 DEV : loss 0.9560748934745789 - score 0.7667
2020-06-23 12:11:35,735 BAD EPOCHS (no improvement): 1
2020-06-23 12:11:36,845 ----------------------------------------------------------------------------------------------------
2020-06-23 12:11:36,892 epoch 3 - iter 0/4 - loss 1.06638706 - samples/sec: 708.91
2020-06-23 12:11:36,936

2020-06-23 12:12:01,006 ----------------------------------------------------------------------------------------------------
2020-06-23 12:12:01,007 EPOCH 12 done: loss 0.3100 - lr 0.0500
2020-06-23 12:12:03,293 DEV : loss 0.7280484437942505 - score 0.728
2020-06-23 12:12:03,440 BAD EPOCHS (no improvement): 2
2020-06-23 12:12:03,442 ----------------------------------------------------------------------------------------------------
2020-06-23 12:12:03,480 epoch 13 - iter 0/4 - loss 0.25505537 - samples/sec: 868.18
2020-06-23 12:12:03,534 epoch 13 - iter 1/4 - loss 0.19914441 - samples/sec: 764.66
2020-06-23 12:12:03,584 epoch 13 - iter 2/4 - loss 0.24425759 - samples/sec: 774.43
2020-06-23 12:12:03,602 epoch 13 - iter 3/4 - loss 0.29874956 - samples/sec: 2942.53
2020-06-23 12:12:03,609 ----------------------------------------------------------------------------------------------------
2020-06-23 12:12:03,610 EPOCH 13 done: loss 0.2987 - lr 0.0500
2020-06-23 12:12:05,907 DEV : loss 0.81

In [34]:
# Flair
# https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
total_time = time.time()
word_embeddings = [ # FlairEmbeddings('multi-forward'), # this is 300 languge, gave very low score
                  # FlairEmbeddings('multi-backward'), 
                   FlairEmbeddings('news-forward'),  #  	English 	Trained with 1 billion word corpus                   
                   
                  ]
#modelname = 'Flair-news-fwd'
modelname = 'Flair'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS)

print(time.time() - total_time)

# 167s in 100 train
# 620s for 18k

Flair
2020-05-15 16:16:53,990 Reading data from /home/max/git/newcombined/dataset_hatespeech/input
2020-05-15 16:16:53,991 Train: /home/max/git/newcombined/dataset_hatespeech/input/flair_train.csv
2020-05-15 16:16:53,992 Dev: /home/max/git/newcombined/dataset_hatespeech/input/flair_dev.csv
2020-05-15 16:16:53,993 Test: /home/max/git/newcombined/dataset_hatespeech/input/flair_test.csv


  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-05-15 16:16:55,465 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 92040.90it/s]

2020-05-15 16:16:55,468 [b'1', b'2', b'0']
2020-05-15 16:16:55,469 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 182281.79it/s]

2020-05-15 16:16:55,473 [b'1', b'2', b'0']
2020-05-15 16:16:55,477 ----------------------------------------------------------------------------------------------------
2020-05-15 16:16:55,478 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2048, out_features=256, bias=True)
    (rnn): LSTM(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2020-05-15 16:16:55,479 ----------------------------------------------------------------------------------------------------




2020-05-15 16:16:55,788 epoch 1 - iter 1/4 - loss 1.03974864 - samples/sec: 199.91
2020-05-15 16:16:55,934 epoch 1 - iter 2/4 - loss 1.02089870 - samples/sec: 234.98
2020-05-15 16:16:55,987 epoch 1 - iter 3/4 - loss 0.99495506 - samples/sec: 766.80
2020-05-15 16:16:55,997 ----------------------------------------------------------------------------------------------------
2020-05-15 16:16:55,998 EPOCH 1 done: loss 0.9950 - lr 0.1000
2020-05-15 16:17:07,322 DEV : loss 0.8543857336044312 - score 0.7667
2020-05-15 16:17:07,459 BAD EPOCHS (no improvement): 0


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-05-15 16:17:07,736 ----------------------------------------------------------------------------------------------------
2020-05-15 16:17:07,793 epoch 2 - iter 0/4 - loss 0.83632404 - samples/sec: 591.07
2020-05-15 16:17:07,840 epoch 2 - iter 1/4 - loss 0.85096899 - samples/sec: 889.78
2020-05-15 16:17:07,890 epoch 2 - iter 2/4 - loss 0.83957122 - samples/sec: 824.34
2020-05-15 16:17:07,915 epoch 2 - iter 3/4 - loss 0.76601796 - samples/sec: 3255.42
2020-05-15 16:17:07,930 ----------------------------------------------------------------------------------------------------
2020-05-15 16:17:07,931 EPOCH 2 done: loss 0.7660 - lr 0.1000
2020-05-15 16:17:09,805 DEV : loss 0.7338841557502747 - score 0.7667
2020-05-15 16:17:09,942 BAD EPOCHS (no improvement): 1
2020-05-15 16:17:10,213 ----------------------------------------------------------------------------------------------------
2020-05-15 16:17:10,261 epoch 3 - iter 0/4 - loss 0.81052619 - samples/sec: 709.31
2020-05-15 16:17:10,310

2020-05-15 16:17:33,044 ----------------------------------------------------------------------------------------------------
2020-05-15 16:17:33,045 EPOCH 12 done: loss 0.6617 - lr 0.0500
2020-05-15 16:17:34,992 DEV : loss 0.6712132692337036 - score 0.7667
2020-05-15 16:17:35,126 BAD EPOCHS (no improvement): 5
2020-05-15 16:17:35,379 ----------------------------------------------------------------------------------------------------
2020-05-15 16:17:35,419 epoch 13 - iter 0/4 - loss 0.65164173 - samples/sec: 849.37
2020-05-15 16:17:35,474 epoch 13 - iter 1/4 - loss 0.63884535 - samples/sec: 759.30
2020-05-15 16:17:35,526 epoch 13 - iter 2/4 - loss 0.66198879 - samples/sec: 766.89
2020-05-15 16:17:35,552 epoch 13 - iter 3/4 - loss 0.74194731 - samples/sec: 2931.03
2020-05-15 16:17:35,567 ----------------------------------------------------------------------------------------------------
2020-05-15 16:17:35,568 EPOCH 13 done: loss 0.7419 - lr 0.0500
2020-05-15 16:17:37,529 DEV : loss 0.6

In [35]:
# BERT OOMs on 32 batch, use 8
total_time = time.time()

# Bert Cased - separating lower and upper case, uncased - ignoring case.
# Use cased

word_embeddings = [ BertEmbeddings('bert-base-cased'),                ]
modelname = 'bert-base-cased'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS, batch_size=8)

#word_embeddings = [ BertEmbeddings('bert-base-uncased'),                ]
#modelname = 'bert-base-uncased'
#train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS, batch_size=8)

#BPE - takes memory - reduce batch size radically!
word_embeddings = [ BytePairEmbeddings('en'),   ]
modelname = 'BytePairEmbedding'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS, batch_size=8)               

print(time.time() - total_time)

# 450 s 18k

bert-base-cased
2020-05-15 16:19:28,381 Reading data from /home/max/git/newcombined/dataset_hatespeech/input
2020-05-15 16:19:28,382 Train: /home/max/git/newcombined/dataset_hatespeech/input/flair_train.csv
2020-05-15 16:19:28,383 Dev: /home/max/git/newcombined/dataset_hatespeech/input/flair_dev.csv
2020-05-15 16:19:28,383 Test: /home/max/git/newcombined/dataset_hatespeech/input/flair_test.csv


  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-05-15 16:19:29,825 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 214542.40it/s]

2020-05-15 16:19:29,828 [b'1', b'2', b'0']
2020-05-15 16:19:29,829 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 262965.77it/s]

2020-05-15 16:19:29,831 [b'1', b'2', b'0']
2020-05-15 16:19:29,837 ----------------------------------------------------------------------------------------------------
2020-05-15 16:19:29,841 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): BertEmbeddings(
        (model): BertModel(
          (embeddings): BertEmbeddings(
            (word_embeddings): Embedding(28996, 768, padding_idx=0)
            (position_embeddings): Embedding(512, 768)
            (token_type_embeddings): Embedding(2, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): BertEncoder(
            (layer): ModuleList(
              (0): BertLayer(
                (attention): BertAttention(
                  (self): BertSelfAttention(
                    (query): Linear(in_features=768, out_features=768, bias=True)

2020-05-15 16:19:29,842 ----------------------------------------------------------------------------------------------------
2020-05-15 16:19:29,843 Corpus: "Corpus: 100 train + 3000 dev + 3000 test sentences"
2020-05-15 16:19:29,843 ----------------------------------------------------------------------------------------------------
2020-05-15 16:19:29,844 Parameters:
2020-05-15 16:19:29,844  - learning_rate: "0.1"
2020-05-15 16:19:29,845  - mini_batch_size: "8"
2020-05-15 16:19:29,846  - patience: "5"
2020-05-15 16:19:29,846  - anneal_factor: "0.5"
2020-05-15 16:19:29,847  - max_epochs: "15"
2020-05-15 16:19:29,848  - shuffle: "True"
2020-05-15 16:19:29,848  - train_with_dev: "False"
2020-05-15 16:19:29,849  - batch_growth_annealing: "False"
2020-05-15 16:19:29,849 ----------------------------------------------------------------------------------------------------
2020-05-15 16:19:29,850 Model training base path: "."
2020-05-15 16:19:29,851 --------------------------------------------




2020-05-15 16:19:30,090 epoch 1 - iter 0/13 - loss 1.41461110 - samples/sec: 34.40
2020-05-15 16:19:30,558 epoch 1 - iter 1/13 - loss 1.20395559 - samples/sec: 24.35
2020-05-15 16:19:31,002 epoch 1 - iter 2/13 - loss 1.07781065 - samples/sec: 24.69
2020-05-15 16:19:31,335 epoch 1 - iter 3/13 - loss 1.37533584 - samples/sec: 38.41
2020-05-15 16:19:31,682 epoch 1 - iter 4/13 - loss 1.32350943 - samples/sec: 35.39
2020-05-15 16:19:32,022 epoch 1 - iter 5/13 - loss 1.16319876 - samples/sec: 36.86
2020-05-15 16:19:32,619 epoch 1 - iter 6/13 - loss 1.09168454 - samples/sec: 16.78
2020-05-15 16:19:33,121 epoch 1 - iter 7/13 - loss 1.02606053 - samples/sec: 21.14
2020-05-15 16:19:33,481 epoch 1 - iter 8/13 - loss 1.00356418 - samples/sec: 33.70
2020-05-15 16:19:33,757 epoch 1 - iter 9/13 - loss 1.00424483 - samples/sec: 50.61
2020-05-15 16:19:34,130 epoch 1 - iter 10/13 - loss 1.04102479 - samples/sec: 32.07
2020-05-15 16:19:34,501 epoch 1 - iter 11/13 - loss 1.05993107 - samples/sec: 32.20
20

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-05-15 16:21:06,661 ----------------------------------------------------------------------------------------------------
2020-05-15 16:21:06,686 epoch 2 - iter 0/13 - loss 0.52802789 - samples/sec: 375.81
2020-05-15 16:21:06,832 epoch 2 - iter 1/13 - loss 0.43738170 - samples/sec: 451.46
2020-05-15 16:21:06,979 epoch 2 - iter 2/13 - loss 0.67428387 - samples/sec: 417.62
2020-05-15 16:21:07,126 epoch 2 - iter 3/13 - loss 0.74903423 - samples/sec: 455.77
2020-05-15 16:21:07,264 epoch 2 - iter 4/13 - loss 0.90052343 - samples/sec: 521.10
2020-05-15 16:21:07,406 epoch 2 - iter 5/13 - loss 0.91421305 - samples/sec: 475.56
2020-05-15 16:21:07,547 epoch 2 - iter 6/13 - loss 0.88897076 - samples/sec: 426.80
2020-05-15 16:21:07,690 epoch 2 - iter 7/13 - loss 1.02994783 - samples/sec: 442.89
2020-05-15 16:21:07,833 epoch 2 - iter 8/13 - loss 1.01186102 - samples/sec: 450.61
2020-05-15 16:21:07,970 epoch 2 - iter 9/13 - loss 0.97415586 - samples/sec: 534.95
2020-05-15 16:21:08,110 epoch 2 - i

2020-05-15 16:21:34,759 epoch 7 - iter 5/13 - loss 0.56057896 - samples/sec: 527.27
2020-05-15 16:21:34,920 epoch 7 - iter 6/13 - loss 0.70415018 - samples/sec: 489.75
2020-05-15 16:21:35,073 epoch 7 - iter 7/13 - loss 0.71458572 - samples/sec: 569.11
2020-05-15 16:21:35,230 epoch 7 - iter 8/13 - loss 0.72167468 - samples/sec: 481.70
2020-05-15 16:21:35,388 epoch 7 - iter 9/13 - loss 0.69868794 - samples/sec: 416.98
2020-05-15 16:21:35,554 epoch 7 - iter 10/13 - loss 0.64447232 - samples/sec: 432.16
2020-05-15 16:21:35,711 epoch 7 - iter 11/13 - loss 0.63086717 - samples/sec: 512.96
2020-05-15 16:21:35,873 epoch 7 - iter 12/13 - loss 0.60111670 - samples/sec: 582.54
2020-05-15 16:21:36,004 ----------------------------------------------------------------------------------------------------
2020-05-15 16:21:36,004 EPOCH 7 done: loss 0.6011 - lr 0.1000
2020-05-15 16:21:38,544 DEV : loss 0.7002679705619812 - score 0.747
Epoch     6: reducing learning rate of group 0 to 5.0000e-02.
2020-05-

2020-05-15 16:21:59,663 epoch 12 - iter 11/13 - loss 0.45356869 - samples/sec: 506.10
2020-05-15 16:21:59,811 epoch 12 - iter 12/13 - loss 0.43297456 - samples/sec: 549.87
2020-05-15 16:21:59,938 ----------------------------------------------------------------------------------------------------
2020-05-15 16:21:59,939 EPOCH 12 done: loss 0.4330 - lr 0.0500
2020-05-15 16:22:02,448 DEV : loss 0.6851819753646851 - score 0.7423
2020-05-15 16:22:02,589 BAD EPOCHS (no improvement): 4
2020-05-15 16:22:02,590 ----------------------------------------------------------------------------------------------------
2020-05-15 16:22:02,610 epoch 13 - iter 0/13 - loss 0.23217896 - samples/sec: 482.25
2020-05-15 16:22:02,756 epoch 13 - iter 1/13 - loss 0.23361129 - samples/sec: 476.19
2020-05-15 16:22:02,897 epoch 13 - iter 2/13 - loss 0.16635740 - samples/sec: 480.15
2020-05-15 16:22:03,037 epoch 13 - iter 3/13 - loss 0.42903942 - samples/sec: 472.20
2020-05-15 16:22:03,180 epoch 13 - iter 4/13 - loss

  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-05-15 16:25:03,241 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 93020.71it/s]

2020-05-15 16:25:03,246 [b'1', b'2', b'0']
2020-05-15 16:25:03,247 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 169398.38it/s]

2020-05-15 16:25:03,251 [b'1', b'2', b'0']
2020-05-15 16:25:03,254 ----------------------------------------------------------------------------------------------------
2020-05-15 16:25:03,255 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): BytePairEmbeddings(model=0-bpe-en-100000-50)
    )
    (word_reprojection_map): Linear(in_features=100, out_features=256, bias=True)
    (rnn): LSTM(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=3, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2020-05-15 16:25:03,256 ----------------------------------------------------------------------------------------------------
2020-05-15 16:25:03,258 Corpus: "Corpus: 100 train + 3000 dev + 3000 test sentences"
2020-05-15 16:25:03,258 ----------------------------------------------------------------------------------------------------
2020-05




2020-05-15 16:25:03,465 epoch 1 - iter 4/13 - loss 0.92073661 - samples/sec: 276.75
2020-05-15 16:25:03,500 epoch 1 - iter 5/13 - loss 0.87916565 - samples/sec: 267.92
2020-05-15 16:25:03,541 epoch 1 - iter 6/13 - loss 0.82724278 - samples/sec: 226.60
2020-05-15 16:25:03,586 epoch 1 - iter 7/13 - loss 0.79553075 - samples/sec: 203.32
2020-05-15 16:25:03,624 epoch 1 - iter 8/13 - loss 0.81103737 - samples/sec: 306.21
2020-05-15 16:25:03,658 epoch 1 - iter 9/13 - loss 0.83728761 - samples/sec: 276.29
2020-05-15 16:25:03,697 epoch 1 - iter 10/13 - loss 0.85851285 - samples/sec: 248.32
2020-05-15 16:25:03,731 epoch 1 - iter 11/13 - loss 0.82654532 - samples/sec: 303.97
2020-05-15 16:25:03,755 epoch 1 - iter 12/13 - loss 0.83152409 - samples/sec: 446.98
2020-05-15 16:25:03,764 ----------------------------------------------------------------------------------------------------
2020-05-15 16:25:03,765 EPOCH 1 done: loss 0.8315 - lr 0.1000
2020-05-15 16:25:11,260 DEV : loss 0.6747785806655884 

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-05-15 16:25:12,230 ----------------------------------------------------------------------------------------------------
2020-05-15 16:25:12,253 epoch 2 - iter 0/13 - loss 0.79609919 - samples/sec: 389.72
2020-05-15 16:25:12,274 epoch 2 - iter 1/13 - loss 0.55464861 - samples/sec: 517.61
2020-05-15 16:25:12,302 epoch 2 - iter 2/13 - loss 0.55214993 - samples/sec: 376.95
2020-05-15 16:25:12,324 epoch 2 - iter 3/13 - loss 0.60946737 - samples/sec: 500.39


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-05-15 16:25:12,352 epoch 2 - iter 4/13 - loss 0.64863864 - samples/sec: 510.80
2020-05-15 16:25:12,374 epoch 2 - iter 5/13 - loss 0.65376938 - samples/sec: 508.08
2020-05-15 16:25:12,401 epoch 2 - iter 6/13 - loss 0.62946188 - samples/sec: 441.76
2020-05-15 16:25:12,422 epoch 2 - iter 7/13 - loss 0.67950963 - samples/sec: 516.87
2020-05-15 16:25:12,449 epoch 2 - iter 8/13 - loss 0.68462497 - samples/sec: 386.93
2020-05-15 16:25:12,474 epoch 2 - iter 9/13 - loss 0.66254631 - samples/sec: 429.56
2020-05-15 16:25:12,498 epoch 2 - iter 10/13 - loss 0.64845571 - samples/sec: 526.15
2020-05-15 16:25:12,527 epoch 2 - iter 11/13 - loss 0.67873405 - samples/sec: 482.43
2020-05-15 16:25:12,548 epoch 2 - iter 12/13 - loss 0.65932834 - samples/sec: 797.81
2020-05-15 16:25:12,554 ----------------------------------------------------------------------------------------------------
2020-05-15 16:25:12,555 EPOCH 2 done: loss 0.6593 - lr 0.1000
2020-05-15 16:25:14,878 DEV : loss 0.6721192002296448 

2020-05-15 16:25:28,307 epoch 7 - iter 11/13 - loss 0.58229414 - samples/sec: 608.88
2020-05-15 16:25:28,324 epoch 7 - iter 12/13 - loss 0.57068111 - samples/sec: 758.17
2020-05-15 16:25:28,334 ----------------------------------------------------------------------------------------------------
2020-05-15 16:25:28,335 EPOCH 7 done: loss 0.5707 - lr 0.1000
2020-05-15 16:25:31,000 DEV : loss 0.6939222812652588 - score 0.7663
2020-05-15 16:25:31,149 BAD EPOCHS (no improvement): 3
2020-05-15 16:25:31,150 ----------------------------------------------------------------------------------------------------
2020-05-15 16:25:31,167 epoch 8 - iter 0/13 - loss 0.29959819 - samples/sec: 524.85
2020-05-15 16:25:31,194 epoch 8 - iter 1/13 - loss 0.52598937 - samples/sec: 502.93
2020-05-15 16:25:31,220 epoch 8 - iter 2/13 - loss 0.44922308 - samples/sec: 540.19
2020-05-15 16:25:31,244 epoch 8 - iter 3/13 - loss 0.57538338 - samples/sec: 456.77
2020-05-15 16:25:31,271 epoch 8 - iter 4/13 - loss 0.50351

2020-05-15 16:25:46,051 ----------------------------------------------------------------------------------------------------
2020-05-15 16:25:46,067 epoch 13 - iter 0/13 - loss 0.71879673 - samples/sec: 563.31
2020-05-15 16:25:46,088 epoch 13 - iter 1/13 - loss 0.49519822 - samples/sec: 558.13
2020-05-15 16:25:46,107 epoch 13 - iter 2/13 - loss 0.44308680 - samples/sec: 692.49
2020-05-15 16:25:46,127 epoch 13 - iter 3/13 - loss 0.49287412 - samples/sec: 601.19
2020-05-15 16:25:46,160 epoch 13 - iter 4/13 - loss 0.49923037 - samples/sec: 434.40
2020-05-15 16:25:46,183 epoch 13 - iter 5/13 - loss 0.45436286 - samples/sec: 489.58
2020-05-15 16:25:46,212 epoch 13 - iter 6/13 - loss 0.44483879 - samples/sec: 372.55
2020-05-15 16:25:46,243 epoch 13 - iter 7/13 - loss 0.46565047 - samples/sec: 409.26
2020-05-15 16:25:46,268 epoch 13 - iter 8/13 - loss 0.47630741 - samples/sec: 586.07
2020-05-15 16:25:46,297 epoch 13 - iter 9/13 - loss 0.46855751 - samples/sec: 358.03
2020-05-15 16:25:46,328 e

In [36]:
# GPT-1

total_time = time.time()
#Do we need batchsize 8 here?

word_embeddings = [ OpenAIGPTEmbeddings(),                ]
modelname = 'gpt-1'
train_and_predict(word_embeddings, modelname, modeldesc, savelist=savelist, epochs=EPOCHS, batch_size=8)

print(time.time() - total_time)

# 302 sec 18k

gpt-1
2020-05-15 16:26:16,286 Reading data from /home/max/git/newcombined/dataset_hatespeech/input
2020-05-15 16:26:16,286 Train: /home/max/git/newcombined/dataset_hatespeech/input/flair_train.csv
2020-05-15 16:26:16,287 Dev: /home/max/git/newcombined/dataset_hatespeech/input/flair_dev.csv
2020-05-15 16:26:16,287 Test: /home/max/git/newcombined/dataset_hatespeech/input/flair_test.csv


  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


2020-05-15 16:26:17,748 Computing label dictionary. Progress:


100%|██████████| 100/100 [00:00<00:00, 205200.78it/s]

2020-05-15 16:26:17,751 [b'1', b'2', b'0']
2020-05-15 16:26:17,752 Computing label dictionary. Progress:



100%|██████████| 100/100 [00:00<00:00, 259870.14it/s]

2020-05-15 16:26:17,754 [b'1', b'2', b'0']
2020-05-15 16:26:17,762 ----------------------------------------------------------------------------------------------------
2020-05-15 16:26:17,764 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): OpenAIGPTEmbeddings(
        model=0-openai-gpt
        (model): OpenAIGPTModel(
          (tokens_embed): Embedding(40478, 768)
          (positions_embed): Embedding(512, 768)
          (drop): Dropout(p=0.1, inplace=False)
          (h): ModuleList(
            (0): Block(
              (attn): Attention(
                (c_attn): Conv1D()
                (c_proj): Conv1D()
                (attn_dropout): Dropout(p=0.1, inplace=False)
                (resid_dropout): Dropout(p=0.1, inplace=False)
              )
              (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): MLP(
                (c_fc): Conv1D()
                (

2020-05-15 16:26:17,765 Corpus: "Corpus: 100 train + 3000 dev + 3000 test sentences"
2020-05-15 16:26:17,766 ----------------------------------------------------------------------------------------------------
2020-05-15 16:26:17,766 Parameters:
2020-05-15 16:26:17,766  - learning_rate: "0.1"
2020-05-15 16:26:17,767  - mini_batch_size: "8"
2020-05-15 16:26:17,767  - patience: "5"
2020-05-15 16:26:17,767  - anneal_factor: "0.5"
2020-05-15 16:26:17,768  - max_epochs: "15"
2020-05-15 16:26:17,768  - shuffle: "True"
2020-05-15 16:26:17,768  - train_with_dev: "False"
2020-05-15 16:26:17,769  - batch_growth_annealing: "False"
2020-05-15 16:26:17,769 ----------------------------------------------------------------------------------------------------
2020-05-15 16:26:17,769 Model training base path: "."
2020-05-15 16:26:17,770 ----------------------------------------------------------------------------------------------------
2020-05-15 16:26:17,770 Device: cuda:0
2020-05-15 16:26:17,771 -----




2020-05-15 16:26:18,228 epoch 1 - iter 1/13 - loss 1.02366441 - samples/sec: 46.87
2020-05-15 16:26:18,507 epoch 1 - iter 2/13 - loss 0.89995372 - samples/sec: 44.56
2020-05-15 16:26:18,777 epoch 1 - iter 3/13 - loss 0.89521576 - samples/sec: 49.18
2020-05-15 16:26:19,048 epoch 1 - iter 4/13 - loss 0.90249537 - samples/sec: 48.36
2020-05-15 16:26:19,305 epoch 1 - iter 5/13 - loss 0.86521447 - samples/sec: 50.51
2020-05-15 16:26:19,589 epoch 1 - iter 6/13 - loss 0.80853843 - samples/sec: 43.72
2020-05-15 16:26:19,881 epoch 1 - iter 7/13 - loss 0.76403137 - samples/sec: 44.48
2020-05-15 16:26:20,148 epoch 1 - iter 8/13 - loss 0.76629166 - samples/sec: 48.27
2020-05-15 16:26:20,410 epoch 1 - iter 9/13 - loss 0.77882037 - samples/sec: 49.42
2020-05-15 16:26:20,675 epoch 1 - iter 10/13 - loss 0.79162548 - samples/sec: 49.06
2020-05-15 16:26:20,932 epoch 1 - iter 11/13 - loss 0.75281586 - samples/sec: 50.92
2020-05-15 16:26:21,111 epoch 1 - iter 12/13 - loss 0.76204223 - samples/sec: 106.16


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-05-15 16:27:23,673 ----------------------------------------------------------------------------------------------------
2020-05-15 16:27:23,696 epoch 2 - iter 0/13 - loss 0.63897491 - samples/sec: 430.39
2020-05-15 16:27:23,818 epoch 2 - iter 1/13 - loss 0.45528595 - samples/sec: 486.47
2020-05-15 16:27:23,939 epoch 2 - iter 2/13 - loss 0.49192606 - samples/sec: 506.12
2020-05-15 16:27:24,061 epoch 2 - iter 3/13 - loss 0.53844932 - samples/sec: 460.98
2020-05-15 16:27:24,183 epoch 2 - iter 4/13 - loss 0.56347318 - samples/sec: 528.67
2020-05-15 16:27:24,303 epoch 2 - iter 5/13 - loss 0.58326705 - samples/sec: 600.20
2020-05-15 16:27:24,431 epoch 2 - iter 6/13 - loss 0.57723636 - samples/sec: 438.36
2020-05-15 16:27:24,552 epoch 2 - iter 7/13 - loss 0.60624900 - samples/sec: 466.90
2020-05-15 16:27:24,681 epoch 2 - iter 8/13 - loss 0.61534938 - samples/sec: 406.37
2020-05-15 16:27:24,807 epoch 2 - iter 9/13 - loss 0.59594664 - samples/sec: 554.91
2020-05-15 16:27:24,928 epoch 2 - i

2020-05-15 16:27:50,242 epoch 7 - iter 5/13 - loss 0.21901545 - samples/sec: 552.09
2020-05-15 16:27:50,364 epoch 7 - iter 6/13 - loss 0.23229106 - samples/sec: 477.70
2020-05-15 16:27:50,484 epoch 7 - iter 7/13 - loss 0.26107138 - samples/sec: 574.04
2020-05-15 16:27:50,604 epoch 7 - iter 8/13 - loss 0.26079476 - samples/sec: 486.37
2020-05-15 16:27:50,728 epoch 7 - iter 9/13 - loss 0.26099357 - samples/sec: 429.84
2020-05-15 16:27:50,849 epoch 7 - iter 10/13 - loss 0.24235101 - samples/sec: 491.15
2020-05-15 16:27:50,967 epoch 7 - iter 11/13 - loss 0.27669199 - samples/sec: 563.88
2020-05-15 16:27:51,107 epoch 7 - iter 12/13 - loss 0.27300339 - samples/sec: 554.26
2020-05-15 16:27:51,266 ----------------------------------------------------------------------------------------------------
2020-05-15 16:27:51,267 EPOCH 7 done: loss 0.2730 - lr 0.1000
2020-05-15 16:27:54,252 DEV : loss 0.688732922077179 - score 0.7683
2020-05-15 16:27:54,388 BAD EPOCHS (no improvement): 2
2020-05-15 16:2

2020-05-15 16:28:14,586 epoch 12 - iter 11/13 - loss 0.12967260 - samples/sec: 433.34
2020-05-15 16:28:14,703 epoch 12 - iter 12/13 - loss 0.14627472 - samples/sec: 436.42
2020-05-15 16:28:14,803 ----------------------------------------------------------------------------------------------------
2020-05-15 16:28:14,803 EPOCH 12 done: loss 0.1463 - lr 0.1000
2020-05-15 16:28:17,459 DEV : loss 1.060694932937622 - score 0.7683
2020-05-15 16:28:17,601 BAD EPOCHS (no improvement): 1
2020-05-15 16:28:17,602 ----------------------------------------------------------------------------------------------------
2020-05-15 16:28:17,622 epoch 13 - iter 0/13 - loss 0.19827484 - samples/sec: 463.82
2020-05-15 16:28:17,741 epoch 13 - iter 1/13 - loss 0.10357209 - samples/sec: 474.06
2020-05-15 16:28:17,863 epoch 13 - iter 2/13 - loss 0.08482356 - samples/sec: 484.10
2020-05-15 16:28:18,018 epoch 13 - iter 3/13 - loss 0.06928484 - samples/sec: 460.17
2020-05-15 16:28:18,137 epoch 13 - iter 4/13 - loss 

### SAVE

In [37]:
#name='all_flair_512LSTM_15ep_8model'
name='FINAL_flair_all_trainsz_'+str(trainsize)
print(len(savelist))
saveResults(savelist, name=name)

9


In [38]:
len(savelist)

9

In [39]:
trainsize

100

### DONE

In [None]:
# Code for combining results from 2 training times into 1 list etc.

In [None]:
# add the 2 last
name='all_flair_512LSTM_15ep_model'
name = 'FINAL_flair_all_trainsz_'+str(trainsize)
te = loadResults(name)

In [145]:
#newlist = te+savelist
newlist = te
len(newlist)

9

In [146]:
# cut doubles away
# newlist = newlist[5:]  

In [147]:
# save combined
name='FINAL_flair_all_trainsz_'+str(trainsize)
print(len(newlist))
saveResults(newlist, name=name)

9


In [None]:
# move list item 5 to end
# te[5]
# te.append(te.pop(5))


In [41]:
len(savelist)

10

In [69]:
te = savelist[0:5] + [savelist[9]] + savelist[6:9]
len(te)

9

In [131]:
for i in savelist:
    print(i['model'])

Flair
bert-base-uncased


In [139]:
for i in te:
    print(i['model'])

glove
fasttext web-crawl
fasttext news/wiki
en-twitter
elmo
Flair
bert-base-uncased
BytePairEmbedding
gpt-1


In [75]:
savelist[9]

{'model': 'Flair', 'labels': 0       1
 1       1
 2       1
 3       1
 4       1
        ..
 2995    1
 2996    1
 2997    1
 2998    1
 2999    1
 Name: label, Length: 3000, dtype: int64, 'confidence': 0       0.876461
 1       0.711614
 2       0.925566
 3       0.895800
 4       0.885994
           ...   
 2995    0.751575
 2996    0.940553
 2997    0.902379
 2998    0.835532
 2999    0.964116
 Name: confidence, Length: 3000, dtype: float64, 'traintime': 125.29585218429565, 'predtime3k': 1589372012.3927588, 'modeldesc': '512LSTM_15epoch_non-bi'}

In [130]:
savelist[0]['model']='Flair'

In [132]:
trainsize

1000

In [140]:
te[5]

{'model': 'Flair', 'labels': 0       1
 1       1
 2       1
 3       1
 4       1
        ..
 2995    1
 2996    1
 2997    1
 2998    1
 2999    1
 Name: label, Length: 3000, dtype: int64, 'confidence': 0       0.805744
 1       0.546344
 2       0.802081
 3       0.714745
 4       0.753530
           ...   
 2995    0.759887
 2996    0.893057
 2997    0.859725
 2998    0.813483
 2999    0.864573
 Name: confidence, Length: 3000, dtype: float64, 'traintime': 81.63569831848145, 'predtime3k': 1589378399.4414687, 'modeldesc': '512LSTM_15epoch_non-bi'}

In [138]:
te[5] = savelist[0]

In [142]:
te[6] = savelist[1]

In [149]:
for i in newlist:
    print(i['model'])

glove
fasttext web-crawl
fasttext news/wiki
en-twitter
elmo
Flair
bert-base-uncased
BytePairEmbedding
gpt-1


In [150]:
savelist[1]

{'model': 'bert-base-uncased', 'labels': 0       1
 1       1
 2       1
 3       1
 4       1
        ..
 2995    1
 2996    2
 2997    2
 2998    2
 2999    1
 Name: label, Length: 3000, dtype: int64, 'confidence': 0       0.996391
 1       0.995925
 2       0.740776
 3       0.996946
 4       0.999737
           ...   
 2995    0.997924
 2996    0.775749
 2997    0.926272
 2998    0.993099
 2999    0.947747
 Name: confidence, Length: 3000, dtype: float64, 'traintime': 273.37105560302734, 'predtime3k': 1589378548.903775, 'modeldesc': '512LSTM_15epoch_non-bi'}