In [1]:
from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.optimization import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
import pandas as pd

07/01/2020 17:23:34 - INFO - transformers.file_utils -   PyTorch version 1.5.0+cu92 available.
07/01/2020 17:23:44 - INFO - transformers.file_utils -   TensorFlow version 2.1.0 available.


In [2]:
# Seed the run (seed=42) before anything else is created.
set_all_seeds(seed=42)
# CUDA is requested here, but the log recorded for this cell reports
# `device: cpu n_gpu: 0`, i.e. that run executed on the CPU.
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
# NOTE(review): the training run recorded at the end of this notebook aborted with a
# CPU out-of-memory RuntimeError while using this batch size together with
# max_seq_len=512 -- consider lowering one of the two before re-running.
batch_size = 32 # larger batch sizes might use too much computing power in Colab
evaluate_every = 100

07/01/2020 17:23:48 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None


In [3]:
lang_model = "xlnet-base-cased"
# The checkpoint is the *cased* XLNet variant, so the input text must keep its
# original casing; lower-casing it would feed the model tokens that do not match
# the casing its vocabulary was built with.
do_lower_case = False

tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=do_lower_case)

07/01/2020 17:23:52 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'XLNetTokenizer'
07/01/2020 17:24:02 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at /home/ubuntu/.cache/torch/transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8


In [4]:
# Debate topics keyed by their integer class index (0-20); the position of each
# name in the list below is its index.
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))
# Reverse lookup: topic name -> class index.
topics_name_to_index_map = {
    name: index for index, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Call ``strip_short`` on ``text`` with ``minsize=4`` and return its result.

    NOTE(review): ``strip_short`` is not imported in any visible cell -- presumably
    it is ``gensim.parsing.preprocessing.strip_short``; confirm and add the import,
    otherwise calling this function raises ``NameError``.
    """
    return strip_short(text, minsize=4)


def preprocess_text(text):
    """Run ``text`` through the cleaning filters below via ``preprocess_string``.

    The filters are handed over in this order: lower-casing, whitespace
    collapsing, tag stripping, punctuation stripping, non-alphanumeric stripping,
    digit stripping, and finally ``strip_short2``.

    NOTE(review): ``preprocess_string`` and the ``strip_*`` filters are not imported
    in any visible cell -- presumably they come from
    ``gensim.parsing.preprocessing``; confirm and add the import.
    """
    pipeline = [
        lambda s: s.lower(),
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    ]
    return preprocess_string(text, pipeline)

def preprocess(topic):
    """Return the primary topic label from a raw ``topic`` field.

    The field may list several topics separated by ``'|'``; only the first one
    is kept. Surrounding whitespace is removed from both the whole field and the
    selected label.

    Args:
        topic: raw topic string, e.g. ``" Defence | Transport "``.

    Returns:
        The first topic with leading/trailing whitespace stripped.
    """
    # str.split returns the whole string as its only element when no '|' is
    # present, so a single expression covers both the single- and multi-topic case.
    return topic.strip().split('|', 1)[0].strip()

In [5]:
class CustomTextClassificationProcessor(TextClassificationProcessor):
    """Text-classification processor that reads the debate CSV files.

    Only ``file_to_dicts`` is overridden; everything else is inherited from
    ``TextClassificationProcessor``.
    """

    # we need to overwrite this function from the parent class
    def file_to_dicts(self, file: str) -> [dict]:
        """Load a debate CSV and turn it into one dict per usable row.

        The file must provide a ``topic`` and a ``transcript`` column. Rows are
        discarded when either value is missing, when the topic is exactly
        ``'admin'``, or when the transcript has fewer than 10
        whitespace-separated words. Multi-topic fields are reduced to their
        first topic by ``preprocess``.

        Args:
            file: path of the CSV file to read.

        Returns:
            A list of dicts with the keys ``"text_classification_label"`` (the
            topic) and ``"text"`` (the transcript).
        """
        df = pd.read_csv(file)

        # A missing topic or transcript would crash the steps below (no .strip()
        # / len() on NaN), and such rows carry no usable sample anyway.
        df = df.dropna(subset=["topic", "transcript"])

        n_words = df["transcript"].str.split().str.len()
        keep = (df["topic"] != "admin") & (n_words >= 10)

        # Select and rename by column *name* so an extra column or a different
        # column order in the CSV cannot silently swap label and text.
        df = df.loc[keep, ["topic", "transcript"]]
        df = df.assign(topic=df["topic"].map(preprocess))
        df = df.rename(columns={"topic": "text_classification_label",
                                "transcript": "text"})

        return df.to_dict(orient="records")

In [8]:
# Labels in our data set, in class-index order. They are derived from
# `topics_index_to_name_map` so the label list and the topic map cannot drift apart.
label_list = [topics_index_to_name_map[i] for i in sorted(topics_index_to_name_map)]

metric = "f1_macro" # desired metric for evaluation

processor = CustomTextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=512, # cap on tokens per sample (the model loaded here is XLNet, not BERT)
                                            data_dir='data/', 
                                            label_list=label_list,
                                            metric=metric,
                                            quote_char='"',
                                            multilabel=False,
                                            train_filename="2012_debate.csv",
                                            dev_filename=None,
                                            test_filename="2013_debate.csv",
                                            dev_split=0.1 # this will extract 10% of the train set to create a dev set
                                            )

In [None]:
# Creating the silo is what loads and preprocesses the train/dev/test sets (see
# the log recorded for this cell); in that run the 2012 file alone took about
# 16 minutes, so expect this cell to be slow.
data_silo = DataSilo(batch_size=batch_size, processor=processor)

07/01/2020 20:53:27 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
07/01/2020 20:53:27 - INFO - farm.data_handler.data_silo -   Loading train set from: data/2012_debate.csv 
07/01/2020 20:53:30 - INFO - farm.data_handler.data_silo -   Got ya 31 parallel workers to convert 1792 dictionaries to pytorch datasets (chunksize = 12)...
07/01/2020 20:53:30 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
07/01/2020 20:53:30 - INFO - farm.data_handler.data_silo -   /|\  /|\  /|\  /|\  /|\  /|\  /|\  /w\  /w\  /|\  /|\  /|\  /|\  /|\  /|\  /w\  /w\  /|\  /w\  /w\  /|\  /w\  /w\  /|\  /w\  /w\  /|\  /w\  /w\  /w\  /w\
07/01/2020 20:53:30 - INFO - farm.data_handler.data_silo -   /'\

07/01/2020 20:53:42 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: 4-0
Clear Text: 
 	text_classification_label: Employment and training
 	text: Before I call the first speaker, may I say to both Front Benchers that a large number of Back Benchers have signified that they wish to take part in the debate? I ask them for some time constraint in their opening speeches to allow as many Back Benchers as possible an opportunity to speak.  I beg to move,  That this House notes with concern that unemployment has risen to its highest level for 17 ye

Preprocessing Dataset data/2012_debate.csv: 100%|██████████| 1792/1792 [16:02<00:00,  1.86 Dicts/s]
07/01/2020 21:09:32 - INFO - farm.data_handler.data_silo -   Loading dev set as a slice of train set
07/01/2020 21:09:32 - INFO - farm.data_handler.data_silo -   Took 184 samples out of train set to create dev set (dev split is roughly 0.1)
07/01/2020 21:09:32 - INFO - farm.data_handler.data_silo -   Loading test set from: data/2013_debate.csv
07/01/2020 21:09:35 - INFO - farm.data_handler.data_silo -   Got ya 31 parallel workers to convert 1901 dictionaries to pytorch datasets (chunksize = 13)...
07/01/2020 21:09:35 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
07/01/2020 21:09:35 - INFO - farm.data_handler.data_silo -   /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /w\  /|\  /w\  /|\  /w\  /w\  /w\  /|\  /w\  /

07/01/2020 21:09:36 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: 3-0
Clear Text: 
 	text_classification_label: Asylum, immigration and nationality
 	text: 4.  What assessment her Department has made of the most recent statistics on net migration. [135581]  7.  What assessment her Department has made of the most recent statistics on net migration. [135585]  12.  What assessment her Department has made of the most recent statistics on net migration. [135591]   Net migration fell by a quarter in the year to March 2012. This shows that our to

Preprocessing Dataset data/2013_debate.csv:   2%|▏         | 39/1901 [01:58<1:18:42,  2.54s/ Dicts]

In [47]:
# load the pretrained language model named by `lang_model` ("xlnet-base-cased")
language_model = LanguageModel.load(lang_model)
# prediction head for classifying debate transcripts into the topics in `label_list`;
# the class weights are computed by the data silo for the "text_classification" task
prediction_head = TextClassificationHead(
    class_weights=data_silo.calculate_class_weights(task_name="text_classification"), num_labels=len(label_list))

# combine the language model and the single prediction head into one model
model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

# NOTE(review): the log recorded below this cell reports 'lr': 3e-05, while the code
# passes 2e-5 -- the stored output appears to come from an earlier version of the cell.
model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

07/01/2020 16:53:10 - INFO - transformers.modeling_utils -   loading weights file https://cdn.huggingface.co/xlnet-base-cased-pytorch_model.bin from cache at /home/ubuntu/.cache/torch/transformers/33d6135fea0154c088449506a4c5f9553cb59b6fd040138417a7033af64bb8f9.7eac4fe898a021204e63c88c00ea68c60443c57f94b4bc3c02adbde6465745ac
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
 'Asylum, immigration and nationality' 'Business, industry and consumers'
 'Communities and families' 'Crime, civil law, justice and rights'
 'Culture, media and sport' 'Defence' 'Economy and finance' 'Education'
 'Employment and training' 'Energy and environment' 'European Union'
 'Health services and medicine' 'Housing and planning'
 'International affairs' 'Parliament, government and politics'
 'Science and technology' 'Social security and pensions' 'Social services'
 'Transport' 'Others'], y=['Agriculture, animals, food and rural affairs', 'Asylum, immig

07/01/2020 16:53:13 - INFO - farm.modeling.prediction_head -   Using class weights for task 'text_classification': [ 1.9890109   5.5408163   0.7835498   0.825228    0.36938775  1.5210084
  1.0206767   0.6464286   0.91260505  0.90199333  0.8431677   4.3095236
  0.61564624  8.619047    0.69260204  0.3447619   7.757143   11.081633
 15.514286    0.95767194  1.1081632 ]
07/01/2020 16:53:13 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 3e-05}'
07/01/2020 16:53:13 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
07/01/2020 16:53:13 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 5.1000000000000005, 'num_training_steps': 51}'


In [48]:
# Hand the processor's task definitions to the model's prediction heads, with
# labels marked as required.
model.connect_heads_with_processor(processor.tasks, require_labels=True)

In [49]:
# Bundle model, optimizer/schedule, data and run settings into the trainer.
# `evaluate_every` comes from the setup cell, where it is set to 100.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    lr_schedule=lr_schedule,
    data_silo=data_silo,
    epochs=n_epochs,
    evaluate_every=evaluate_every,
    n_gpu=n_gpu,
    device=device,
)

In [50]:
# Start training.
# NOTE(review): the output recorded for this cell ends in
# "RuntimeError: ... DefaultCPUAllocator: can't allocate memory" on the very first
# batch (the run was on CPU) -- the notebook as saved did not train successfully.
trainer.train()

07/01/2020 16:53:13 - INFO - farm.train -   
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `






  0%|          | 0/51 [00:00<?, ?it/s][A[A[A[A[A




Train epoch 0/0 (Cur. train loss: 0.0000):   0%|          | 0/51 [00:00<?, ?it/s][A[A[A[A[A

RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 3221225472 bytes. Error code 12 (Cannot allocate memory)
