## Import Libraries

In [1]:
'''basics'''
import os
import sys

!pip install pandas
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
device_name= tf.test.gpu_device_name()

print('Found GPU at: {}'.format(device_name))

Found GPU at: 


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)


'Quadro M4000'

## Import data and holdout data for prediction

In [4]:
with open('../../data/processed/encoded_labels/technical_team.csv', 'rb') as handle:
    df = pd.read_csv(handle)
          
df['all_text_clean'] = df['all_text_clean']  
df = df.drop(['technical_team'], axis=1)    
df.head()

Unnamed: 0,PIMS_ID,all_text_clean,all_text_clean_spacy,chemicals_and_waste_programme,climate_change_adaptation_programme,climate_change_mitigation_programme,climate_strategies_and_policy_programme,ecosystems_and_biodiversity_programme,energy_program_of_climate_change_mitigation,oceans_and_water_programme
0,1584,this programme will contribute to the protecti...,programme contribute protection biological div...,0,0,0,0,1,0,0.0
1,1878,the project contributes to the number of gef p...,project contribute number GEF project support ...,0,0,0,0,1,0,0.0
2,2006,the cape floristic region cfr biodiversity hot...,Cape Floristic Region CFR Biodiversity Hotspot...,0,0,0,0,1,0,0.0
3,2047,the niger delta region of nigeria covering an ...,Niger Delta region Nigeria cover area km large...,0,0,0,0,1,0,0.0
4,2204,the government of south africa has developed t...,Government South Africa develop Cape Action Pe...,0,0,0,0,1,0,0.0


In [5]:
print('average text length: ', df.all_text_clean.str.split().str.len().mean())
print('stdev text length: ', df.all_text_clean.str.split().str.len().std())
print('max text length: ', df.all_text_clean.str.split().str.len().max())

average text length:  531.0838206627681
stdev text length:  497.18326286157395
max text length:  4931


In [6]:
cols = df.columns
label_cols = list(cols[3:])
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['chemicals_and_waste_programme', 'climate_change_adaptation_programme', 'climate_change_mitigation_programme', 'climate_strategies_and_policy_programme', 'ecosystems_and_biodiversity_programme', 'energy_program_of_climate_change_mitigation', 'oceans_and_water_programme']


In [7]:
print('Count of 1 per label: \n', df[label_cols].sum(), '\n') # Label counts, may need to downsample or upsample
print('Count of 0 per label: \n', df[label_cols].eq(0).sum())

Count of 1 per label: 
 chemicals_and_waste_programme                   55.0
climate_change_adaptation_programme            180.0
climate_change_mitigation_programme              6.0
climate_strategies_and_policy_programme        222.0
ecosystems_and_biodiversity_programme          312.0
energy_program_of_climate_change_mitigation    184.0
oceans_and_water_programme                      66.0
dtype: float64 

Count of 0 per label: 
 chemicals_and_waste_programme                   971
climate_change_adaptation_programme             846
climate_change_mitigation_programme            1020
climate_strategies_and_policy_programme         804
ecosystems_and_biodiversity_programme           714
energy_program_of_climate_change_mitigation     842
oceans_and_water_programme                      958
dtype: int64


In [8]:
df = df.sample(frac=1).reset_index(drop=True) #shuffle rows

In [9]:
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,PIMS_ID,all_text_clean,all_text_clean_spacy,chemicals_and_waste_programme,climate_change_adaptation_programme,climate_change_mitigation_programme,climate_strategies_and_policy_programme,ecosystems_and_biodiversity_programme,energy_program_of_climate_change_mitigation,oceans_and_water_programme,one_hot_labels
0,6252,to strengthen the integration of biodiversity ...,strengthen integration biodiversity conservati...,0,0,0,0,1,0,0.0,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
1,5270,malawi is one of the least electrified countri...,Malawi electrify country SADC region average c...,0,0,0,0,0,1,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,4922,located in central africa burundi is a landloc...,locate Central Africa Burundi landlocked count...,0,1,0,0,0,0,0.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,5851,morocco is an exemplary developing country in ...,Morocco exemplary develop country term complia...,0,0,0,1,0,0,0.0,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
4,5363,sumatra is the sixth largest island in the wor...,Sumatra large island world characterize Bukit ...,0,0,0,0,1,0,0.0,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"


In [10]:
labels = list(df.one_hot_labels.values)
df.dropna()
texts = list(df.all_text_clean.values)




In [11]:
from transformers import LongformerModel
from transformers import LongformerTokenizerFast

config = LongformerConfig.from_pretrained('allenai/longformer-base-4096', num_labels=num_labels) 

# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'


tokenizer = RobertaTokenizer.from_pretrained('roberta-base')



In [12]:
#tokenizer.model_max_length = model.config.max_position_embeddings
print (type(texts))
encodings = tokenizer.batch_encode_plus(texts, max_length=5000,truncation=True, pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

<class 'list'>




tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


In [13]:
input_ids = encodings['input_ids'] # tokenized and encoded sentences
attention_masks = encodings['attention_mask'] # attention masks

In [14]:
# Identifying indices of 'one_hot_labels' entries that only occur once - this will allow us to stratify split our training data later
label_counts = df.one_hot_labels.astype(str).value_counts()
one_freq = label_counts[label_counts==1].keys()
one_freq_idxs = sorted(list(df[df.one_hot_labels.astype(str).isin(one_freq)].index), reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [694, 100]


In [15]:
print(len(input_ids))
print(len(one_freq_idxs))

1026
2


In [16]:
# Gathering single instance inputs to force into the training set after stratified split
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

In [21]:
# Use train_test_split to split our data into train and validation sets
!pip3 install sklearn
import sklearn
from sklearn.model_selection import train_test_split

train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(input_ids, labels,attention_masks,
                                                            random_state=2020, test_size=0.20, stratify = labels)

# Add one frequency data to train data
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)


Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-0.23.2-cp36-cp36m-manylinux1_x86_64.whl (6.8 MB)
[K     |################################| 6.8 MB 15.4 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Collecting scipy>=0.19.1
  Downloading scipy-1.5.4-cp36-cp36m-manylinux1_x86_64.whl (25.9 MB)
[K     |################################| 25.9 MB 11.4 MB/s eta 0:00:01
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=2397 sha256=931ce384df556a9d054f32d13992e35fe60719f923a85ba7d2cfbad93a15675e
  Stored in directory: /root/.cache/pip/wheels/23/9d/42/5ec745cbbb17517000a53cecc49d6a865450d1f5cb16dc8a9c
Successfully built sklearn
Installing collected packages: threadpoolctl, scipy, scikit-learn, sklearn
Successfully installed scikit-learn-0.

In [22]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 128

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [23]:
torch.save(validation_dataloader,'validation_data_loader')
torch.save(train_dataloader,'train_data_loader')

In [24]:
# Load model, the pretrained model will include a single linear classification layer on top for classification. 
model = LongformerModel.from_pretrained('allenai/longformer-base-4096', config=config)
model.cuda()

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=597257159.0), HTML(value='')))




LongformerModel(
  (embeddings): LongformerEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): LongformerEncoder(
    (layer): ModuleList(
      (0): LongformerLayer(
        (attention): LongformerAttention(
          (self): LongformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (query_global): Linear(in_features=768, out_features=768, bias=True)
            (key_global): Linear(in_features=768, out_features=768, bias=True)
            (value_global): Linear(in_features=768, out_features=768, bias=True)
          )
          (o

In [25]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [26]:
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

## Train Model

In [27]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    # Training

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0 #running loss
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        print(device)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # # Forward pass for multiclass classification
        # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # loss = outputs[0]
        # logits = outputs[1]

        # Forward pass for multilabel classification
        logits = model(b_input_ids, attention_mask=b_input_mask)[0]
        loss_func = BCEWithLogitsLoss() 
        loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
        # loss_func = BCELoss() 
        # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
        train_loss_set.append(loss.item())    

        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # scheduler.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    ###############################################################################

      # Validation

      # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

      # Variables to gather full output
    logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

      # Predict
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            outs = model(b_input_ids, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred)

            b_logit_pred = b_logit_pred.detach().cpu().numpy()
            pred_label = pred_label.to('cpu').numpy()
            b_labels = b_labels.to('cpu').numpy()

        tokenized_texts.append(b_input_ids)
        logit_preds.append(b_logit_pred)
        true_labels.append(b_labels)
        pred_labels.append(pred_label)

    # Flatten outputs
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

      # Calculate Accuracy
    threshold = 0.50
    pred_bools = [pl>threshold for pl in pred_labels]
    true_bools = [tl==1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

cuda





RuntimeError: CUDA out of memory. Tried to allocate 1.88 GiB (GPU 0; 7.94 GiB total capacity; 6.21 GiB already allocated; 1.16 GiB free; 6.27 GiB reserved in total by PyTorch)

In [None]:
torch.save(model.state_dict(), 'Longformer_NCE')