In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Import libraries
import os
import re
import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tokenizers
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from transformers import *

In [2]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# Walk the read-only input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tf-roberta/pretrained-roberta-base.h5
/kaggle/input/tf-roberta/merges-roberta-base.txt
/kaggle/input/tf-roberta/config-roberta-base.json
/kaggle/input/tf-roberta/vocab-roberta-base.json
/kaggle/input/tweet-sentiment-extraction/sample_submission.csv
/kaggle/input/tweet-sentiment-extraction/test.csv
/kaggle/input/tweet-sentiment-extraction/train.csv


# Data Pre-processing and Transformation

In [3]:
# Load the competition's training and test splits into DataFrames.
train_csv_path = "/kaggle/input/tweet-sentiment-extraction/train.csv"
test_csv_path = "/kaggle/input/tweet-sentiment-extraction/test.csv"
train, test = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [4]:
# Report the number of rows in each split.
print(f"Training set has {len(train)} data points")
print(f"Testing set has {len(test)} data points")

Training set has 27481 data points
Testing set has 3534 data points


In [5]:
# Preview the first rows of the training frame (textID, text, selected_text, sentiment).
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
# Preview the first rows of the test frame; unlike train it has no selected_text column.
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


### Check for NaN values

In [7]:
# Count missing values per column in the training frame.
train.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [8]:
# Count missing values per column in the test frame.
test.isna().sum()

textID       0
text         0
sentiment    0
dtype: int64

In [9]:
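# Only one training row contains NaN values, so it can simply be dropped.
# The drop is not done here: the TweetDataset class below calls dropna() on
# the frame it receives, so `train` itself still contains the NaN row.
# The equivalent standalone step would be:
# train = train.dropna(axis=0).reset_index(drop=True)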

In [10]:
# Re-check the split sizes and the per-column NaN counts of the training frame.
for split_name, frame in (("Training", train), ("Testing", test)):
    print("{} set has {} data points".format(split_name, len(frame)))
train.isna().sum()

Training set has 27481 data points
Testing set has 3534 data points


textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

### Removing punctuations & stopwords, or not? 

In [11]:
# Checking if punctuation appears in selected_text.
# str.extract returns the first run of punctuation characters found in each
# selected_text; rows with no punctuation (or a NaN selected_text) yield NaN.
selected_text_has_punctuation = train.selected_text.str.extract(
                                                        r'([{}]+)'.format(
                                                            re.escape(
                                                                string.punctuation)))
# number of selected_text WITHOUT any punctuation (NaN means "no match");
# every other row contains at least one punctuation character
selected_text_has_punctuation.isna().sum() 

0    10982
dtype: int64

In [12]:
# Observe some tweets whose selected_text contains punctuation:
# dropna() keeps only the rows where the extraction found a match, and their
# index labels are used to look the full rows up in `train`.
train.loc[selected_text_has_punctuation.dropna().index].head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive
10,2339a9b08b,"as much as i love to be hopeful, i reckon the...","as much as i love to be hopeful, i reckon the ...",neutral


- Punctuation appears in a large share of the `selected_text` examples, so I will not strip punctuation from this dataset.
- Stopwords need to be preserved as well: as the `neutral` examples above show, the tweet *text* is often copied as-is into *selected_text*.

### Deciding max *text* length

In [13]:
# Longest tweet in the training set, measured in characters (str.len counts characters, not tokens).
train.text.str.len().max()

141.0

In [14]:
# Fixed sequence length passed as `max_len` to TweetDataset and TransformerQA below.
# NOTE(review): 148 sits a little above the 141-character maximum measured above,
# but max_len is counted in token ids (tweet ids plus 5 extra ids), not characters --
# confirm the intended margin.
MAX_LEN = 148

### Tokenizer

The pretrained RoBERTa model and tokenizer come from the Hugging Face [transformers](https://huggingface.co/transformers/main_classes/model.html?highlight=save_pretrained) library. They can be downloaded with the `from_pretrained()` method or attached to a Kaggle kernel from [here](https://www.kaggle.com/cdeotte/tf-roberta).

In [15]:
class TweetDataset:
    """Convert a tweet DataFrame into fixed-length model input arrays.

    Every tweet is laid out as
    ``[0] + tweet_token_ids + [2, 2] + [sentiment_token_id] + [2]`` and padded
    with ``1`` up to ``max_len``.
    NOTE(review): ids 0 / 2 / 1 are presumably RoBERTa's ``<s>`` / ``</s>`` /
    ``<pad>`` -- confirm against the vocabulary file in use.

    Rows containing NaN in any column are dropped and the index is reset, so
    ``self.data`` (not the frame passed in) is the frame whose row order
    matches the arrays returned by calling the instance.

    Args:
        data_df: DataFrame with ``text`` and ``sentiment`` columns, plus
            ``selected_text`` when ``train`` is truthy.
        tokenizer: object exposing ``encode(str).ids`` and ``decode(list)``.
        train: when truthy, start/end label arrays are produced as well.
        max_len: length of every returned sequence.
    """

    # Ids added around the tweet tokens: [0], [2, 2], sentiment id, [2].
    _N_EXTRA_TOKENS = 5

    def __init__(self, data_df, tokenizer, train=True, max_len=96):
        self.data = data_df.dropna(axis=0).reset_index(drop=True)
        self.is_train = bool(train)
        # First token id of each sentiment word; used as the "question" part
        # of the input sequence.
        self.sentiment_tokens = {
            label: tokenizer.encode(label).ids[0]
            for label in ('positive', 'negative', 'neutral')
        }
        self.tokenizer = tokenizer
        self.max_len = max_len

    def ByteLevelBPEPreprocessor(self, text, selected_text, sentiment):
        """Return Input IDs, Attention Mask and (for training) Start/End tokens.

        Args:
            text: raw tweet text; whitespace is collapsed and a single leading
                space is prepended before tokenising.
            selected_text: the labelled span (ignored unless ``self.is_train``).
            sentiment: key of ``self.sentiment_tokens``.

        Returns:
            ``(input_ids, attention_mask)`` for inference, or
            ``(input_ids, attention_mask, start_tokens, end_tokens)`` for
            training.  All arrays are int32 of length ``max_len``.
            ``start_tokens`` / ``end_tokens`` are one-hot over positions and
            stay all-zero when the span cannot be located in the text.

        Raises:
            KeyError: if ``sentiment`` is not a known label.
            ValueError: if ``max_len`` is smaller than the 5 extra ids.
        """
        text = " " + " ".join(text.split())
        s_tok = self.sentiment_tokens[sentiment]

        # Keep room for the extra ids; without this truncation an over-long
        # tweet made the slice assignment below fail with a shape error.
        budget = max(self.max_len - self._N_EXTRA_TOKENS, 0)
        token_ids = list(self.tokenizer.encode(text).ids)[:budget]
        n_used = len(token_ids) + self._N_EXTRA_TOKENS

        # Get InputIDs (padding id 1 everywhere past the used prefix)
        input_ids = np.ones((self.max_len), dtype='int32')
        input_ids[:n_used] = [0] + token_ids + [2, 2] + [s_tok] + [2]

        # Get Attention mask (1 over the used prefix, 0 over padding)
        attention_mask = np.zeros((self.max_len), dtype='int32')
        attention_mask[:n_used] = 1

        if not self.is_train:
            return input_ids, attention_mask

        start_tokens = np.zeros((self.max_len), dtype='int32')
        end_tokens = np.zeros((self.max_len), dtype='int32')

        selected_text = " ".join(selected_text.split())
        idx = text.find(selected_text)
        # Only label when the span is really present; find() returning -1 (or
        # an empty span) previously led to negative indexing below.
        if selected_text and idx >= 0:
            # Character-level mask of the selected span inside `text`.
            char_tokens = np.zeros((len(text)))
            char_tokens[idx:idx + len(selected_text)] = 1
            # Include the space preceding the span, since the decoded
            # sub-words carry their leading space.
            if idx > 0 and text[idx - 1] == ' ':
                char_tokens[idx - 1] = 1

            # Walk the sub-words, tracking the character offset, and collect
            # every token that overlaps the selected characters.
            ptr_idx = 0
            label_idx = []
            for i, enc_id in enumerate(token_ids):
                sub_word = self.tokenizer.decode([enc_id])
                if char_tokens[ptr_idx:ptr_idx + len(sub_word)].sum() > 0:
                    label_idx.append(i)
                ptr_idx += len(sub_word)

            if label_idx:
                # + 1 because of the id prepended in front of the tweet tokens
                start_tokens[label_idx[0] + 1] = 1
                end_tokens[label_idx[-1] + 1] = 1

        return input_ids, attention_mask, start_tokens, end_tokens

    def __call__(self):
        """Preprocess every row of ``self.data``.

        Returns:
            ``(input_ids, attention_mask, token_type_ids)`` for inference, or
            the same followed by ``(start_tokens, end_tokens)`` for training.
            Each array is int32 with shape ``(len(self.data), max_len)``;
            ``token_type_ids`` is all zeros.
        """
        data_len = len(self.data)
        shape = (data_len, self.max_len)
        input_ids = np.ones(shape, dtype='int32')
        attention_mask = np.zeros(shape, dtype='int32')
        token_type_ids = np.zeros(shape, dtype='int32')
        if self.is_train:
            start_tokens = np.zeros(shape, dtype='int32')
            end_tokens = np.zeros(shape, dtype='int32')
            selected = self.data['selected_text']
        else:
            selected = [None] * data_len

        # Iterate over plain column values instead of iterrows(), which
        # builds a Series per row.
        rows = zip(self.data['text'], selected, self.data['sentiment'])
        for i, (text, selected_text, sentiment) in enumerate(
                tqdm(rows, total=data_len)):
            out = self.ByteLevelBPEPreprocessor(text, selected_text, sentiment)
            if self.is_train:
                (input_ids[i], attention_mask[i],
                 start_tokens[i], end_tokens[i]) = out
            else:
                input_ids[i], attention_mask[i] = out

        if self.is_train:
            return input_ids, attention_mask, token_type_ids, start_tokens, end_tokens
        return input_ids, attention_mask, token_type_ids

In [16]:
class TransformerQA:
    """RoBERTa-based span extractor trained with stratified K-fold CV.

    Args:
        max_len: sequence length of the model inputs.
        model_path: directory holding ``config-roberta-base.json`` and
            ``pretrained-roberta-base.h5``.
        tokenizer: object exposing ``encode(str).ids`` and ``decode(list)``;
            used to turn predicted token positions back into text.
        fit: accepted for backward compatibility; not used.
    """

    def __init__(self, max_len, model_path, tokenizer, fit=True):
        self.max_len = max_len
        self.model_path = model_path
        self.tokenizer = tokenizer

    @staticmethod
    def _weights_path(VER, fold):
        """Return the weights file name used for a given version and fold."""
        return '{}-roberta-{}.h5'.format(VER, fold)

    @staticmethod
    def _span_head(hidden_states):
        """Map per-token hidden states to a softmax over token positions."""
        x = tf.keras.layers.Dropout(0.1)(hidden_states)
        # One filter with kernel size 1: a single score per token position.
        x = tf.keras.layers.Conv1D(1, 1)(x)
        x = tf.keras.layers.Flatten()(x)
        return tf.keras.layers.Activation('softmax')(x)

    def roberta_model(self):
        """Return a compiled RoBERTa base model with a custom question answering head.

        The model takes ``[input_ids, attention_mask, token_type_ids]`` (each
        int32 of length ``max_len``) and outputs two softmax distributions
        over positions: one for the span start and one for the span end.
        Pretrained weights and config are loaded from ``self.model_path``.
        """
        input_ids = tf.keras.layers.Input((self.max_len,), dtype=tf.int32)
        attention_mask = tf.keras.layers.Input((self.max_len,), dtype=tf.int32)
        token_type_ids = tf.keras.layers.Input((self.max_len,), dtype=tf.int32)

        config = RobertaConfig.from_pretrained(
            os.path.join(self.model_path, 'config-roberta-base.json')
        )
        roberta_model = TFRobertaModel.from_pretrained(
            os.path.join(self.model_path, 'pretrained-roberta-base.h5'),
            config=config
        )
        x = roberta_model(inputs=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids)

        # x[0] is fed to both heads; presumably the per-token hidden states
        # of the last layer -- confirm against the transformers documentation.
        x1 = self._span_head(x[0])
        x2 = self._span_head(x[0])

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_mask, token_type_ids],
            outputs=[x1, x2]
        )
        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer)

        return model

    def jaccard(self, str1, str2):
        """Return the word-level Jaccard similarity between two strings.

        Both strings are lower-cased and split on whitespace.  Two empty
        strings score 0.5.
        """
        a = set(str1.lower().split())
        b = set(str2.lower().split())
        if len(a) == 0 and len(b) == 0:
            return 0.5
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))

    def get_model_selected_text(self, data_df, preds_start, preds_end):
        """Return a list of 'selected_text' built from predicted start/end tokens.

        Args:
            data_df: DataFrame with a ``text`` column; rows are matched to the
                predictions by position, so any index is accepted.
            preds_start: array of per-position start scores, one row per tweet.
            preds_end: array of per-position end scores, one row per tweet.

        Returns:
            One string per row.  When the predicted start lies after the
            predicted end, the original tweet text is returned unchanged.
        """
        st_list = []
        for k, raw_text in enumerate(data_df['text'].tolist()):
            idx_start = int(np.argmax(preds_start[k,]))
            idx_end = int(np.argmax(preds_end[k,]))
            if idx_start > idx_end:
                st = raw_text
            else:
                text = " " + " ".join(raw_text.split())
                enc = self.tokenizer.encode(text)
                # Positions are shifted by one because of the id prepended to
                # the tweet tokens.  Clamp at 0 so that a predicted start at
                # position 0 does not turn into a from-the-end slice (-1).
                first = max(idx_start - 1, 0)
                st = self.tokenizer.decode(enc.ids[first:idx_end])
            st_list.append(st)
        return st_list

    def fit(self, train_df, input_ids, attention_mask,
            token_type_ids, start_tokens, end_tokens,
            stratify_y, VER='v0', verbose=1,
            n_splits=5, epochs=3, batch_size=32, random_state=42):
        """Fit one RoBERTa model per fold and report out-of-fold Jaccard scores.

        For every fold the best weights (lowest ``val_loss``) are saved to
        ``'{VER}-roberta-{fold}.h5'`` in the working directory, reloaded, and
        used to predict the held-out rows.

        Args:
            train_df: DataFrame with ``text`` and ``selected_text`` columns,
                row-aligned (by position) with the arrays below.
            input_ids, attention_mask, token_type_ids: model input arrays.
            start_tokens, end_tokens: one-hot label arrays.
            stratify_y: labels used to stratify the folds.
            VER: prefix of the saved weight files.
            verbose: verbosity forwarded to Keras.
            n_splits: number of folds.
            epochs: training epochs per fold.
            batch_size: training batch size.
            random_state: seed for the fold shuffling.

        Returns:
            List with the mean validation Jaccard score of each fold.
        """
        fold_scores = []
        oof_start = np.zeros((input_ids.shape[0], self.max_len))
        oof_end = np.zeros((input_ids.shape[0], self.max_len))
        skf = StratifiedKFold(n_splits=n_splits,
                              shuffle=True,
                              random_state=random_state)
        for fold, (idxT, idxV) in enumerate(skf.split(input_ids, stratify_y)):
            print('Training FOLD {}:'.format(fold+1))
            K.clear_session()
            model = self.roberta_model()
            weights_path = self._weights_path(VER, fold)
            sv = tf.keras.callbacks.ModelCheckpoint(
                weights_path,
                monitor='val_loss',
                verbose=verbose,
                save_best_only=True,
                save_weights_only=True,
                mode='auto',
                save_freq='epoch'
            )
            train_inputs = [input_ids[idxT,],
                            attention_mask[idxT,],
                            token_type_ids[idxT,]]
            val_inputs = [input_ids[idxV,],
                          attention_mask[idxV,],
                          token_type_ids[idxV,]]
            model.fit(train_inputs,
                      [start_tokens[idxT,], end_tokens[idxT,]],
                      epochs=epochs,
                      batch_size=batch_size,
                      verbose=verbose,
                      callbacks=[sv],
                      validation_data=(
                          val_inputs,
                          [start_tokens[idxV,], end_tokens[idxV,]]
                      ))
            # Load best saved model from disk
            print('Loading model...')
            model.load_weights(weights_path)

            # Predicting OOF samples
            print('Predicting OOF...')
            oof_start[idxV,], oof_end[idxV,] = model.predict(
                val_inputs,
                verbose=verbose
            )

            # idxV holds positions, so select rows positionally.
            pred_df = train_df.iloc[idxV].reset_index(drop=True)
            pred_df['oof_st'] = self.get_model_selected_text(
                data_df=pred_df,
                preds_start=oof_start[idxV,],
                preds_end=oof_end[idxV,]
            )
            fold_val_score = pred_df.apply(
                lambda x: self.jaccard(x['selected_text'], x['oof_st']),
                axis=1
            ).mean()
            fold_scores.append(fold_val_score)
            print('>>>> FOLD {} Jaccard score = {}'.format(fold+1,
                                                           fold_val_score))
        if fold_scores:
            print('>>>> OVERALL CV Jaccard score = {}'.format(
                np.mean(fold_scores)))
        return fold_scores

    def predict(self, pred_df, input_ids, attention_mask,
                token_type_ids, n_models, VER='v0', verbose=1):
        """Return a list of predicted 'selected_text' by loading saved models.

        Loads ``'{VER}-roberta-{i}.h5'`` for ``i`` in ``range(n_models)``,
        averages the start/end distributions of all models, and decodes the
        resulting spans against the ``text`` column of ``pred_df``.
        """
        preds_start = np.zeros((input_ids.shape[0], self.max_len))
        preds_end = np.zeros((input_ids.shape[0], self.max_len))
        for i in range(n_models):
            K.clear_session()
            model = self.roberta_model()

            print('Loading model...')
            model.load_weights(self._weights_path(VER, i))

            preds = model.predict(
                [input_ids, attention_mask, token_type_ids],
                verbose=verbose
            )
            # Equal-weight average over the ensemble members.
            preds_start += preds[0]/n_models
            preds_end += preds[1]/n_models

        test_st = self.get_model_selected_text(
            data_df=pred_df,
            preds_start=preds_start,
            preds_end=preds_end
        )
        return test_st

In [17]:
# Directory holding the RoBERTa vocabulary, merges, config and weights.
PATH = '../input/tf-roberta/'

# Byte-level BPE tokenizer built from the RoBERTa vocabulary and merges
# files; input is lower-cased and a prefix space is added.
vocab_path = os.path.join(PATH, 'vocab-roberta-base.json')
merges_path = os.path.join(PATH, 'merges-roberta-base.txt')
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=vocab_path,
    merges_file=merges_path,
    lowercase=True,
    add_prefix_space=True,
)

In [18]:
# Wrap the training frame, then materialise the model inputs and span labels.
train_data = TweetDataset(data_df=train, tokenizer=tokenizer, train=True, max_len=MAX_LEN)
(
    input_ids,
    attention_mask,
    token_type_ids,
    start_tokens,
    end_tokens,
) = train_data()

HBox(children=(FloatProgress(value=0.0, max=27480.0), HTML(value='')))




In [19]:
# Wrap the test frame (no labels) and materialise the model inputs.
test_data = TweetDataset(data_df=test, tokenizer=tokenizer, train=False, max_len=MAX_LEN)
(
    test_input_ids,
    test_attention_mask,
    test_token_type_ids,
) = test_data()

HBox(children=(FloatProgress(value=0.0, max=3534.0), HTML(value='')))




In [20]:
# Bundle sequence length, weights directory and tokenizer; the constructor only
# stores them -- the Keras model itself is built later inside fit()/predict().
QA_model = TransformerQA(max_len=MAX_LEN, 
                         model_path=PATH, 
                         tokenizer=tokenizer)

In [21]:
# Train the RoBERTa model with stratified K-fold cross-validation.
# train_data.data (the NaN-free, re-indexed frame) is passed rather than `train`
# so that its rows line up with the preprocessed arrays; folds are stratified on
# sentiment, and the best weights of each fold are written to the working directory.
QA_model.fit(train_df=train_data.data, 
             input_ids=input_ids, 
             attention_mask=attention_mask, 
             token_type_ids=token_type_ids, 
             start_tokens=start_tokens, 
             end_tokens=end_tokens, 
             stratify_y=train_data.data.sentiment.values)

Training FOLD 1:
Train on 21984 samples, validate on 5496 samples
Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.75551, saving model to v0-roberta-0.h5
Epoch 2/3
Epoch 00002: val_loss improved from 1.75551 to 1.65791, saving model to v0-roberta-0.h5
Epoch 3/3
Epoch 00003: val_loss did not improve from 1.65791
Loading model...
Predicting OOF...
>>>> FOLD 1 Jaccard score = 0.7031862155715445
Training FOLD 2:
Train on 21984 samples, validate on 5496 samples
Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.62839, saving model to v0-roberta-1.h5
Epoch 2/3
Epoch 00002: val_loss did not improve from 1.62839
Epoch 3/3
Epoch 00003: val_loss improved from 1.62839 to 1.59848, saving model to v0-roberta-1.h5
Loading model...
Predicting OOF...
>>>> FOLD 2 Jaccard score = 0.707532562130843
Training FOLD 3:
Train on 21984 samples, validate on 5496 samples
Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.67778, saving model to v0-roberta-2.h5
Epoch 2/3
Epoch 00002: val_loss improve

In [22]:
# Test the RoBERTa model: average the predictions of the 5 saved fold models
# and decode the predicted spans.
# NOTE: this adds a 'selected_text' column to `test` in place.

test['selected_text'] = QA_model.predict(pred_df=test, 
                                         input_ids=test_input_ids, 
                                         attention_mask=test_attention_mask, 
                                         token_type_ids=test_token_type_ids, 
                                         n_models=5)

Loading model...
Loading model...
Loading model...
Loading model...
Loading model...


In [23]:
# Write the submission file (textID and predicted selected_text, no index column).
test[['textID','selected_text']].to_csv('submission.csv',index=False)

In [24]:
# Show 25 random predicted 'selected_text'
# (no random_state is passed, so a different sample is drawn on every run)
test.sample(25)

Unnamed: 0,textID,text,sentiment,selected_text
2094,d60072b03a,He can`t fix it. I guess I`ll write until I ...,negative,lame.
1176,3cb4d10927,Watching WALL-E.....it`s so cute but sad,neutral,watching wall-e.....it`s so cute but sad
2774,5fb30f858b,"Probably not, kinda expensive and we have to ...",negative,expensive
480,819626535b,"Glad to hear you made it out, I hear that pla...",positive,glad
3189,1a857a38fb,Weekend is getting close. Too bad I`ll be stuc...,positive,hopefully i`ll be able to get out next weeken...
2246,676b733c57,Buying pretty shiny beads and things I feel q...,neutral,buying pretty shiny beads and things i feel q...
2418,49c713c76d,Is It The Bit Where Hollie Started Crying?,neutral,is it the bit where hollie started crying?
1184,d2141d6d47,Thats great,positive,thats great
907,b3fa6e5c24,I always hope they will die out but then i se...,negative,sad
2181,8f851c59f6,- halla!!! doing ok- got a cold but trying to...,neutral,- halla!!! doing ok- got a cold but trying to...
