In [1]:
import warnings

import numpy as np
import pandas as pd

import spacy

from tqdm import tqdm

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the
# libraries used below are hidden as well - consider narrowing it.
warnings.filterwarnings('ignore')

# Enable the `progress_apply` method used on the dataframe in later cells.
tqdm.pandas()

In [2]:
# Read the training split and discard incomplete rows in one expression.
train = (
    pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
    .dropna()
)
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')

# Exclude the single tweet flagged as having no selected_text.
train = train.loc[train['textID'] != '12f21c8f19']

In [3]:
class Preprocesser:
    '''
    A class to clean data and bring it into a format which is usable for training.

    All helpers are static; the class is only used as a namespace.
    '''

    @staticmethod
    def count_extra_spaces(text):
        '''
        number of characters `text` loses when leading/trailing whitespace is
        stripped and every inner run of whitespace is collapsed to one space
        '''
        return len(text) - len(' '.join(text.split()))


    @staticmethod
    def has_extra_spaces(text):
        '''
        True when `text` contains at least one extra whitespace character
        (see `count_extra_spaces`)
        '''
        return Preprocesser.count_extra_spaces(text) > 0


    @staticmethod
    def realign_selected_text(text, selected_text):
        '''
        assuming the first part of `selected_text` is the cause of corruption,
        realign the `selected_text` to form new labels

        The part of `text` before the first occurrence of `selected_text` is
        whitespace-normalised, and the label is re-read from that normalised
        tweet at the offsets the label occupies in the raw tweet.

        text: raw tweet
        selected_text: label to realign
        returns: the realigned label; `selected_text` itself when it is empty
                 or does not occur in `text` (there is nothing to realign)
        '''
        # An empty or absent label cannot be located in the tweet.
        if not selected_text or selected_text not in text:
            return selected_text

        # divide the string into two around the FIRST occurrence only:
        # string before selected text and string after selected text.
        # (Splitting on every occurrence would drop the tail of the tweet
        # whenever the label is repeated.)
        start = text.find(selected_text)
        before = text[:start]
        after = text[start + len(selected_text):]

        extra_space_at_end = before.endswith(' ')
        text_starts_with_space = text.startswith(' ')

        # count the number of extra spaces in the first part,
        # ignoring a single space directly in front of the label
        trimmed_before = before[:-1] if extra_space_at_end else before
        extra_spaces = Preprocesser.count_extra_spaces(trimmed_before)

        # One extra space, at the start of the tweet, and the label is preceded
        # by a space: the label starts one character earlier once the leading
        # space is removed.
        if extra_spaces == 1 and text_starts_with_space and extra_space_at_end:
            start -= 1

        # eliminate all extra spaces in the first part, keeping the separator
        # in front of the label if there was one
        text_preprocessed = ' '.join(before.split())
        if extra_space_at_end:
            text_preprocessed += ' '
        text_preprocessed += selected_text + after

        end = start + len(selected_text)
        # several extra spaces and a leading space: the label is one character shorter
        if extra_spaces > 1 and text_starts_with_space:
            end -= 1

        return text_preprocessed[start: end]


    @staticmethod
    def preprocess_selected_text(text, selected_text):
        '''
        assuming that the corrupted text is always from the first part,
        find the corrupted text and realign them to form new labels

        returns the realigned label when every condition below holds,
        otherwise `selected_text` unchanged
        '''
        # An empty or absent label cannot be located, so leave it untouched.
        if not selected_text or selected_text not in text:
            return selected_text

        # part of the tweet in front of the first occurrence of the label
        first_split = text[:text.find(selected_text)]

        conditions = [
            # there are extra spaces in the first split ...
            Preprocesser.has_extra_spaces(first_split),
            # ... which are not (only) a space directly in front of the label
            not first_split.endswith(' '),

            # if text and selected text are same there is no reason to realign
            text.strip() != selected_text.strip(),

            # if selected_text starts at index zero there is no reason to realign
            first_split != '',

            # preserve the extra spaces as these same extra spaces will be in the tweet
            not Preprocesser.has_extra_spaces(selected_text),
        ]

        if all(conditions):
            return Preprocesser.realign_selected_text(text, selected_text)
        return selected_text


    @staticmethod
    def make_training_sample(data):
        '''
        create a (X, y) pair to feed into spacy model

        data: mapping with the keys 'text', 'selected_text' and 'sentiment'
        returns: (text, {'entities': [(start, end, sentiment)]}) where
                 text[start:end] == selected_text
        raises: ValueError when `selected_text` does not occur in `text`
                (instead of silently emitting a negative start offset)
        '''
        text = data['text']
        selected_text = data['selected_text']
        sentiment = data['sentiment']

        start_idx = text.find(selected_text)
        if start_idx == -1:
            raise ValueError(
                f'selected_text {selected_text!r} does not occur in text {text!r}'
            )
        end_idx = start_idx + len(selected_text)

        return (text, {'entities': [(start_idx, end_idx, sentiment)]})

In [4]:
# Run the label realignment row by row (with a progress bar) and store the result
# in a separate column, leaving the `selected_text` column itself unmodified here.
train['new_selected_text'] = train.progress_apply(lambda x: Preprocesser.preprocess_selected_text(x['text'], x['selected_text']), axis=1)

100%|██████████| 27479/27479 [00:00<00:00, 42290.08it/s]


In [5]:
# Show only the rows whose label was changed by the realignment.
train[train['new_selected_text'] != train['selected_text']]

Unnamed: 0,textID,text,selected_text,sentiment,new_selected_text
18,af3fed7fc3,is back home now gonna miss every one,onna,negative,miss
49,3fcea4debc,which case? I got a new one last week and I`m...,d I`m not thrilled at all with mine.,negative,I`m not thrilled at all with mine.
66,95e12b1cb1,He`s awesome... Have you worked with him bef...,s awesome,positive,awesome.
84,4cd390c007,Still no reply from about my SimFinger proble...,", sorry guys",negative,sorry guys
129,94f67cfa6d,hey mia! totally adore your music. when wil...,y adore,positive,adore
...,...,...,...,...,...
27426,132e051fe8,my cousins moved there like 2 years ago and ...,m sad,negative,sad
27429,bb7af2712a,"I`ve heard of disgruntled investors, but call...",? That seems totally out of line,negative,That seems totally out of line
27470,778184dff1,lol i know and haha..did you fall asleep?? o...,t bored,negative,bored
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,lost


In [6]:
# Replace the labels with the realigned ones.
# NOTE(review): this overwrites the `selected_text` column of `train`, so the
# comparison of `new_selected_text` against `selected_text` no longer finds
# differing rows once this cell has run.
train['selected_text'] = train['new_selected_text']
# One (text, {'entities': [(start, end, sentiment)]}) sample per row.
ner_data = train.progress_apply(Preprocesser.make_training_sample, axis=1)
# Underlying array of samples; this is what gets passed to the model for training.
data = ner_data.values

100%|██████████| 27479/27479 [00:00<00:00, 37312.87it/s]


In [7]:
def jaccard(str1, str2):
    '''
    word-level Jaccard similarity of two strings

    Both strings are lower-cased and split on whitespace; the score is
    |A & B| / |A | B| over the resulting sets of words.

    returns: float in [0, 1]; 1.0 when neither string contains a word
             (two empty sets are identical), instead of dividing by zero
    '''
    a = set(str1.lower().split())
    b = set(str2.lower().split())

    union_size = len(a | b)
    if union_size == 0:
        # both strings are empty / whitespace only
        return 1.0

    return len(a & b) / union_size

In [8]:
import os


class NERModel:
    '''
    Wrapper that trains a blank English spaCy pipeline with a single `ner`
    component on the labels 'positive', 'negative' and 'neutral', and saves it.

    NOTE(review): `create_pipe` / `add_pipe(component)` / `update(texts, annotations)`
    look like the spaCy v2 training API - confirm against the installed version.
    '''

    def __init__(self):
        # trained pipeline; set by `train`
        self.nlp = None


    def train(self, data, n_iters=10, drop=0.3):
        '''
        train a fresh pipeline and store it on `self.nlp`

        data: iterable of (text, annotations) samples; it is NOT modified
        n_iters: number of passes over the samples
        drop: dropout rate forwarded to `nlp.update`
        '''
        nlp = spacy.blank('en')
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)

        # create training data for one single model
        for label in ('positive', 'negative', 'neutral'):
            ner.add_label(label)

        # shuffle a private copy so the caller's data keeps its order
        samples = list(data)

        nlp.begin_training()
        for i in range(n_iters):
            np.random.shuffle(samples)
            losses = {}

            for batch in spacy.util.minibatch(samples):
                texts, annotations = zip(*batch)
                nlp.update(
                    texts, # X
                    annotations, # y
                    drop=drop, # dropout rate
                    losses=losses,
                )

            # `losses` stays empty when there were no samples to update on
            print(f'iter {i} loss: {losses.get("ner", 0.0):.4f}')

        self.nlp = nlp


    def save(self, name, base_dir='../working/ner_models'):
        '''
        write the trained pipeline to `<base_dir>/<name>`

        name: model name; also stored in the pipeline meta data
        base_dir: parent directory, created if missing
        raises: RuntimeError when called before `train`
        '''
        nlp = getattr(self, 'nlp', None)
        if nlp is None:
            raise RuntimeError('no trained pipeline to save; call train() first')

        output_dir = os.path.join(base_dir, name)
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(output_dir, exist_ok=True)

        nlp.meta["name"] = name
        nlp.to_disk(output_dir)

In [9]:
%%time

# Number of training iterations passed to NERModel.train.
N_ITERS = 100

model = NERModel()

# Train on the prepared samples (default dropout).
model.train(data, N_ITERS)

# Persist the trained pipeline under the name 'spacy'.
model.save('spacy')

iter 0 loss: 130548.3694
iter 1 loss: 119804.3663
iter 2 loss: 113964.0439
iter 3 loss: 109398.0654
iter 4 loss: 106095.3567
iter 5 loss: 102103.2688
iter 6 loss: 100614.2045
iter 7 loss: 97688.8182
iter 8 loss: 94529.4113
iter 9 loss: 92153.4937
iter 10 loss: 90307.0151
iter 11 loss: 88673.7047
iter 12 loss: 86695.2122
iter 13 loss: 84537.6339
iter 14 loss: 83535.1756
iter 15 loss: 82489.8865
iter 16 loss: 80063.2888
iter 17 loss: 78424.4580
iter 18 loss: 78365.4268
iter 19 loss: 76731.3981
iter 20 loss: 76172.3109
iter 21 loss: 76307.9908
iter 22 loss: 74180.3136
iter 23 loss: 73737.3044
iter 24 loss: 72109.4942
iter 25 loss: 72356.4876
iter 26 loss: 71012.6603
iter 27 loss: 71171.7216
iter 28 loss: 69656.3565
iter 29 loss: 68655.1873
iter 30 loss: 69132.1826
iter 31 loss: 69019.0947
iter 32 loss: 67535.4888
iter 33 loss: 66937.3337
iter 34 loss: 65876.2488
iter 35 loss: 65834.8638
iter 36 loss: 65686.2471
iter 37 loss: 65782.1538
iter 38 loss: 65007.9704
iter 39 loss: 64423.8645
ite

In [10]:
def predict(nlp, texts):
    '''
    predict one selection per text

    nlp: pipeline exposing `pipe(texts)`; each yielded doc provides `ents` and `text`
    texts: iterable of strings (list, generator, pandas Series, ...)
    returns: list with the text of the first entity of each doc, or the whole
             text of the doc when no entity was found
    '''
    preds = []
    for doc in nlp.pipe(texts):
        if doc.ents:
            preds.append(doc.ents[0].text)
        else:
            # fall back to the full text; taken from the doc itself so `texts`
            # does not have to support positional indexing
            preds.append(doc.text)
    return preds


def create_text_selection_pairs(data):
    '''
    split training samples into model inputs and ground truth

    data: iterable of (text, {'entities': [(start, end, label), ...]}) samples
    returns: (texts, selections) - the raw texts and, for each one, the
             substring covered by its first entity
    '''
    # single pass over `data`: keep each text next to its first entity
    firsts = [(sample[0], sample[1].get('entities')[0]) for sample in data]

    texts = [tweet for tweet, _ in firsts]
    selections = [tweet[begin: stop] for tweet, (begin, stop, _) in firsts]
    return texts, selections

In [11]:
# NOTE: `data` holds the training samples, so the predictions below (and any
# score computed from them) describe the fit on the training set only.
texts, selections = create_text_selection_pairs(data)

# Load from the directory the model is saved to ('../working/ner_models/<name>');
# the previous './ner_models/spacy' only matched when the working directory
# happened to be named 'working'.
nlp = spacy.load('../working/ner_models/spacy')
preds = predict(nlp, texts)

In [12]:
# Word-level Jaccard score of every prediction against its ground truth.
scores = [jaccard(truth, guess) for truth, guess in zip(selections, preds)]

print(f'Mean jaccard score: {np.mean(scores):.4f}')

Mean jaccard score: 0.7425
