In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *

In [3]:
# Base directory for the notebook: the current working directory.
path = Path('.')

In [4]:
import re
import ftfy
import json
import spacy

from tqdm import tqdm

def get_pairs(word):
    """
    Return the set of adjacent symbol pairs in a word.

    word is a sequence of symbols (a tuple of variable-length strings, or a
    plain string whose characters are the symbols).

    Returns a set of (left, right) tuples, one per distinct pair of
    neighbouring symbols. A word with fewer than two symbols, including an
    empty one, yields an empty set rather than raising.
    """
    # zip stops at the shorter argument, so 0- and 1-symbol words need no guard.
    return set(zip(word, word[1:]))

def text_standardize(text):
    """
    Normalise punctuation and whitespace before tokenization.

    Works around some issues the spacy tokenizer had on the books corpus:
    dash variants become '-', the ellipsis character becomes '...', the acute
    accent becomes an apostrophe, and runs of the listed punctuation marks
    are padded with a space on each side. Afterwards every newline is
    surrounded by exactly one space and any other whitespace run collapses
    to a single space. The result is stripped at both ends.
    """
    # Single-character substitutions; none of the targets overlaps another
    # rule's source character.
    substitutions = (
        ('—', '-'),
        ('–', '-'),
        ('―', '-'),
        ('…', '...'),
        ('´', "'"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Pad each run of one repeated punctuation mark with spaces.
    padded = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
    # Whitespace around a newline (including further newlines) becomes ' \n '.
    line_breaks_fixed = re.sub(r'\s*\n\s*', ' \n ', padded)
    # Collapse every remaining non-newline whitespace run.
    collapsed = re.sub(r'[^\S\n]+', ' ', line_breaks_fixed)
    return collapsed.strip()

# Text clean-up callables, listed in the same order TextEncoder.encode applies
# them (ftfy.fix_text first, then text_standardize). Passed to the Tokenizer
# as its pre_rules further down.
pre_rules = [ftfy.fix_text, text_standardize]

In [5]:
class TextEncoder(object):
    """
    Mostly a wrapper for a public python bpe tokenizer.

    Attributes:
        nlp: spaCy pipeline used only to split text into tokens.
        encoder: dict mapping a BPE symbol to its integer id (from the JSON file).
        decoder: the inverse mapping, id -> symbol.
        bpe_ranks: dict mapping a (left, right) merge pair to its priority;
            lower rank means the pair is merged earlier.
        cache: memo of token -> BPE string for multi-symbol tokens seen so far.
    """

    def __init__(self, encoder_path, bpe_path):
        """
        Load the spaCy tokenizer, the symbol->id table and the merge list.

        encoder_path: path to a JSON file holding the symbol -> id mapping.
        bpe_path: path to a text file of merges, one whitespace-separated
            pair per line; its first and last lines are skipped.
        """
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        # Context managers close both files promptly; the JSON is read as
        # UTF-8 to match the merges file instead of the platform default.
        with open(encoder_path, encoding='utf-8') as encoder_file:
            self.encoder = json.load(encoder_file)
        self.decoder = {v:k for k,v in self.encoder.items()}
        with open(bpe_path, encoding='utf-8') as bpe_file:
            merges = bpe_file.read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

    def bpe(self, token):
        """
        Apply the learned merges to one token.

        token: a non-empty string (an empty string raises IndexError).

        Returns the token's BPE symbols joined by single spaces, with the
        end-of-word marker '</w>' appended to the last symbol.
        """
        # Check the memo first so cached tokens skip building the symbol tuple.
        if token in self.cache:
            return self.cache[token]

        # Start from single characters, tagging the final one as word-final.
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            # Single-character token: nothing to merge (and not cached).
            return token+'</w>'

        while True:
            # Pick the adjacent pair with the best (lowest) merge rank.
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                # No remaining pair is a known merge.
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the tail unchanged.
                    new_word.extend(word[i:])
                    break
                # Copy everything before the next occurrence of `first`.
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        # Normalise this one exact spelling of the newline token.
        if word == '\n  </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def encode(self, texts, verbose=True):
        """
        Convert an iterable of raw strings into lists of integer ids.

        Each text is cleaned with ftfy.fix_text and text_standardize, split
        into tokens by spaCy, lower-cased and BPE-encoded. Symbols missing
        from the encoder map to id 0.

        texts: iterable of strings.
        verbose: when true, wrap the iteration in a tqdm progress bar.

        Returns a list with one list of ids per input text.
        """
        # Only the iterable differs between the verbose and quiet paths.
        iterator = tqdm(texts, ncols=80, leave=False) if verbose else texts
        texts_tokens = []
        for text in iterator:
            doc = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in doc:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
        return texts_tokens

In [6]:
# Build the encoder from the symbol->id JSON and the BPE merges file under models/.
text_encoder = TextEncoder('models/encoder_bpe_40000.json', 'models/vocab_40000.bpe')

In [7]:
def bpe(toks, text_encoder):
    """
    Tokenizer post-rule: re-split already tokenized text into BPE symbols.

    toks: list of token strings. The first element is dropped
        (presumably the BOS marker added upstream; confirm against the
        Tokenizer's rules).
    text_encoder: object exposing bpe(token) -> space-joined symbol string.

    Returns a new list: BOS, then the BPE symbols of every remaining token
    (lower-cased first), then the '_classify_' marker.
    """
    symbols = [
        symbol
        for tok in toks[1:]
        for symbol in text_encoder.bpe(tok.lower()).split(' ')
    ]
    return [BOS, *symbols, '_classify_']

In [8]:
# Bind the encoder so the rule can be called with the token list alone.
post_rules = [partial(bpe, text_encoder=text_encoder)]

In [9]:
# Tokenizer configured with the custom pre- and post-rules defined above.
tokenizer = Tokenizer(pre_rules=pre_rules, post_rules=post_rules)

In [10]:
# BPE symbol -> id mapping loaded from the encoder JSON.
stoi = text_encoder.encoder

In [11]:
# Vocabulary list: the three special tokens first, followed by every BPE
# symbol in the encoder dict's own iteration order.
itos = [UNK, BOS, '_classify_', *stoi]

In [12]:
# Inspect the first ten vocabulary entries.
itos[:10]

['xxunk', 'xxbos', '_classify_', '.', ',', 't', 'h', 'e', '"', 'o']

In [13]:
# Wrap the ordered symbol list in a Vocab object.
vocab = Vocab(itos)

In [14]:
#data_lm = TextLMDataBunch.from_csv(path, 'Sarcasm_Headlines_Dataset.csv')
# Classification DataBunch built from the CSV, using the BPE tokenizer and
# vocabulary defined above with a batch size of 32.
data_clas = TextClasDataBunch.from_csv(path, 'Sarcasm_Headlines_Dataset.csv', 
                                       tokenizer = tokenizer, vocab = vocab, bs=32)

In [15]:
# Display a sample of tokenized rows together with their targets.
data_clas.show_batch()

text,target
"'</w> 12</w> years</w> a</w> slave</w> ,</w> '</w> '</w> captain</w> phillips</w> ,</w> '</w> '</w> american</w> hustle</w> ,</w> '</w> '</w> wolf</w> of</w> wall</w> street</w> ,</w> '</w> '</w> blue</w> jasmine</w> ,</w> '</w> '</w> dallas</w> buyers</w> club</w> ,</w> '</w> '</w> her</w> ,</w> '</w> '</w> nebraska</w> ,</w> '</w> '</w> before</w> midnight</w> ,</w> '</w> and</w> '</w> philo men a</w> '</w> all</w> written</w> during</w> same</w> continuing</w> education</w> screen writing</w> class</w> _classify_",1
"maya</w> angel ou</w> ,</w> poet</w> ,</w> author</w> ,</w> civil</w> rights</w> acti vist</w> ,</w> and</w> -</w> holy</w> cow</w> -</w> tony</w> award</w> -</w> nominated</w> actress</w> ,</w> college</w> professor</w> ,</w> magazine</w> editor</w> ,</w> street car</w> conductor</w> -</w> really</w> ?</w> street car</w> conductor</w> ?</w> wow</w> -</w> calypso</w> singer</w> ,</w> nightclub</w> performer</w> ,</w> and</w> foreign</w> journalist</w> ,</w> dead</w> at</w> 86</w> _classify_",1
"occasionally</w> you</w> realize</w> someone</w> you</w> thought</w> was</w> a</w> dear</w> friend</w> is</w> actually</w> a</w> foe</w> ,</w> their</w> true</w> character</w> finally</w> revealed</w> .</w> but</w> how</w> do</w> you</w> forgive</w> the</w> unforgivable</w> ?</w> here</w> are</w> my</w> 10</w> steps</w> to</w> handling</w> betrayal</w> with</w> elegance</w> and</w> grace</w> .</w> _classify_",0
navy</w> dis continues</w> use</w> of</w> '</w> port</w> '</w> and</w> '</w> star boar d' will</w> now</w> refer</w> to</w> left</w> as</w> '</w> thunk</w> '</w> and</w> right</w> as</w> '</w> moo sh</w> -</w> bar oo</w> '</w> _classify_,1
a</w> labor</w> day</w> documentary</w> :</w> '</w> brothers</w> on</w> the</w> line</w> '</w> tells</w> the</w> story</w> of</w> the</w> reu ther</w> brothers</w> --</w> founding</w> fathers</w> of</w> the</w> american</w> middle</w> class</w> _classify_,0


In [16]:
# Text classifier learner built on data_clas with the Transformer class and drop_mult=0.5.
learn = text_classifier_learner(data_clas, Transformer, drop_mult=0.5)

In [17]:
# Unfreeze the learner, then run 3 one-cycle epochs; 6.25e-5 is the second
# positional argument (presumably the maximum learning rate; confirm).
learn.unfreeze()
learn.fit_one_cycle(3, 6.25e-5)

epoch,train_loss,valid_loss,accuracy
1,0.490509,0.515282,0.767690
2,0.427520,0.516817,0.777237
3,0.372072,0.395693,0.822538
