In [1]:
%config IPCompleter.greedy=True

In [2]:
%matplotlib inline

### 1. Import packages

In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
path = "./"
save_data_path = path + 'save_data/'
print(os.listdir(path))

['test.tsv', 'tokenization.ipynb', 'train.tsv', 'sampleSubmission.csv', 'save_data', '.ipynb_checkpoints']


### 2. Load Data

In [5]:
data = pd.read_csv(path + "train.tsv", sep='\t')
kaggle_test = pd.read_csv(path + "test.tsv", sep='\t')

In [6]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


#### Show max phrase length of data and kaggle_test

In [7]:
data['Phrase_length'] = data['Phrase'].apply(len)
data['Phrase_length'].max()

283

In [8]:
kaggle_test['Phrase_length'] = data['Phrase'].apply(len)
kaggle_test['Phrase_length'].max()

279

#### So no need to use truncated BPTT

### 3. Tokenizing

In [9]:
import html, re, pickle
import spacy
from fastai.core import listify
from fastprogress import progress_bar

#### Define pre-rules

In [10]:
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()

def sub_br(t):
    "Replaces the <br /> by \n"
    re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
    return re_br.sub("\n", t)

def spec_add_spaces(t):
    "Add spaces around / and #"
    return re.sub(r'([/#])', r' \1 ', t)

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return re.sub(' {2,}', ' ', t)

def replace_rep(t):
    "Replace repetitions at the character level: cccc -> TK_REP 4 c"
    def _replace_rep(m) -> str:
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
    
def replace_wrep(t):
    "Replace word repetitions: word word word -> TK_WREP 3 word"
    def _replace_wrep(m) -> str:
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def fixup_text(x):
    "Various messy things we've seen in documents"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
    
default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [11]:
replace_rep('cccc')

' xxrep 4 c '

In [12]:
replace_wrep('word word word word word ')

' xxwrep 5 word  '

#### Define post-rules

In [13]:
def replace_all_caps(x):
    "Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
    res = []
    for t in x:
        if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
        else: res.append(t)
    return res

def deal_caps(x):
    "Replace all Capitalized tokens in by their lower version and add `TK_MAJ` before."
    res = []
    for t in x:
        if t == '': continue
        if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
        res.append(t.lower())
    return res

def add_eos_bos(x): return [BOS] + x + [EOS]

default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]

In [14]:
replace_all_caps(['I', 'AM', 'SHOUTING'])

['I', 'xxup', 'am', 'xxup', 'shouting']

In [15]:
deal_caps(['My', 'name', 'is', 'Jeremy'])

['xxmaj', 'my', 'name', 'is', 'xxmaj', 'jeremy']

In [16]:
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor

def parallel(func, arr, max_workers=4):
    if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    if any([o is not None for o in results]): return results

#### Define tokenize processor

In [17]:
def compose(x, funcs, *args, order_key='_order', **kwargs):
    key = lambda o: getattr(o, order_key, 0)
    for f in sorted(listify(funcs), key=key): x = f(x, **kwargs)
    return x

class TokenizeProcessor():
    def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, args):
        i,chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items): 
        toks = []
        chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        return sum(toks, [])
    
    def proc1(self, item): return self.proc_chunk([item])[0]
    
    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):    return " ".join(tok)

In [18]:
token_processor = TokenizeProcessor()

#### Define indexing processor

In [19]:
import collections
from collections import Counter

class NumericalizeProcessor():
    def __init__(self, vocab=None, max_vocab=60000, min_freq=2): 
        self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq
    
    def __call__(self, items):
        #The vocab is defined on the first use.
        if self.vocab is None:
            freq = Counter(p for o in items for p in o)
            self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
            for o in reversed(default_spec_tok):
                if o in self.vocab: self.vocab.remove(o)
                self.vocab.insert(0, o)
        if getattr(self, 'otoi', None) is None:
            self.otoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.vocab)}) 
        return [self.proc1(o) for o in items]
    def proc1(self, item):  return [self.otoi[o] for o in item]
    
    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    def deproc1(self, idx): return [self.vocab[i] for i in idx]

In [20]:
numericalize_processor = NumericalizeProcessor() 

### Combined process data

In [21]:
tokenized_data = token_processor(data['Phrase'])

In [22]:
numericalized_data = numericalize_processor(tokenized_data)

In [23]:
numericalized_data[0]

[2,
 10,
 341,
 11,
 14246,
 6044,
 8,
 6604,
 19,
 64,
 18,
 58,
 23,
 8,
 2982,
 18,
 191,
 58,
 23,
 8,
 10974,
 9,
 78,
 11,
 91,
 685,
 10173,
 30,
 594,
 11,
 91,
 2129,
 14,
 70,
 11,
 10,
 54,
 15,
 3]

In [24]:
' • '.join(tokenized_data[0])[:400]

'xxbos • a • series • of • escapades • demonstrating • the • adage • that • what • is • good • for • the • goose • is • also • good • for • the • gander • , • some • of • which • occasionally • amuses • but • none • of • which • amounts • to • much • of • a • story • . • xxeos'

In [25]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_length
0,1,1,A series of escapades demonstrating the adage ...,1,188
1,2,1,A series of escapades demonstrating the adage ...,2,77
2,3,1,A series,2,8
3,4,1,A,2,1
4,5,1,series,2,6


In [26]:
data['Tokenized_phrase']= np.asarray(tokenized_data)
data['Indexed_phrase'] = np.asarray(numericalized_data)

In [27]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_length,Tokenized_phrase,Indexed_phrase
0,1,1,A series of escapades demonstrating the adage ...,1,188,"[xxbos, a, series, of, escapades, demonstratin...","[2, 10, 341, 11, 14246, 6044, 8, 6604, 19, 64,..."
1,2,1,A series of escapades demonstrating the adage ...,2,77,"[xxbos, a, series, of, escapades, demonstratin...","[2, 10, 341, 11, 14246, 6044, 8, 6604, 19, 64,..."
2,3,1,A series,2,8,"[xxbos, a, series, xxeos]","[2, 10, 341, 3]"
3,4,1,A,2,1,"[xxbos, a, xxeos]","[2, 10, 3]"
4,5,1,series,2,6,"[xxbos, series, xxeos]","[2, 341, 3]"


##### Check vaid data

In [28]:
len(data['Phrase'])

156060

In [29]:
len(data['Tokenized_phrase'])

156060

In [30]:
len(data['Indexed_phrase'])

156060

### Combined process kaggle_data

In [31]:
tokenized_kaggle_test = token_processor(kaggle_test['Phrase'])

In [32]:
numericalized_kaggle_test = numericalize_processor(tokenized_kaggle_test)

In [33]:
numericalized_kaggle_test[0]

[2, 7, 26, 2606, 1723, 30, 632, 1041, 409, 15, 3]

In [34]:
' • '.join(tokenized_kaggle_test[0])[:400]

'xxbos • xxmaj • an • intermittently • pleasing • but • mostly • routine • effort • . • xxeos'

In [35]:
kaggle_test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Phrase_length
0,156061,8545,An intermittently pleasing but mostly routine ...,188
1,156062,8545,An intermittently pleasing but mostly routine ...,77
2,156063,8545,An,8
3,156064,8545,intermittently pleasing but mostly routine effort,1
4,156065,8545,intermittently pleasing but mostly routine,6


In [36]:
kaggle_test['Tokenized_phrase']= np.asarray(tokenized_kaggle_test)
kaggle_test['Indexed_phrase'] = np.asarray(numericalized_kaggle_test)

##### Check valid kaggle_test

In [37]:
len(kaggle_test['Phrase'])

66292

In [38]:
len(kaggle_test['Tokenized_phrase'])

66292

In [39]:
len(kaggle_test['Indexed_phrase'])

66292

In [40]:
kaggle_test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Phrase_length,Tokenized_phrase,Indexed_phrase
0,156061,8545,An intermittently pleasing but mostly routine ...,188,"[xxbos, xxmaj, an, intermittently, pleasing, b...","[2, 7, 26, 2606, 1723, 30, 632, 1041, 409, 15, 3]"
1,156062,8545,An intermittently pleasing but mostly routine ...,77,"[xxbos, xxmaj, an, intermittently, pleasing, b...","[2, 7, 26, 2606, 1723, 30, 632, 1041, 409, 3]"
2,156063,8545,An,8,"[xxbos, xxmaj, an, xxeos]","[2, 7, 26, 3]"
3,156064,8545,intermittently pleasing but mostly routine effort,1,"[xxbos, intermittently, pleasing, but, mostly,...","[2, 2606, 1723, 30, 632, 1041, 409, 3]"
4,156065,8545,intermittently pleasing but mostly routine,6,"[xxbos, intermittently, pleasing, but, mostly,...","[2, 2606, 1723, 30, 632, 1041, 3]"


## 4. Dump data

In [41]:
pickle.dump(data, open(save_data_path + 'pre-processed-data.pkl', 'wb'))
pickle.dump(kaggle_test, open(save_data_path + 'pre-processed-kaggle-test.pkl', 'wb'))
pickle.dump(numericalize_processor.vocab, open(save_data_path + 'genereated-vocab.pkl', 'wb'))

In [42]:
loaded_data = pickle.load(open(save_data_path + 'pre-processed-data.pkl', 'rb'))
loaded_kaggle_test = pickle.load(open(save_data_path + 'pre-processed-kaggle-test.pkl', 'rb'))
loaded_vocab = pickle.load(open(save_data_path + 'genereated-vocab.pkl', 'rb'))

In [43]:
loaded_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_length,Tokenized_phrase,Indexed_phrase
0,1,1,A series of escapades demonstrating the adage ...,1,188,"[xxbos, a, series, of, escapades, demonstratin...","[2, 10, 341, 11, 14246, 6044, 8, 6604, 19, 64,..."
1,2,1,A series of escapades demonstrating the adage ...,2,77,"[xxbos, a, series, of, escapades, demonstratin...","[2, 10, 341, 11, 14246, 6044, 8, 6604, 19, 64,..."
2,3,1,A series,2,8,"[xxbos, a, series, xxeos]","[2, 10, 341, 3]"
3,4,1,A,2,1,"[xxbos, a, xxeos]","[2, 10, 3]"
4,5,1,series,2,6,"[xxbos, series, xxeos]","[2, 341, 3]"


In [44]:
loaded_kaggle_test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Phrase_length,Tokenized_phrase,Indexed_phrase
0,156061,8545,An intermittently pleasing but mostly routine ...,188,"[xxbos, xxmaj, an, intermittently, pleasing, b...","[2, 7, 26, 2606, 1723, 30, 632, 1041, 409, 15, 3]"
1,156062,8545,An intermittently pleasing but mostly routine ...,77,"[xxbos, xxmaj, an, intermittently, pleasing, b...","[2, 7, 26, 2606, 1723, 30, 632, 1041, 409, 3]"
2,156063,8545,An,8,"[xxbos, xxmaj, an, xxeos]","[2, 7, 26, 3]"
3,156064,8545,intermittently pleasing but mostly routine effort,1,"[xxbos, intermittently, pleasing, but, mostly,...","[2, 2606, 1723, 30, 632, 1041, 409, 3]"
4,156065,8545,intermittently pleasing but mostly routine,6,"[xxbos, intermittently, pleasing, but, mostly,...","[2, 2606, 1723, 30, 632, 1041, 3]"


In [45]:
loaded_vocab[0:100]

['xxunk',
 'xxpad',
 'xxbos',
 'xxeos',
 'xxrep',
 'xxwrep',
 'xxup',
 'xxmaj',
 'the',
 ',',
 'a',
 'of',
 'and',
 '-',
 'to',
 '.',
 "'s",
 'in',
 'is',
 'that',
 'it',
 'as',
 'with',
 'for',
 'its',
 'film',
 'an',
 'movie',
 'this',
 '`',
 'but',
 'be',
 'on',
 'you',
 'by',
 "n't",
 "'",
 'more',
 'his',
 'one',
 'about',
 'not',
 'at',
 'or',
 'than',
 'from',
 'all',
 '--',
 'like',
 'have',
 'are',
 'has',
 'so',
 'out',
 'story',
 '-rrb-',
 'up',
 'who',
 'good',
 'too',
 'most',
 'into',
 '-lrb-',
 'if',
 'what',
 'time',
 'their',
 'no',
 '...',
 'characters',
 'much',
 "''",
 'comedy',
 'your',
 'i',
 'can',
 'just',
 'life',
 'some',
 'does',
 'even',
 'little',
 'funny',
 'will',
 'well',
 'way',
 'very',
 'been',
 'any',
 'make',
 'only',
 'which',
 'he',
 'movies',
 'director',
 'love',
 'do',
 'new',
 'bad',
 'there']