# Loading data

In [5]:
import pandas as pd
import numpy
# Read the training set; the preview below shows the columns id, keyword, location, text and target.
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [12]:
from preprocessor.defines import Patterns

## Gensim helper functions

In [100]:
def get_average_wv(tokens, wv):
    """Average the word vectors of the in-vocabulary tokens of one document.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    wv : object
        Word-vector store supporting ``t in wv``, ``wv[t]`` and a
        ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, wv.vector_size)`` with the mean of the vectors of
        the tokens found in ``wv``. If no token is found (or ``tokens`` is
        empty) the all-zero vector is returned instead of dividing by zero,
        which would yield a NaN vector and a RuntimeWarning.
    """
    vec_sum = numpy.zeros((1, wv.vector_size))
    n = 0
    for t in tokens:
        if t in wv:
            n += 1
            vec_sum += wv[t]  # (vector_size,) broadcasts onto the (1, vector_size) accumulator
    if n == 0:
        # Nothing to average: keep the zero vector rather than computing 0 / 0.
        return vec_sum
    return vec_sum / n

In [116]:
def get_concat_wv(tokens, wv):
    """Stack the word vectors of ``tokens`` into a matrix, one row per token.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    wv : object
        Word-vector store supporting ``t in wv``, ``wv[t]`` and a
        ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(tokens), wv.vector_size)``. Row ``i`` is
        ``wv[tokens[i]]`` when the token is in ``wv`` and all zeros otherwise.
        An empty token sequence gives an array of shape ``(0, vector_size)``.
    """
    tokens = list(tokens)  # need the length up front; also accepts generators
    # Allocate the whole matrix once. Growing it with numpy.append inside the
    # loop copies every existing row for each new token.
    matrix = numpy.zeros((len(tokens), wv.vector_size))
    for row, t in enumerate(tokens):
        if t in wv:
            matrix[row] = wv[t]
        # out-of-vocabulary tokens keep their zero row
    return matrix

In [204]:
def get_coverage(tokens, wv):
    """Return the fraction of all tokens that are present in ``wv``.

    ``tokens`` is a pandas Series whose elements are token lists; ``wv`` only
    needs to support the ``in`` operator. The result is the total number of
    in-vocabulary tokens divided by the total number of tokens.
    """
    known_per_row = tokens.apply(lambda row: sum(1 for tok in row if tok in wv))
    total_per_row = tokens.apply(len)
    known_total = sum(known_per_row)
    token_total = sum(total_per_row)
    return known_total / token_total

In [None]:
from gensim.models import KeyedVectors

# Wordvecs from Glove

In [122]:
# Load the GloVe Twitter vectors from a local gensim KeyedVectors file (100d, per the file name).
wv = KeyedVectors.load('wordvecs/glove.twitter.27B.100d.kv')

## Preprocessing
Preprocessing reimplemented in Python from the Ruby script published with the GloVe Twitter vectors:
https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb

In [185]:
import re
from preprocessor.defines import Patterns

# Regex fragments shared by the emoticon patterns in glove_cleanup.
eyes = "[8:=;]"
# Raw string: "\-" is an invalid escape sequence in a normal string literal.
# The runtime value is unchanged.
nose = r"['`\-]?"

def split_hashtag(m):
    """Regex-substitution callback that rewrites one matched hashtag.

    ``m`` is a match object whose matched text starts with ``#``. A body that
    equals its own upper-casing is kept whole; otherwise it is split in front
    of every capital letter. Either way the result is prefixed with the
    ``_hashtag_`` placeholder.
    """
    body = m.group()[1:]  # drop the leading '#'
    if body.upper() == body:
        return f"_hashtag_ {body}"
    pieces = re.split("(?=[A-Z])", body)
    # A body starting with a capital yields a leading empty piece; strip()
    # removes the resulting leading space.
    return "_hashtag_ " + " ".join(pieces).strip()
            
def glove_cleanup(tweet):
    """Normalise a raw tweet into lower-case text with ``_name_`` placeholders.

    The substitutions run in this order: URLs, slashes (padded with spaces),
    user mentions, emoticons (smile, lolface, sadface, neutralface), ``<3``
    hearts, numbers, hashtags (via ``split_hashtag``), repeated punctuation,
    elongated words and all-caps words. Whitespace runs are then collapsed to
    one space and the result is lower-cased.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased tweet.
    """
    # All regex literals are raw strings: escapes such as "\d", "\S", "\(" and
    # "\/" are invalid escape sequences in normal string literals. The patterns
    # handed to `re` are unchanged.
    tweet = re.sub(Patterns.URL_PATTERN, '_url_', tweet)
    tweet = re.sub("/", " / ", tweet)
    tweet = re.sub(Patterns.MENTION_PATTERN, "_user_", tweet)
    tweet = re.sub(rf"{eyes}{nose}[)d]+|[(d]+{nose}{eyes}", "_smile_", tweet)
    tweet = re.sub(rf"{eyes}{nose}p+", "_lolface_", tweet)
    tweet = re.sub(rf"{eyes}{nose}\(+|\)+{nose}{eyes}", "_sadface_", tweet)
    tweet = re.sub(rf"{eyes}{nose}[\/|l*]", "_neutralface_", tweet)
    tweet = re.sub("<3", "_heart_", tweet)
    tweet = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "_number_", tweet)
    tweet = re.sub(r"#\S+", split_hashtag, tweet)
    # Two or more of the same !, ? or . -> keep one and mark the repetition.
    tweet = re.sub(r'([!?.]){2,}', r'\1 _repeat_', tweet)
    # A character repeated three or more times at the end of a word -> keep one and mark it.
    tweet = re.sub(r'\b(\S*?)(\S)\2{2,}\b', r'\1\2 _elong_', tweet)
    # Words of two or more capital letters get an all-caps marker.
    tweet = re.sub(r'(\b[A-Z][A-Z]+\b)', r'\1 _allcaps_', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)  # remove whitespace repetition
    return tweet.lower()

# Names of the special tokens. reset_glove_token rewrites each `_name_`
# placeholder found in a token to `<name>`.
GLOVE_TWITTER_TOKENS = [
    'url',
    'user',
    'smile',
    'lolface',
    'sadface',
    'neutralface',
    'heart',
    'number',
    'allcaps',
    'hashtag',
    'repeat',
    'elong',
]

def reset_glove_token(t):
    """Rewrite every ``_name_`` placeholder inside ``t`` to ``<name>``.

    ``name`` ranges over ``GLOVE_TWITTER_TOKENS``. Text that is not a
    placeholder is left untouched, and the rewritten string is returned.
    """
    restored = t
    for name in GLOVE_TWITTER_TOKENS:
        placeholder = f'_{name}_'
        marker = f'<{name}>'
        restored = restored.replace(placeholder, marker)
    return restored
        
def reset_glove_tokens(tokens):
    """Apply ``reset_glove_token`` to each token and return the results as a new list."""
    return list(map(reset_glove_token, tokens))

## Tokenization
The GloVe Twitter word vectors were built from text tokenized with the Stanford tokenizer:
https://nlp.stanford.edu/software/tokenizer.shtml

In [207]:
from nltk.tokenize.destructive import NLTKWordTokenizer #similar tokenizer in Python
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import word_tokenize

In [208]:
# Tokenizer used for the GloVe pipeline. NLTKWordTokenizer (imported above) is
# the alternative that was tried and is currently disabled.
tokenizer = TreebankWordTokenizer()

In [209]:
# GloVe pipeline: clean each tweet, tokenize it, then turn the `_name_`
# placeholders into `<name>` tokens.
text = train['text'].apply(glove_cleanup)
tokens = text.apply(tokenizer.tokenize).apply(reset_glove_tokens)
tokens

0       [our, deeds, are, the, reason, of, this, <hash...
1          [forest, fire, near, la, ronge, sask., canada]
2       [all, residents, asked, to, 'shelter, in, plac...
3       [<number>, people, receive, <hashtag>, wildfir...
4       [just, got, sent, this, photo, from, ruby, <ha...
                              ...                        
7608    [two, giant, cranes, holding, a, bridge, colla...
7609    [<user>, <user>, the, out, of, control, wild, ...
7610    [m<number>, [, <number>, utc, <allcaps>, ], ?,...
7611    [police, investigating, after, an, e-bike, col...
7612    [the, latest, :, more, homes, razed, by, north...
Name: text, Length: 7613, dtype: object

In [210]:
num_tokens = tokens.apply(len)

In [211]:
num_tokens.describe()

count    7613.000000
mean       17.646920
std         7.087759
min         1.000000
25%        12.000000
50%        18.000000
75%        23.000000
max        56.000000
Name: text, dtype: float64

In [212]:
# Vocabulary size of the currently loaded vectors (GloVe at this point).
# NOTE(review): `KeyedVectors.vocab` was removed in gensim 4.0 (replaced by `key_to_index` / `len(wv)`) — confirm the gensim version this notebook is pinned to.
len(wv.vocab)

1193514

In [213]:
get_coverage(tokens, wv)

0.9464591428103554

# Wordvecs from Godin

In [214]:
import twokenize

In [215]:
# Rebinds `wv`: from here on it holds the vectors loaded from word2vec_twitter_tokens.kv,
# not the GloVe vectors loaded earlier.
wv = KeyedVectors.load('wordvecs/word2vec_twitter_tokens.kv')

In [216]:
# Rebinds `tokens`: tokenize the raw tweet text (no glove_cleanup) with twokenize.
tokens = train['text'].apply(twokenize.tokenize)
tokens.head()

0    [Our, Deeds, are, the, Reason, of, this, #eart...
1     [Forest, fire, near, La, Ronge, Sask, ., Canada]
2    [All, residents, asked, to, ', shelter, in, pl...
3    [13,000, people, receive, #wildfires, evacuati...
4    [Just, got, sent, this, photo, from, Ruby, #Al...
Name: text, dtype: object

In [217]:
import re
def replace_in_tokens(tokens, pattern, replacement):
    """Run a regex substitution on every token of every token list.

    ``tokens`` is a pandas Series of token lists. ``pattern`` and
    ``replacement`` are passed to ``re.sub`` unchanged. Returns a new Series
    of lists with the substitution applied to each token.
    """
    def _substitute_all(tok_list):
        replaced = []
        for tok in tok_list:
            replaced.append(re.sub(pattern, replacement, tok))
        return replaced

    return tokens.apply(_substitute_all)

# Replace mentions, URLs and digit runs with placeholder tokens, in that order.
for pattern, placeholder in (
    (Patterns.MENTION_PATTERN, '_MENTION_'),
    (Patterns.URL_PATTERN, '_URL_'),
    (r'[0-9]+', '_NUMBER_'),
):
    tokens = replace_in_tokens(tokens, pattern, placeholder)

# Spot-check a few tweets: raw text followed by its processed tokens.
for raw_tweet, tweet_tokens in zip(train['text'].iloc[100:105], tokens.iloc[100:105]):
    print(raw_tweet)
    print(tweet_tokens)

.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad
['.', '_MENTION_', '#Bahrain', 'police', 'had', 'previously', 'died', 'in', 'a', 'road', 'accident', 'they', 'were', 'not', 'killed', 'by', 'explosion', '_URL_']
I still have not heard Church Leaders of Kenya coming forward to comment on the accident issue and disciplinary measures#ArrestPastorNganga
['I', 'still', 'have', 'not', 'heard', 'Church', 'Leaders', 'of', 'Kenya', 'coming', 'forward', 'to', 'comment', 'on', 'the', 'accident', 'issue', 'and', 'disciplinary', 'measures#ArrestPastorNganga']
@afterShock_DeLo scuf ps live and the game... cya
['_MENTION_', 'scuf', 'ps', 'live', 'and', 'the', 'game', '...', 'cya']
'The man who can drive himself further once the effort gets painful is the man who will win.' 
Roger Bannister
["'", 'The', 'man', 'who', 'can', 'drive', 'himself', 'further', 'once', 'the', 'effort', 'gets', 'painful', 'is', 'the', 'man', 'who', 'wi

In [218]:
num_tokens = tokens.apply(len)

In [219]:
num_tokens.describe()

count    7613.000000
mean       16.341258
std         6.396467
min         1.000000
25%        11.000000
50%        16.000000
75%        21.000000
max        39.000000
Name: text, dtype: float64

In [220]:
# Vocabulary size of the vectors loaded from word2vec_twitter_tokens.kv.
# NOTE(review): `KeyedVectors.vocab` was removed in gensim 4.0 (replaced by `key_to_index` / `len(wv)`) — confirm the gensim version this notebook is pinned to.
len(wv.vocab)

3039345

In [221]:
from collections import Counter
from itertools import chain

# Hashtags found in each lower-cased tweet (one list per tweet).
hashtags = train['text'].str.lower().apply(lambda x: Patterns.HASHTAG_PATTERN.findall(x))
name_list = list(chain.from_iterable(hashtags.values))  # every occurrence, duplicates kept
name_set = set(name_list)                               # distinct hashtags
print("Number of different hashtags :", len(name_set))
print("Number :", len(name_list))
# Count occurrences in a single pass. Incrementing a pandas Series
# label-by-label in a Python loop does one indexed read-modify-write per
# occurrence. `names` is still an int64 Series indexed by hashtag; its index
# now follows first-occurrence order instead of set order.
names = pd.Series(dict(Counter(name_list)), dtype='int64')
print(names.sort_values(ascending=False).head(5))

Number of different hashtags : 1927
Number : 3403
#news        76
#            73
#hot         31
#best        30
#prebreak    30
dtype: int64


In [222]:
# Share of hashtag occurrences (duplicates counted) that are present in `wv`.
sum([name in wv for name in name_list])/len(name_list)

0.748751101968851

In [223]:
# Share of distinct hashtags that are present in `wv`.
sum([name in wv for name in name_set])/len(name_set)

0.6751427088738973

In [224]:
# Order the hashtag counts from most to least frequent (rebinds `names`).
names = names.sort_values(ascending=False)

In [225]:
# Turn the count Series into a table with columns `index` (the hashtag) and
# `number` (its count), then flag hashtags that are missing from `wv`.
# NOTE(review): this rebinds `names` from a Series to a DataFrame, so the cell is
# presumably not re-runnable as-is (`name=` is a Series.reset_index argument) — confirm.
names = names.reset_index(name='number')
names['oov'] = names['index'].apply(lambda x:x not in wv)
names.head(10)

Unnamed: 0,index,number,oov
0,#news,76,False
1,#,73,False
2,#hot,31,False
3,#best,30,False
4,#prebreak,30,True
5,#nowplaying,23,False
6,#islam,23,False
7,#hiroshima,22,False
8,#earthquake,19,False
9,#gbbo,18,False


In [226]:
get_coverage(tokens, wv)

0.9684259601626931

In [227]:
wv.vector_size

400

In [228]:
# One averaged embedding per tweet; each element has shape (1, wv.vector_size).
avg_wordvecs = tokens.apply(lambda x: get_average_wv(x, wv))

In [229]:
avg_wordvecs[0].shape

(1, 400)

In [230]:
# One matrix per tweet with a row per token (zero rows for out-of-vocabulary tokens).
concat_wordvecs = tokens.apply(lambda x: get_concat_wv(x, wv))

In [231]:
concat_wordvecs[0].shape

(13, 400)