# Generate wordvecs for the tweets

## Loading data

In [31]:
import pandas as pd
import numpy
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,glove_cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,our deeds are the reason of this <hashtag> ear...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,all residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,<number> people receive <hashtag> wildfires ev...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,just got sent this photo from ruby <hashtag> a...


## Add wordvecs

In [32]:
MODEL = "wordvecs/glove.twitter.27B.25d"
MODEL_NAME = "glove.twitter.27B.25d"
if MODEL_NAME.startswith('glove.'):
    TEXT_COL = 'glove_cleaned_text'
else:
    TEXT_COL = 'cleaned_text'

In [33]:
import spacy
# Load a larger model with vectors
#nlp = spacy.load("en_core_web_lg")
nlp = spacy.load(MODEL)

Remove punctuation

In [34]:
from spacy.symbols import ORTH
import string
TWITTER_TOKENS = ['<url>', '<user>', '<smile>', '<lolface>','<sadface>','<neutralface>',
                 '<heart>','<number>','<allcaps>','<hashtag>','<repeat>','<elong>']

if MODEL_NAME.startswith('glove.'):
    for t in TWITTER_TOKENS:
        nlp.tokenizer.add_special_case(t,[{ORTH: t}])
else:
    table = str.maketrans('', '', string.punctuation)
    train['cleaned_text'] = train['cleaned_text'].str.translate(table).str.strip()
    test['cleaned_text'] = test['cleaned_text'].str.translate(table).str.strip()    

Insert keywords into text

In [35]:
train['keyword'].fillna('',inplace=True)
test['keyword'].fillna('',inplace=True)

In [36]:
train['cleaned_text_keyword'] = (train['keyword'] + ' ' + train[TEXT_COL]).str.strip()
test['cleaned_text_keyword'] = (test['keyword'] + ' ' + test[TEXT_COL]).str.strip()

Get wordvecs

In [37]:
train['nlp'] = train[TEXT_COL].apply(lambda s: nlp(s))
train['wordvec'] = train['nlp'].apply(lambda s: s.vector)
test['nlp'] = test[TEXT_COL].apply(lambda s: nlp(s))
test['wordvec'] = test['nlp'].apply(lambda s: s.vector)

In [38]:
train['keyword_nlp'] = train['cleaned_text_keyword'].apply(lambda s:nlp(s))
train['keyword_wordvec'] = train['keyword_nlp'].apply(lambda s: s.vector)
test['keyword_nlp'] = test['cleaned_text_keyword'].apply(lambda s:nlp(s))
test['keyword_wordvec'] = test['keyword_nlp'].apply(lambda s: s.vector)

## Inspect the generate wordvecs

In [39]:
def check(df, i):
    print(train['text'].iloc[i])
    print(train[TEXT_COL].iloc[i])
    for token in train['nlp'].iloc[i]:
        print(token, token.has_vector)

In [40]:
check(train, 1)

Forest fire near La Ronge Sask. Canada
forest fire near la ronge sask. canada
forest True
fire True
near True
la True
ronge True
sask True
. True
canada True


In [41]:
check(train, 99)

.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad
.<user> <hashtag> bahrain police had previously died in a road accident they were not killed by explosion <url>
.<user False
> True
<hashtag> True
bahrain True
police True
had True
previously True
died True
in True
a True
road True
accident True
they True
were True
not True
killed True
by True
explosion True
<url> True


In [42]:
check(train, 888)

@TradCatKnight (1) Russia may have played into reason but that link is BS.  Okanowa was bloody and mainline invasion looked like a bloody
<user> (<number>) russia may have played into reason but that link is bs <allcaps>. okanowa was bloody and mainline invasion looked like a bloody
<user> True
( True
<number> True
) True
russia True
may True
have True
played True
into True
reason True
but True
that True
link True
is True
bs True
<allcaps> True
. True
okanowa False
was True
bloody True
and True
mainline True
invasion True
looked True
like True
a True
bloody True


### OOV words

In [43]:
oov_tokens = dict()

def gather_tokens(oov_tokens, doc):
    for token in doc:
        if token.is_oov:
            if str(token).lower() in oov_tokens:
                oov_tokens[str(token).lower()] += 1
            else:
                oov_tokens[str(token).lower()] = 1

In [44]:
train['nlp'].apply(lambda x: gather_tokens(oov_tokens,x))
test['nlp'].apply(lambda x: gather_tokens(oov_tokens,x))

0       None
1       None
2       None
3       None
4       None
        ... 
3258    None
3259    None
3260    None
3261    None
3262    None
Name: nlp, Length: 3263, dtype: object

In [45]:
oov_df = pd.DataFrame({'token':list(oov_tokens.keys()), 'number':list(oov_tokens.values())})

In [46]:
oov_df = oov_df.sort_values(by='number',ascending=False)
print(oov_df.head(500).values)

[['allcaps><number' 190]
 ['\x89û' 87]
 ['\x89ûò' 66]
 ['number>%' 58]
 ['number><number' 54]
 ['.<user' 45]
 ['bioterror' 42]
 ['\x89ûó' 42]
 ['prebreak' 41]
 ['bioterrorism' 38]
 ['re\x89û' 37]
 ['soudelor' 36]
 ['number><number><number' 34]
 ['number>-year' 28]
 ['funtenna' 26]
 ['disea' 25]
 ['udhampur' 22]
 ['\x89ûï' 22]
 ['don\x89ûªt' 21]
 ['\x89ûïwhen' 18]
 ['allcaps><number><number><number' 18]
 ['crematoria' 17]
 ['number>+' 17]
 ['gt;&gt' 14]
 ['spos' 14]
 ['canaanites' 14]
 ['rea\x89û' 14]
 ['repeat><hashtag' 13]
 ['inundation' 12]
 ['it\x89ûªs' 12]
 ['bestnaijamade' 12]
 ['mediterran' 12]
 ['china\x89ûªs' 11]
 ['i\x89ûªm' 11]
 ['read\x89û' 11]
 ['linkury' 11]
 ['mhtw' 11]
 ['fnet' 11]
 ['yazidis' 10]
 ['sinjar' 10]
 ['å£<number' 9]
 ['by\x89û' 9]
 ['microlight' 9]
 ['rì' 9]
 ['lt;<number' 9]
 ['ices\x89û' 8]
 ['\x89û÷politics' 8]
 ['animalrescue' 8]
 ["allcaps>'t" 8]
 ['number>\x89û' 8]
 ['icemoon' 8]
 ['grief\x89ûª' 8]
 ['can\x89ûªt' 7]
 ['america\x89ûªs' 7]
 ['twia' 7]
 [

In [47]:
print("Fraction of tokens oov: ", 
      oov_df.number.sum() / (train['nlp'].apply(len).sum() + test['nlp'].apply(len).sum()))

Fraction of tokens oov:  0.023790731762318187


## Generate concatenated wordvecs

In [48]:
def concatenate(doc):    
    concat = numpy.empty((0, doc.vocab.vectors_length))
    for token in doc:
        concat = numpy.append(concat, [token.vector], axis = 0)
    return concat

In [49]:
train['wordvec_concat'] = train['nlp'].apply(lambda s: concatenate(s))
test['wordvec_concat'] = test['nlp'].apply(lambda s: concatenate(s))

In [50]:
train['wordvec_concat'][10].shape

(9, 25)

## Generated averaged wordvecs using tfidf

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [52]:
sentences = train['cleaned_text'].append(test['cleaned_text'],ignore_index=True)

In [53]:
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(sentences)
idf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

In [54]:
import numpy
def get_tfidf_mean(doc):
    avg = numpy.zeros(len(doc.vector))
    n = 0
    for token in doc:        
        if not token.is_oov:
            token_str = str(token).lower()
            if token_str in idf:
                avg += token.vector * idf[token_str]
                n += 1
    if n>0:
        avg = avg/n
    return avg

In [55]:
train['wordvec_tfidf'] = train['nlp'].apply(lambda s: get_tfidf_mean(s))
test['wordvec_tfidf'] = test['nlp'].apply(lambda s: get_tfidf_mean(s))

In [56]:
train['wordvec_tfidf']

0       [-0.4900269259053927, 1.0778943672776222, -1.7...
1       [-3.9615797315325056, -1.5446132038320814, -2....
2       [-2.1109436134045776, 3.079569337212227, -0.78...
3       [-1.9814261879239763, 4.416366151400974, -3.53...
4       [-2.43848686615626, 0.6247085710366567, 1.3067...
                              ...                        
7556    [-4.0677333739068775, 0.2842167301310433, 2.45...
7557    [-0.4930134703301721, 0.1357929338183668, -0.9...
7558    [-0.7093669772148132, 0.7391833066940308, 0.59...
7559    [-1.9778278940602352, 1.721481098940498, -0.25...
7560    [-1.0706927833909339, 0.13924438641829925, -0....
Name: wordvec_tfidf, Length: 7561, dtype: object

In [57]:
train['wordvec']

0       [0.032318536, 0.12929907, -0.35238042, 0.04862...
1       [-0.40937126, -0.32038173, -0.21201876, -0.004...
2       [-0.17538336, 0.25031605, -0.11056641, -0.1157...
3       [-0.062765546, 0.457585, -0.46744218, -0.26988...
4       [-0.22135386, 0.082693174, 0.080948114, 0.0187...
                              ...                        
7556    [-0.41963834, -0.021819353, 0.30132917, 0.0265...
7557    [0.027209358, -0.032375943, -0.13020572, -0.04...
7558    [0.3659297, -0.16286746, -0.12082411, -0.52468...
7559    [0.025096677, -0.031307478, -0.043185014, -0.1...
7560    [0.099660195, -0.14457195, -0.1661434, -0.3547...
Name: wordvec, Length: 7561, dtype: object

## To explore: use wordninja to cut up composite words/hashtags
Several OOV words seem to be composites that could be cut up. Wordninja can do this. For now just leave as is, since it's only 4% of the words.

In [58]:
import wordninja

In [59]:
wordninja.split('typhoondevastated')

['typhoon', 'devastated']

In [60]:
wordninja.split('mh370')

['mh', '370']

In [61]:
wordninja.split('prebreak')

['pre', 'break']

# Generate wordvecs for multiple models

In [62]:
models = ["wordvecs/glove.twitter.27B.25d",
          "wordvecs/glove.twitter.27B.50d",
          "wordvecs/glove.twitter.27B.100d",
          "wordvecs/glove.twitter.27B.200d",
          "en_core_web_lg"
         ]

In [63]:
for m in models:
    model_name = m.split('/')[-1]
    print("Loading model ",m)
    nlp = spacy.load(m)
    print("Wordvec length ",nlp.vocab.vectors_length)
    if model_name.startswith('glove.'):
        text_col = 'glove_cleaned_text'
    else:
        text_col = 'cleaned_text'
    
    train['nlp'] = train[text_col].apply(lambda s: nlp(s))
    train['wordvec'] = train['nlp'].apply(lambda s: s.vector)
    test['nlp'] = test[text_col].apply(lambda s: nlp(s))
    test['wordvec'] = test['nlp'].apply(lambda s: s.vector)
    
    train['cleaned_text_keyword'] = (train['keyword'] + ' ' + train[text_col]).str.strip()
    test['cleaned_text_keyword'] = (test['keyword'] + ' ' + test[text_col]).str.strip()
    train['keyword_nlp'] = train['cleaned_text_keyword'].apply(lambda s:nlp(s))
    train['keyword_wordvec'] = train['keyword_nlp'].apply(lambda s: s.vector)
    test['keyword_nlp'] = test['cleaned_text_keyword'].apply(lambda s:nlp(s))
    test['keyword_wordvec'] = test['keyword_nlp'].apply(lambda s: s.vector)
    
    oov_tokens = dict()
    
    train['nlp'].apply(lambda x: gather_tokens(oov_tokens,x))
    oov_df = pd.DataFrame({'token':list(oov_tokens.keys()), 'number':list(oov_tokens.values())})
    print("Fraction of tokens oov: ", round(100.0*
      oov_df.number.sum() / train['nlp'].apply(len).sum(),1), '%')
    
    train['wordvec_concat'] = train['nlp'].apply(lambda s: concatenate(s))
    test['wordvec_concat'] = test['nlp'].apply(lambda s: concatenate(s))
    
    train['wordvec_tfidf'] = train['nlp'].apply(lambda s: get_tfidf_mean(s))
    test['wordvec_tfidf'] = test['nlp'].apply(lambda s: get_tfidf_mean(s))
    
    columns_to_save = ['id', 'wordvec', 'keyword_wordvec', 'wordvec_concat', 'wordvec_tfidf']
    train[columns_to_save].to_pickle('train_wordvec_'+model_name+'.pickle')
    test[columns_to_save].to_pickle('test_wordvec_'+model_name+'.pickle')

Loading model  wordvecs/glove.twitter.27B.25d
Wordvec length  25
Fraction of tokens oov:  1.9 %
Loading model  wordvecs/glove.twitter.27B.50d
Wordvec length  50
Fraction of tokens oov:  1.9 %
Loading model  wordvecs/glove.twitter.27B.100d
Wordvec length  100
Fraction of tokens oov:  1.9 %
Loading model  wordvecs/glove.twitter.27B.200d
Wordvec length  200
Fraction of tokens oov:  1.9 %
Loading model  en_core_web_lg
Wordvec length  300
Fraction of tokens oov:  3.3 %


# Saving the data

The following wordvec data is produced:
* wordvec: wordvec from the tweet (average of the wordvec of all words in the tweet)
* keyword_wordvec: wordvec from keyword_nlp