# Generate wordvecs for the tweets

## Loading data

In [1]:
import pandas as pd

# Read the pre-cleaned train and test tweets, then preview the first training rows.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_cleaned.csv', 'test_cleaned.csv'))
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


## Add wordvecs

In [2]:
import spacy
# Load the "en_core_web_lg" pipeline; `nlp` is used in the cells below to
# turn each tweet into a Doc whose `.vector` and per-token vectors are read.
nlp = spacy.load("en_core_web_lg")

Remove punctuation

In [3]:
import string

# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))

# Strip punctuation and surrounding whitespace from the cleaned text of both sets.
for frame in (train, test):
    stripped = frame['cleaned_text'].str.translate(table)
    frame['cleaned_text'] = stripped.str.strip()

Collapse repeated spaces

In [4]:
# Collapse every run of two or more spaces into a single space.
# One regex pass gives the same result as repeatedly replacing '  ' with ' '
# until no double spaces remain, without scanning each column 280 times.
train['cleaned_text'] = train['cleaned_text'].str.replace(r' {2,}', ' ', regex=True)
test['cleaned_text'] = test['cleaned_text'].str.replace(r' {2,}', ' ', regex=True)

Insert keywords into text

In [5]:
# Replace missing keywords with an empty string.
# Assign the result back instead of using `inplace=True` on a selected column:
# the in-place form is chained assignment, which newer pandas versions warn about
# and which does not modify the parent frame under Copy-on-Write.
train['keyword'] = train['keyword'].fillna('')
test['keyword'] = test['keyword'].fillna('')

In [6]:
# Prefix each tweet with its keyword; strip() drops the leading space that
# is left behind when the keyword is empty.
for frame in (train, test):
    combined = frame['keyword'] + ' ' + frame['cleaned_text']
    frame['cleaned_text_keyword'] = combined.str.strip()

Get wordvecs

In [7]:
# Run the spaCy pipeline on the cleaned text and keep both the parsed Doc
# ('nlp') and its document vector ('wordvec') for the train and test sets.
for frame in (train, test):
    frame['nlp'] = frame['cleaned_text'].apply(lambda text: nlp(text))
    frame['wordvec'] = frame['nlp'].apply(lambda doc: doc.vector)

In [8]:
# Same as above, but on the text that has the keyword prepended:
# 'keyword_nlp' holds the Doc and 'keyword_wordvec' its document vector.
for frame in (train, test):
    frame['keyword_nlp'] = frame['cleaned_text_keyword'].apply(lambda text: nlp(text))
    frame['keyword_wordvec'] = frame['keyword_nlp'].apply(lambda doc: doc.vector)

## Inspect the generated wordvecs

In [9]:
def check(df, i):
    """Print the raw text, cleaned text and per-token vector coverage of one row.

    Parameters
    ----------
    df : DataFrame with 'text', 'cleaned_text' and 'nlp' columns.
    i : positional row index (used with ``.iloc``).

    Prints the original tweet, the cleaned tweet, and then one line per token
    of the row's 'nlp' entry with the token and its ``has_vector`` flag.
    """
    # Read from the frame that was passed in rather than the global `train`,
    # so the helper also works for `test` or any other frame.
    print(df['text'].iloc[i])
    print(df['cleaned_text'].iloc[i])
    for token in df['nlp'].iloc[i]:
        print(token, token.has_vector)

In [10]:
# Inspect the training row at position 1.
check(train, 1)

Forest fire near La Ronge Sask. Canada
Forest fire near La Ronge Sask Canada
Forest True
fire True
near True
La True
Ronge False
Sask True
Canada True


In [11]:
# Inspect the training row at position 99.
check(train, 99)

.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad
Bahrain police had previously died in a road accident they were not killed by explosion
Bahrain True
police True
had True
previously True
died True
in True
a True
road True
accident True
they True
were True
not True
killed True
by True
explosion True


In [12]:
# Inspect the training row at position 888.
check(train, 888)

@TradCatKnight (1) Russia may have played into reason but that link is BS.  Okanowa was bloody and mainline invasion looked like a bloody
1 Russia may have played into reason but that link is BS Okanowa was bloody and mainline invasion looked like a bloody
1 True
Russia True
may True
have True
played True
into True
reason True
but True
that True
link True
is True
BS True
Okanowa False
was True
bloody True
and True
mainline True
invasion True
looked True
like True
a True
bloody True


### OOV words

In [13]:
# Lower-cased OOV token text -> number of occurrences.
oov_tokens = {}

def gather_tokens(oov_tokens, doc):
    """Add the out-of-vocabulary tokens of ``doc`` to the ``oov_tokens`` tally.

    Every token whose ``is_oov`` flag is set is counted under its lower-cased
    string form. The dict is updated in place; nothing is returned.
    """
    for token in doc:
        if not token.is_oov:
            continue
        key = str(token).lower()
        oov_tokens[key] = oov_tokens.get(key, 0) + 1

In [14]:
# Start from an empty tally so that re-running this cell does not count
# every token a second time (gather_tokens accumulates into the dict).
oov_tokens.clear()
# Tally the OOV tokens of both sets; gather_tokens returns None, so the
# displayed Series carries no information.
train['nlp'].apply(lambda doc: gather_tokens(oov_tokens, doc))
test['nlp'].apply(lambda doc: gather_tokens(oov_tokens, doc))

0       None
1       None
2       None
3       None
4       None
        ... 
3258    None
3259    None
3260    None
3261    None
3262    None
Name: nlp, Length: 3263, dtype: object

In [15]:
# One row per distinct OOV token together with its occurrence count.
oov_df = pd.DataFrame(list(oov_tokens.items()), columns=['token', 'number'])

In [16]:
# Most frequent OOV tokens first; print the top 500 as [token, count] pairs.
oov_df = oov_df.sort_values('number', ascending=False)
print(oov_df.iloc[:500].values)

[['mh370' 94]
 ['prebreak' 41]
 ['typhoondevastated' 32]
 ['soudelor' 28]
 ['funtenna' 26]
 ['disea' 25]
 ['gbbo' 23]
 ['udhampur' 21]
 ['bayelsa' 21]
 ['marians' 20]
 ['enugu' 19]
 ['utc20150805' 17]
 ['sensorsenso' 17]
 ['gtgt' 16]
 ['selfimage' 16]
 ['spos' 15]
 ['time20150806' 14]
 ['mtvhottest' 13]
 ['abstorm' 13]
 ['sismo' 13]
 ['bestnaijamade' 12]
 ['mediterran' 12]
 ['hwo' 11]
 ['irandeal' 11]
 ['linkury' 11]
 ['trfc' 11]
 ['okwx' 10]
 ['beyhive' 10]
 ['o784' 10]
 ['meatloving' 10]
 ['yazidis' 10]
 ['wheavenly' 10]
 ['sinjar' 10]
 ['yearold' 10]
 ['summerfate' 10]
 ['inj' 9]
 ['tubestrike' 9]
 ['chicagoarea' 9]
 ['breakingnews' 9]
 ['runion' 9]
 ['animalrescue' 8]
 ['trapmusic' 8]
 ['icemoon' 8]
 ['igers' 8]
 ['olap' 8]
 ['mansehra' 7]
 ['twia' 7]
 ['waterresistant' 7]
 ['explosionproof' 7]
 ['30pm' 7]
 ['pantherattack' 7]
 ['bb17' 7]
 ['zouma' 7]
 ['wisenews' 7]
 ['kisii' 6]
 ['strategicpatience' 6]
 ['auspol' 6]
 ['yycstorm' 6]
 ['abbswinston' 6]
 ['gtgtgt' 6]
 ['kerricktrial

In [17]:
# Share of all tokens (train + test) that are out of vocabulary.
total_tokens = train['nlp'].map(len).sum() + test['nlp'].map(len).sum()
print("Fraction of tokens oov: ", oov_df['number'].sum() / total_tokens)

Fraction of tokens oov:  0.04076309050291505


## Generate concatenated wordvecs

In [88]:
# numpy is first needed here; import it in this cell so the notebook runs
# top to bottom on a fresh kernel.
import numpy

# Scratch check: an empty array with zero rows and three columns.
t = numpy.empty((0,3))
t

array([], shape=(0, 3), dtype=float64)

In [89]:
# Scratch check: appending along axis 0 adds the values as a new row.
t = numpy.append(t, [[1,2,3]],axis=0)
t

array([[1., 2., 3.]])

In [91]:
# Scratch check: a second append stacks another row below the first.
t = numpy.append(t,[[4,5,6]],axis=0)
t

array([[1., 2., 3.],
       [4., 5., 6.]])

In [92]:
# Scratch check: numpy.append returns a new array; `t` is not reassigned here.
numpy.append(t,[[7,8,9]],axis=0)

array([[1., 2., 3.],
       [4., 5., 6.],
       [7., 8., 9.]])

In [93]:
def concatenate(doc):
    """Stack the vectors of all tokens in ``doc`` into one 2-D array.

    Parameters
    ----------
    doc : iterable of tokens, each exposing a ``vector`` attribute.

    Returns
    -------
    numpy.ndarray of dtype float64 with one row per token. A ``doc`` without
    tokens yields an empty array of shape (0, 300).
    """
    vectors = [token.vector for token in doc]
    if not vectors:
        # Keep the (0, 300) shape for documents without tokens.
        return numpy.empty((0, 300))
    # Build the array in one step: calling numpy.append per token would copy
    # the whole array on every iteration. float64 matches the dtype that
    # appending to a numpy.empty array produces.
    return numpy.array(vectors, dtype=numpy.float64)

In [94]:
# Per-tweet matrix of token vectors (one row per token) for both sets.
train['wordvec_concat'] = train['nlp'].apply(concatenate)
test['wordvec_concat'] = test['nlp'].apply(concatenate)

In [96]:
# Sanity check: shape of the token-vector matrix for the row labelled 10.
train['wordvec_concat'][10].shape

(9, 300)

## Generate averaged wordvecs using tf-idf

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [98]:
# All cleaned tweets (train followed by test) as one Series with a fresh index.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
sentences = pd.concat([train['cleaned_text'], test['cleaned_text']], ignore_index=True)

In [99]:
# Fit tf-idf on all tweets; only the learned idf weights are needed, so the
# transformed matrix is not computed.
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit(sentences)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on versions that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Vocabulary term -> idf weight.
idf = dict(zip(feature_names, vectorizer.idf_))

In [100]:
import numpy
def get_tfidf_mean(doc):
    """Return the idf-weighted average of the token vectors in ``doc``.

    A token contributes only if it is not out of vocabulary and its
    lower-cased text has an entry in the module-level ``idf`` mapping.
    Each contributing vector is multiplied by its idf weight, and the sum is
    divided by the number of contributing tokens.

    NOTE(review): the sum is normalised by the token count, not by the sum
    of the idf weights, so the result is not a weighted mean in the strict
    sense - confirm this scaling is intended.

    Returns a zero vector of the same length as ``doc.vector`` when no token
    contributes.
    """
    total = numpy.zeros(len(doc.vector))
    used = 0
    for token in doc:
        if token.is_oov:
            continue
        weight = idf.get(str(token).lower())
        if weight is None:
            # Term is not in the tf-idf vocabulary.
            continue
        total += token.vector * weight
        used += 1
    return total / used if used > 0 else total

In [101]:
# idf-weighted document vector for every tweet in both sets.
train['wordvec_tfidf'] = train['nlp'].apply(get_tfidf_mean)
test['wordvec_tfidf'] = test['nlp'].apply(get_tfidf_mean)

In [102]:
# Display the idf-weighted vectors for comparison with the plain 'wordvec' column.
train['wordvec_tfidf']

0       [-2.0410312242232838, 0.1577752003302941, -0.8...
1       [-0.27185601989428204, 0.2042857458194097, -1....
2       [0.07528745450756767, 0.11175614595413208, -0....
3       [-1.3403782035623277, 1.2000715562275477, 0.11...
4       [-0.7245167245467504, -0.364056259393692, 0.52...
                              ...                        
7556    [0.8706045945485433, -1.4970579627487395, -1.5...
7557    [-0.4535740460786555, 0.6074934605922964, 0.17...
7558    [0.804118087887764, 0.12538206577301025, 1.120...
7559    [0.3517053962192115, 0.9118178826482857, -1.03...
7560    [-0.40453157777136023, 1.0456219220703298, 0.1...
Name: wordvec_tfidf, Length: 7561, dtype: object

In [103]:
# Display the plain document vectors for comparison with 'wordvec_tfidf'.
train['wordvec']

0       [-0.26623327, 0.05843069, -0.1404636, -0.05265...
1       [-0.025449565, 0.031005142, -0.15566371, -0.23...
2       [0.0059339865, 0.016337818, -0.105279535, -0.0...
3       [-0.18147185, 0.20731743, 0.014147284, -0.2182...
4       [-0.06394094, -0.01423019, 0.0063574947, 0.071...
                              ...                        
7556    [0.1382938, -0.1897513, -0.23208952, 0.0764361...
7557    [-0.035660267, 0.12369229, 0.0052933893, -0.09...
7558    [0.10752811, 0.074644, 0.046825, -0.2527535, 0...
7559    [0.03056033, 0.15288134, -0.13398015, 0.086398...
7560    [-0.08448874, 0.18175545, 0.011566181, -0.1192...
Name: wordvec, Length: 7561, dtype: object

## To explore: use wordninja to cut up composite words/hashtags
Several OOV words seem to be composites that could be split into known words. Wordninja can do this. For now they are left as is, since OOV tokens make up only about 4% of all tokens.

In [104]:
import wordninja

In [105]:
# Example: split one of the frequent composite OOV tokens.
wordninja.split('typhoondevastated')

['typhoon', 'devastated']

In [106]:
# Example: the most frequent OOV token, which mixes letters and digits.
wordninja.split('mh370')

['mh', '370']

In [107]:
# Example: another frequent OOV token from the list above.
wordninja.split('prebreak')

['pre', 'break']

# Saving the data

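The following columns are saved for both the train and the test set:
* id: id of the tweet
* wordvec: wordvec from the tweet (average of the wordvec of all words in the tweet)
* keyword_wordvec: same as wordvec, but computed on the tweet text with its keyword prepended (keyword_nlp)
* wordvec_concat: array with one row per token, holding the wordvec of that token
* wordvec_tfidf: sum of the idf-weighted wordvecs of the in-vocabulary tokens, divided by the number of tokens used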

In [108]:
# Columns written to the output pickles: the tweet id plus every vector variant.
columns_to_save = [
    'id',
    'wordvec',
    'keyword_wordvec',
    'wordvec_concat',
    'wordvec_tfidf',
]

In [109]:
# Write the selected training columns to 'train_wordvec.pickle'.
train[columns_to_save].to_pickle('train_wordvec.pickle')

In [110]:
# Write the selected test columns to 'test_wordvec.pickle'.
test[columns_to_save].to_pickle('test_wordvec.pickle')