# Average wordvecs for tweets

## Loading data

In [146]:
import pandas as pd
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


## Add wordvecs

In [147]:
import spacy
# Load a larger model with vectors
nlp = spacy.load("en_core_web_lg")

Remove punctuation

In [148]:
import string
table = str.maketrans('', '', string.punctuation)
train['cleaned_text'] = train['cleaned_text'].str.translate(table).str.strip()
test['cleaned_text'] = test['cleaned_text'].str.translate(table).str.strip()

Remove whitespace

In [149]:
for _ in range(280):
    train['cleaned_text'] = train['cleaned_text'].str.replace('  ', ' ')
    test['cleaned_text'] = test['cleaned_text'].str.replace('  ', ' ')

Insert keywords into text

In [150]:
train['keyword'].fillna('',inplace=True)
test['keyword'].fillna('',inplace=True)

In [151]:
train['cleaned_text_keyword'] = (train['keyword'] + ' ' + train['cleaned_text']).str.strip()
test['cleaned_text_keyword'] = (test['keyword'] + ' ' + test['cleaned_text']).str.strip()

Get wordvecs

In [152]:
train['nlp'] = train['cleaned_text'].apply(lambda s: nlp(s))
train['wordvec'] = train['nlp'].apply(lambda s: s.vector)
test['nlp'] = test['cleaned_text'].apply(lambda s: nlp(s))
test['wordvec'] = test['nlp'].apply(lambda s: s.vector)

In [153]:
train['keyword_nlp'] = train['keyword'].apply(lambda s:nlp(s))
train['keyword_wordvec'] = train['keyword_nlp'].apply(lambda s: s.vector)
test['keyword_nlp'] = test['keyword'].apply(lambda s:nlp(s))
test['keyword_wordvec'] = test['keyword_nlp'].apply(lambda s: s.vector)

In [154]:
def check(df, i):
    print(train['text'].iloc[i])
    print(train['cleaned_text'].iloc[i])
    for token in train['nlp'].iloc[i]:
        print(token, token.has_vector)

In [155]:
check(train, 1)

Forest fire near La Ronge Sask. Canada
Forest fire near La Ronge Sask Canada
Forest True
fire True
near True
La True
Ronge False
Sask True
Canada True


In [156]:
check(train, 99)

.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad
Bahrain police had previously died in a road accident they were not killed by explosion
Bahrain True
police True
had True
previously True
died True
in True
a True
road True
accident True
they True
were True
not True
killed True
by True
explosion True


In [157]:
check(train, 888)

@TradCatKnight (1) Russia may have played into reason but that link is BS.  Okanowa was bloody and mainline invasion looked like a bloody
1 Russia may have played into reason but that link is BS Okanowa was bloody and mainline invasion looked like a bloody
1 True
Russia True
may True
have True
played True
into True
reason True
but True
that True
link True
is True
BS True
Okanowa False
was True
bloody True
and True
mainline True
invasion True
looked True
like True
a True
bloody True


### OOV words

In [158]:
oov_tokens = dict()

def gather_tokens(oov_tokens, doc):
    for token in doc:
        if token.is_oov:
            if str(token).lower() in oov_tokens:
                oov_tokens[str(token).lower()] += 1
            else:
                oov_tokens[str(token).lower()] = 1

In [159]:
train['nlp'].apply(lambda x: gather_tokens(oov_tokens,x))
test['nlp'].apply(lambda x: gather_tokens(oov_tokens,x))

0       None
1       None
2       None
3       None
4       None
        ... 
3258    None
3259    None
3260    None
3261    None
3262    None
Name: nlp, Length: 3263, dtype: object

In [160]:
oov_df = pd.DataFrame({'token':list(oov_tokens.keys()), 'number':list(oov_tokens.values())})

In [161]:
oov_df = oov_df.sort_values(by='number',ascending=False)
oov_df

Unnamed: 0,token,number
743,mh370,94
1933,prebreak,41
1219,typhoondevastated,32
817,soudelor,28
1932,funtenna,26
...,...,...
1617,aveblack,1
1618,jaxmk2,1
1619,fatalityuudlk,1
1620,us70,1


In [162]:
oov_df.loc[2].token

'cafire'

In [163]:
oov_df.loc[155].token

'afflecki'

In [164]:
print(oov_df.head(500).values)

[['mh370' 94]
 ['prebreak' 41]
 ['typhoondevastated' 32]
 ['soudelor' 28]
 ['funtenna' 26]
 ['disea' 25]
 ['gbbo' 23]
 ['udhampur' 21]
 ['bayelsa' 21]
 ['marians' 20]
 ['enugu' 19]
 ['utc20150805' 17]
 ['sensorsenso' 17]
 ['gtgt' 16]
 ['selfimage' 16]
 ['spos' 15]
 ['time20150806' 14]
 ['mtvhottest' 13]
 ['abstorm' 13]
 ['sismo' 13]
 ['bestnaijamade' 12]
 ['mediterran' 12]
 ['hwo' 11]
 ['irandeal' 11]
 ['linkury' 11]
 ['trfc' 11]
 ['okwx' 10]
 ['beyhive' 10]
 ['o784' 10]
 ['meatloving' 10]
 ['yazidis' 10]
 ['wheavenly' 10]
 ['sinjar' 10]
 ['yearold' 10]
 ['summerfate' 10]
 ['inj' 9]
 ['tubestrike' 9]
 ['chicagoarea' 9]
 ['breakingnews' 9]
 ['runion' 9]
 ['animalrescue' 8]
 ['trapmusic' 8]
 ['icemoon' 8]
 ['igers' 8]
 ['olap' 8]
 ['mansehra' 7]
 ['twia' 7]
 ['waterresistant' 7]
 ['explosionproof' 7]
 ['30pm' 7]
 ['pantherattack' 7]
 ['bb17' 7]
 ['zouma' 7]
 ['wisenews' 7]
 ['kisii' 6]
 ['strategicpatience' 6]
 ['auspol' 6]
 ['yycstorm' 6]
 ['abbswinston' 6]
 ['gtgtgt' 6]
 ['kerricktrial

In [165]:
print("Fraction of tokens oov: ", 
      oov_df.number.sum() / (train['nlp'].apply(len).sum() + test['nlp'].apply(len).sum()))

Fraction of tokens oov:  0.04076309050291505


## To explore: use wordninja to cut up composite words/hashtags

In [166]:
import wordninja

In [167]:
wordninja.split('typhoondevastated')

['typhoon', 'devastated']

In [168]:
wordninja.split('mh370')

['mh', '370']

In [169]:
wordninja.split('prebreak')

['pre', 'break']

## Train a model

In [128]:
import numpy
def get_X(df):
    X = []
    X_ext = []
    xcols = ['has_location', 'geocoded','longitude_n','latitude_n','num_hash_n','num_mention_n','num_url_n']
    for index, row in df.iterrows():
        x = row['wordvec']
        #x = numpy.append(x, row['keyword_wordvec'])
        
        X.append(x)
        for xc in xcols:
            x = numpy.append(x, row[xc])        
        X_ext.append(x)
    return X, X_ext

In [129]:
X, X_ext = get_X(train)

In [130]:
y = train['target']

In [131]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svm = SVC(kernel="rbf")
params = {'C': [0.05, 0.1, 0.2, 0.5, 0.75, 1, 1.5, 2, 4]}
clf = GridSearchCV(svm, params, scoring="f1", verbose=1, n_jobs=-2, cv=5)

In [132]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X, y, cv=3, return_train_score=True, scoring='f1')
scores

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.6min finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.8min finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  2.0min finished


{'fit_time': array([102.75017309, 111.90411711, 129.97967076]),
 'score_time': array([3.09891152, 2.84578514, 3.09524584]),
 'test_score': array([0.73401397, 0.7159035 , 0.78023033]),
 'train_score': array([0.79595449, 0.81373044, 0.80443548])}

In [133]:
scores['test_score'].mean()

0.7433825976979325

In [134]:
def prepare_submission(model, X, y, X_test, name):
    fit = model.fit(X,y)
    pred = model.predict(X_test)
    submission = pd.DataFrame({"id":test['id'], "target":pred})
    submission.to_csv(name+'.csv', index=False)
    return fit

In [135]:
X_test, X_test_ext = get_X(test)
fit = prepare_submission(clf, X, y, X_test, 'avg_wordvec')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  4.4min finished


In [136]:
fit.best_params_

{'C': 1}

In [137]:
fit.cv_results_

{'mean_fit_time': array([16.2656002 , 16.06399727, 14.81328235, 13.62246323, 13.98820539,
        13.89963684, 13.16968436, 12.98290372, 13.06896777]),
 'std_fit_time': array([1.16269804, 0.50647231, 0.76079187, 1.23859031, 0.82218409,
        0.2663644 , 0.57257947, 0.71957938, 1.06073153]),
 'mean_score_time': array([3.66964693, 3.62519808, 3.01858544, 2.94939013, 2.89965215,
        2.81909885, 2.7464879 , 2.89028668, 2.63742886]),
 'std_score_time': array([0.52700126, 0.33103732, 0.29818703, 0.29587163, 0.23702686,
        0.22736502, 0.31430439, 0.32260753, 0.297495  ]),
 'param_C': masked_array(data=[0.05, 0.1, 0.2, 0.5, 0.75, 1, 1.5, 2, 4],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.05},
  {'C': 0.1},
  {'C': 0.2},
  {'C': 0.5},
  {'C': 0.75},
  {'C': 1},
  {'C': 1.5},
  {'C': 2},
  {'C': 4}],
 'split0_test_score': array([0.71715328, 0.72760181, 0.