# Average wordvecs for tweets

## Loading data

In [1]:
import pandas as pd
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


In [2]:
train_geocodes = pd.read_csv('train_geocodes.csv')
test_geocodes = pd.read_csv('test_geocodes.csv')

In [3]:
train = train.merge(train_geocodes, on=['id'])

In [4]:
test = test.merge(test_geocodes, on=['id'])

In [5]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,has_location,geocoded,longitude,latitude
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,False,False,,
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,False,False,,
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,False,False,,
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,False,False,,
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,False,False,,


In [6]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)
test['longitude_n'] = test['longitude']/180
test['latitude_n'] = test['latitude']/180
train['longitude_n'] = train['longitude']/180
train['latitude_n'] = train['latitude']/180
test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text,has_location,geocoded,longitude,latitude,longitude_n,latitude_n
0,0,0,0,Just happened a terrible car crash,Just happened a terrible car crash,False,False,0.0,0.0,0.0,0.0
1,2,0,0,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st...",False,False,0.0,0.0,0.0,0.0
2,3,0,0,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...",False,False,0.0,0.0,0.0,0.0
3,9,0,0,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires,False,False,0.0,0.0,0.0,0.0
4,11,0,0,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills in China and Taiwan,False,False,0.0,0.0,0.0,0.0


In [7]:
from preprocessor.defines import Patterns
def fe_pattern(df, pattern, name):
    df[name] = df['text'].str.lower().apply(lambda x: pattern.findall(x))
    df['num_' + name] = df[name].apply(lambda x: len(x))

In [61]:
for df in [train, test]:
    fe_pattern(df, Patterns.HASHTAG_PATTERN,'hash')
    fe_pattern(df, Patterns.MENTION_PATTERN,'mention')
    fe_pattern(df, Patterns.URL_PATTERN,'url')
    df['num_hash_n'] = df['num_hash']/train['num_hash'].max()
    df['num_mention_n'] = df['num_mention']/train['num_mention'].max()
    df['num_url_n'] = df['num_url']/train['num_url'].max()

In [62]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,has_location,geocoded,longitude,latitude,...,num_hash,mention,num_mention,url,num_url,num_hash_n,num_mention_n,num_url_n,nlp,wordvec
0,1,0,0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,False,False,0.0,0.0,...,1,[],0,[],0,0.076923,0.0,0.0,"(Our, Deeds, are, the, Reason, of, this, earth...","[-0.26623327, 0.05843069, -0.1404636, -0.05265..."
1,4,0,0,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,False,False,0.0,0.0,...,0,[],0,[],0,0.0,0.0,0.0,"(Forest, fire, near, La, Ronge, Sask, Canada)","[-0.025449565, 0.031005142, -0.15566371, -0.23..."
2,5,0,0,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,False,False,0.0,0.0,...,0,[],0,[],0,0.0,0.0,0.0,"(All, residents, asked, to, shelter, in, place...","[0.0059339865, 0.016337818, -0.105279535, -0.0..."
3,6,0,0,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,False,False,0.0,0.0,...,1,[],0,[],0,0.076923,0.0,0.0,"(people, receive, wildfires, evacuation, order...","[-0.18147185, 0.20731743, 0.014147284, -0.2182..."
4,7,0,0,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,False,False,0.0,0.0,...,2,[],0,[],0,0.153846,0.0,0.0,"(Just, got, sent, this, photo, from, Ruby, Ala...","[-0.06394094, -0.01423019, 0.0063574947, 0.071..."


## Add wordvecs

In [63]:
import spacy
# Load a larger model with vectors
nlp = spacy.load("en_core_web_lg")

Remove punctuation

In [64]:
import string
table = str.maketrans('', '', string.punctuation)
train['cleaned_text'] = train['cleaned_text'].str.translate(table)
test['cleaned_text'] = test['cleaned_text'].str.translate(table)

Get wordvecs

In [65]:
train['nlp'] = train['cleaned_text'].apply(lambda s: nlp(s))
train['wordvec'] = train['nlp'].apply(lambda s: s.vector)

In [66]:
test['nlp'] = test['cleaned_text'].apply(lambda s: nlp(s))
test['wordvec'] = test['nlp'].apply(lambda s: s.vector)

In [67]:
def check(df, i):
    print(train['text'].iloc[i])
    print(train['cleaned_text'].iloc[i])
    for token in train['nlp'].iloc[i]:
        print(token, token.has_vector)

In [68]:
check(train, 1)

Forest fire near La Ronge Sask. Canada
Forest fire near La Ronge Sask Canada
Forest True
fire True
near True
La True
Ronge False
Sask True
Canada True


In [69]:
check(train, 99)

.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad
 Bahrain police had previously died in a road accident they were not killed by explosion
  False
Bahrain True
police True
had True
previously True
died True
in True
a True
road True
accident True
they True
were True
not True
killed True
by True
explosion True


In [70]:
check(train, 888)

@TradCatKnight (1) Russia may have played into reason but that link is BS.  Okanowa was bloody and mainline invasion looked like a bloody
1 Russia may have played into reason but that link is BS Okanowa was bloody and mainline invasion looked like a bloody
1 True
Russia True
may True
have True
played True
into True
reason True
but True
that True
link True
is True
BS True
Okanowa False
was True
bloody True
and True
mainline True
invasion True
looked True
like True
a True
bloody True


## Train a model

In [71]:
import numpy
def get_X(df):
    X = []
    X_ext = []
    xcols = ['has_location', 'geocoded','longitude_n','latitude_n','num_hash_n','num_mention_n','num_url_n']
    for index, row in df.iterrows():
        x = row['wordvec']
        X.append(x)
        for xc in xcols:
            x = numpy.append(x, row[xc])        
        X_ext.append(x)
    return X, X_ext

In [72]:
X, X_ext = get_X(train)

In [73]:
y = train['target']

In [74]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svm = SVC(kernel="rbf")
params = {'C': [0.1, 0.5, 0.75, 1, 1.5, 2, 4, 10]}
clf = GridSearchCV(svm, params, scoring="f1", verbose=1, n_jobs=-2, cv=5)

In [75]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X_ext, y, cv=3, return_train_score=True, scoring='f1')
scores

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:  1.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:  1.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed:  1.7min finished


{'fit_time': array([ 84.45139384,  86.04717088, 106.56588697]),
 'score_time': array([2.44626093, 2.3913517 , 3.15144372]),
 'test_score': array([0.73225806, 0.71194844, 0.76213592]),
 'train_score': array([0.79707219, 0.8106613 , 0.79604763])}

In [76]:
scores['test_score'].mean()

0.7354474750402972

In [77]:
def prepare_submission(model, X, y, X_test, name):
    fit = model.fit(X,y)
    pred = model.predict(X_test)
    submission = pd.DataFrame({"id":test['id'], "target":pred})
    submission.to_csv(name+'.csv', index=False)
    return fit

In [None]:
X_test, X_test_ext = get_X(test)
fit = prepare_submission(clf, X_ext, y, X_test_ext, 'avg_wordvec')

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.


In [None]:
fit.best_params_

In [None]:
fit.cv_results_