# Average wordvecs for tweets

## Loading data

In [80]:
import pandas as pd
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


In [81]:
train_geocodes = pd.read_csv('train_geocodes.csv')
test_geocodes = pd.read_csv('test_geocodes.csv')

In [82]:
train = train.merge(train_geocodes, on=['id'])

In [83]:
test = test.merge(test_geocodes, on=['id'])

In [84]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,has_location,geocoded,longitude,latitude
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,False,False,,
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,False,False,,
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,False,False,,
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord...",False,False,,
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,False,False,,


In [102]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)
test['longitude_n'] = test['longitude']/180
test['latitude_n'] = test['latitude']/180
train['longitude_n'] = train['longitude']/180
train['latitude_n'] = train['latitude']/180
test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text,has_location,geocoded,longitude,latitude,longitude_n,latitude_n
0,0,0,0,Just happened a terrible car crash,Just happened a terrible car crash,False,False,0.0,0.0,0.0,0.0
1,2,0,0,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st...",False,False,0.0,0.0,0.0,0.0
2,3,0,0,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...",False,False,0.0,0.0,0.0,0.0
3,9,0,0,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires,False,False,0.0,0.0,0.0,0.0
4,11,0,0,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills in China and Taiwan,False,False,0.0,0.0,0.0,0.0


In [115]:
from preprocessor.defines import Patterns
def fe_pattern(df, pattern, name):
    df[name] = df['text'].str.lower().apply(lambda x: pattern.findall(x))
    df['num_' + name] = df[name].apply(lambda x: len(x))

In [128]:
for df in [train, test]:
    fe_pattern(df, Patterns.HASHTAG_PATTERN,'hash')
    fe_pattern(df, Patterns.MENTION_PATTERN,'mention')
    fe_pattern(df, Patterns.URL_PATTERN,'url')
    df['num_hash_n'] = df['num_hash']/df['num_hash'].max()
    df['num_mention_n'] = df['num_mention']/df['num_mention'].max()
    df['num_url_n'] = df['num_url']/df['num_url'].max()

In [129]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,has_location,geocoded,longitude,latitude,...,latitude_n,hash,num_hash,mention,num_mention,url,num_url,num_hash_n,num_mention_n,num_url_n
0,1,0,0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,False,False,0.0,0.0,...,0.0,[#earthquake],1,[],0,[],0,0.076923,0.0,0.0
1,4,0,0,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,False,False,0.0,0.0,...,0.0,[],0,[],0,[],0,0.0,0.0,0.0
2,5,0,0,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,False,False,0.0,0.0,...,0.0,[],0,[],0,[],0,0.0,0.0,0.0
3,6,0,0,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord...",False,False,0.0,0.0,...,0.0,[#wildfires],1,[],0,[],0,0.076923,0.0,0.0
4,7,0,0,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,False,False,0.0,0.0,...,0.0,"[#alaska, #wildfires]",2,[],0,[],0,0.153846,0.0,0.0


## Add wordvecs

In [103]:
import spacy
# Load a larger model with vectors
nlp = spacy.load("en_core_web_lg")

In [104]:
train['wordvec'] = train['cleaned_text'].apply(lambda s: nlp(s).vector)

In [105]:
test['wordvec'] = test['cleaned_text'].apply(lambda s: nlp(s).vector)

## Train a model

In [130]:
import numpy
def get_X(df):
    X = []
    X_ext = []
    xcols = ['has_location', 'geocoded','longitude_n','latitude_n','num_hash_n','num_mention_n','num_url_n']
    for index, row in df.iterrows():
        x = row['wordvec']
        X.append(x)
        for xc in xcols:
            x = numpy.append(x, row[xc])        
        X_ext.append(x)
    return X, X_ext

In [131]:
X, X_ext = get_X(train)

In [132]:
y = train['target']

In [133]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter = 1000)

In [134]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X_ext, y, cv=3, return_train_score=True, scoring='f1')
scores

{'fit_time': array([0.22032309, 0.22159505, 0.27394748]),
 'score_time': array([0.00542021, 0.00490856, 0.00468802]),
 'test_score': array([0.72305312, 0.71741199, 0.76186003]),
 'train_score': array([0.79354996, 0.79444038, 0.78232971])}

In [135]:
scores['test_score'].mean()

0.7341083789765271

In [136]:
def prepare_submission(model, X, y, X_test, name):
    model.fit(X,y)
    pred = model.predict(X_test)
    submission = pd.DataFrame({"id":test['id'], "target":pred})
    submission.to_csv(name+'.csv', index=False)

In [137]:
X_test, X_test_ext = get_X(test)
prepare_submission(clf, X_ext, y, X_test_ext, 'avg_wordvec')