# Doc2vec for tweets

## Loading data

In [1]:
import pandas as pd
# Read the pre-cleaned train and test CSVs (train first, then test) and
# preview the first training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_cleaned.csv', 'test_cleaned.csv')
)
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


## Tokenize

In [5]:
import gensim
def tokenize(df):
    """Tokenize the ``cleaned_text`` column of ``df`` with gensim.

    Every text is passed through ``gensim.utils.simple_preprocess``. Missing
    values (for example a tweet whose cleaned text is empty and is therefore
    read back from CSV as NaN) are treated as empty strings instead of being
    handed to the preprocessor as floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cleaned_text`` column.

    Returns
    -------
    pandas.Series
        One token list per row, aligned with ``df``'s index.
    """
    # NaN/None are not text; normalise them before tokenising.
    texts = df['cleaned_text'].fillna('')
    return texts.apply(gensim.utils.simple_preprocess)

# Add a 'tokens' column (the result of tokenize) to both frames.
train['tokens'] = tokenize(train)
test['tokens'] = tokenize(test)

In [6]:
# Preview the training frame to check the new tokens column.
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord...","[people, receive, wildfires, evacuation, order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[just, got, sent, this, photo, from, ruby, ala..."


Create one combined corpus from train and test, so a single doc2vec model learns a vector for every tweet

In [14]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. ignore_index=True renumbers the rows so the
# train and test row labels do not collide.
corpus = pd.concat(
    [train[['id', 'tokens']], test[['id', 'tokens']]],
    ignore_index=True,
)

In [15]:
# Number of distinct ids in the combined corpus (missing values counted too).
corpus['id'].nunique(dropna=False)

10824

In [16]:
# Total number of rows; equal to the distinct-id count when no id repeats.
corpus.shape[0]

10824

In [18]:
# Wrap each row as a gensim TaggedDocument: the token list is the document's
# words and the tweet id is its single tag, which is what the later
# per-id vector lookups use as the key.
corpus['doc'] = corpus.apply(lambda x:gensim.models.doc2vec.TaggedDocument(x['tokens'], [x['id']]), axis=1)

# Train a doc2vec model

In [131]:
# Configure Doc2Vec with 150-dimensional vectors and 40 training epochs;
# min_count=2 is passed as the vocabulary frequency threshold.
model = gensim.models.doc2vec.Doc2Vec(vector_size=150, min_count=2, epochs=40)
# Build the vocabulary from the combined train+test tagged documents.
model.build_vocab(corpus['doc'])

In [132]:
# Train on the same documents passed to build_vocab, reusing the document
# count and epoch setting already stored on the model.
model.train(corpus['doc'], total_examples=model.corpus_count, epochs=model.epochs)

In [133]:
# Look up each training tweet's learned vector by its id tag.
# NOTE(review): gensim 4 renamed `docvecs` to `dv` — confirm the installed
# gensim version before switching the attribute name.
train['docvec'] = train['id'].apply(lambda x:model.docvecs[x])
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokens,docvec
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[our, deeds, are, the, reason, of, this, earth...","[0.042200312, 0.2232931, -0.106698625, -0.0192..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]","[0.03374243, 0.011420422, -0.039616883, 0.1343..."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[all, residents, asked, to, shelter, in, place...","[-0.027647117, 0.17371078, -0.28343678, -0.059..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord...","[people, receive, wildfires, evacuation, order...","[0.13867202, -0.09840676, 0.0020685194, 0.1192..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[just, got, sent, this, photo, from, ruby, ala...","[0.18006597, 0.16710344, 0.016685702, 0.152246..."


In [134]:
# Same lookup for the test tweets: their ids were tagged in the combined
# corpus, so the trained model holds a vector for each of them as well.
test['docvec'] = test['id'].apply(lambda x:model.docvecs[x])

In [135]:
# Preview the test frame to check the new docvec column.
test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text,tokens,docvec
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[just, happened, terrible, car, crash]","[-0.04802215, 0.026646828, -0.09073123, 0.0383..."
1,2,,,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st...","[heard, about, earthquake, is, different, citi...","[0.08368565, 0.16215791, -0.1111265, 0.1114999..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...","[there, is, forest, fire, at, spot, pond, gees...","[0.09617599, -0.021368692, 0.01661537, 0.16995..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires,"[apocalypse, lighting, spokane, wildfires]","[0.017286344, 0.027816098, -0.0823268, 0.05910..."
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills in China and Taiwan,"[typhoon, soudelor, kills, in, china, and, tai...","[0.027736768, -0.012589828, -0.07330252, -0.02..."


# Train a model using the docvecs

In [136]:
import numpy
def get_X(df, xcols=()):
    """Build feature lists from the ``docvec`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``docvec`` column and every column named in ``xcols``.
    xcols : str or sequence of str, optional
        Extra columns whose values are appended, in order, after each row's
        document vector to form the extended features. Defaults to no extra
        columns, which reproduces the previous behaviour.

    Returns
    -------
    X : list
        The ``docvec`` value of each row, in row order.
    X_ext : list
        Per row, the ``docvec`` followed by the ``xcols`` values (a flat numpy
        array). With no ``xcols`` it holds the same objects as ``X``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(xcols, str):
        xcols = [xcols]
    xcols = list(xcols)

    X = []
    X_ext = []
    for _, row in df.iterrows():
        features = row['docvec']
        X.append(features)
        for col in xcols:
            # numpy.append returns a new flattened array, so the vector stored
            # in X is never modified.
            features = numpy.append(features, row[col])
        X_ext.append(features)
    return X, X_ext

In [137]:
# Training features: X holds the raw document vectors; X_ext is returned too
# but is not used by the visible cells.
X, X_ext = get_X(train)

In [138]:
# Labels for the classifier, taken from the training frame's target column.
y = train['target']

In [139]:
from sklearn.ensemble import RandomForestClassifier
# random_state pins the forest's bootstrap/feature sampling so the
# cross-validation scores and the submission are repeatable between runs.
# n_jobs=-2 is kept as before (parallel fitting; see the scikit-learn docs
# for the meaning of negative values).
clf = RandomForestClassifier(n_jobs=-2, random_state=42)

In [142]:
from sklearn.model_selection import cross_validate
# 3-fold cross-validation scored with F1; return_train_score=True also reports
# the training-fold scores so they can be compared with the test-fold scores.
scores = cross_validate(clf, X, y, cv=3, return_train_score=True, scoring='f1')
scores

{'fit_time': array([2.01290011, 1.95815921, 1.93892479]),
 'score_time': array([0.04474235, 0.04596305, 0.04681039]),
 'test_score': array([0.65201072, 0.62023386, 0.66803489]),
 'train_score': array([1.        , 1.        , 0.99976846])}

In [143]:
# Average F1 over the cross-validation test folds.
scores['test_score'].mean()

0.6467598240718772

In [129]:
def prepare_submission(model, X, y, X_test, name, ids=None):
    """Fit ``model``, predict on ``X_test`` and write a submission CSV.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X, y
        Training features and labels passed to ``model.fit``.
    X_test
        Features passed to ``model.predict``.
    name : str
        Output path without extension; ``'.csv'`` is appended.
    ids : sequence, optional
        One id per row of ``X_test``. Defaults to the notebook-level
        ``test['id']`` so existing calls keep working; pass it explicitly
        when ``X_test`` was built from a different frame.

    Raises
    ------
    ValueError
        If the number of ids differs from the number of predictions.
    """
    model.fit(X, y)
    pred = model.predict(X_test)

    if ids is None:
        # Backward-compatible fallback to the global test frame.
        ids = test['id']
    if len(ids) != len(pred):
        raise ValueError(
            "got %d ids for %d predictions" % (len(ids), len(pred))
        )

    submission = pd.DataFrame({"id": ids, "target": pred})
    submission.to_csv(name + '.csv', index=False)

In [130]:
# Build the test features with the same helper, then refit the classifier on
# the full training set and write its predictions to simple_docvec.csv.
X_test, X_test_ext = get_X(test)
prepare_submission(clf, X, y, X_test, 'simple_docvec')