# Doc2vec for tweets

## Loading data

In [1]:
import pandas as pd
# Read the cleaned train and test tweet tables, then preview the first training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_cleaned.csv', 'test_cleaned.csv')
)
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


## Tokenize

In [8]:
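# For reference: scikit-learn's CountVectorizer tokenizes with the default
# token_pattern=r"(?u)\b\w\w+\b" (tokens of two or more word characters).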

In [9]:
import gensim
def tokenize(df):
    """Tokenize the 'cleaned_text' column of ``df``.

    Each text is passed through ``gensim.utils.simple_preprocess``; the result
    is a Series aligned with ``df`` whose values are lists of token strings.
    """
    cleaned = df['cleaned_text']
    return cleaned.apply(gensim.utils.simple_preprocess)

# Add a 'tokens' column to both frames using the same tokenizer.
for frame in (train, test):
    frame['tokens'] = tokenize(frame)

In [10]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord...","[people, receive, wildfires, evacuation, order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[just, got, sent, this, photo, from, ruby, ala..."


Create one combined corpus from the train and test tweets, so that a single Doc2Vec model learns a document vector for every tweet in both sets.

In [11]:
# Stack the id/tokens columns of train and test into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
corpus = pd.concat(
    [train[['id', 'tokens']], test[['id', 'tokens']]],
    ignore_index=True,
)

In [12]:
# Tweet ids are used as the Doc2Vec tags further down, so they must be unique
# across train and test: this count should equal len(corpus).
len(corpus.id.unique())

10824

In [13]:
len(corpus)

10824

In [14]:
def _tag_row(row):
    """Wrap one corpus row as a TaggedDocument whose only tag is the tweet id."""
    return gensim.models.doc2vec.TaggedDocument(row['tokens'], [row['id']])


# One TaggedDocument per tweet, stored alongside its id and tokens.
corpus['doc'] = corpus.apply(_tag_row, axis=1)

# Train a docvec model

In [84]:
# Doc2Vec hyperparameters: 100-dimensional document vectors, words seen fewer
# than 2 times are dropped from the vocabulary, 40 training epochs.
doc2vec_settings = {'vector_size': 100, 'min_count': 2, 'epochs': 40}
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_settings)

# Build the vocabulary from the tagged documents before training.
model.build_vocab(corpus['doc'])

In [85]:
# Train on every tagged tweet (train and test together), reusing the model's
# own corpus_count and epochs settings.
model.train(corpus['doc'], total_examples=model.corpus_count, epochs=model.epochs)

In [86]:
# Look up each training tweet's learned vector by its id, which is the tag it
# was given when the corpus was built.
train['docvec'] = train['id'].apply(lambda x:model.docvecs[x])
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokens,docvec
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[our, deeds, are, the, reason, of, this, earth...","[0.07160659, -0.09939749, -0.020211691, 0.2383..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]","[-0.066053666, 0.09137911, -0.12738536, -0.013..."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[all, residents, asked, to, shelter, in, place...","[-0.11283219, -0.11480036, 0.062498745, 0.4175..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfires evacuation ord...","[people, receive, wildfires, evacuation, order...","[-0.07823217, 0.17971702, 0.037797928, 0.05701..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[just, got, sent, this, photo, from, ruby, ala...","[-0.4549307, 0.06287247, -0.045966968, 0.02362..."


In [87]:
# Test tweets were part of the Doc2Vec corpus too, so their vectors are looked
# up by id in the same way as for train.
test['docvec'] = test['id'].apply(lambda x:model.docvecs[x])

In [88]:
test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text,tokens,docvec
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[just, happened, terrible, car, crash]","[0.02760788, -0.09535968, -0.23014063, 0.08919..."
1,2,,,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st...","[heard, about, earthquake, is, different, citi...","[0.110616155, 0.054517493, -0.08516176, 0.0229..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...","[there, is, forest, fire, at, spot, pond, gees...","[-0.14350007, -0.20678376, 0.020583905, 0.5115..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires,"[apocalypse, lighting, spokane, wildfires]","[-0.17284182, 0.11545763, 2.2395341e-05, 0.065..."
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills in China and Taiwan,"[typhoon, soudelor, kills, in, china, and, tai...","[0.15576042, -0.05989403, 0.10247851, 0.045230..."


## Inspect the docvecs

In [89]:
def check(i, topn=5):
    """Print one test tweet and the tweets whose document vectors are closest to it.

    Parameters
    ----------
    i : int
        Positional (``iloc``) index of the query tweet in ``test``.
    topn : int, optional
        Number of nearest document tags requested from the model. Defaults to
        5, the value that used to be hard-coded.

    Notes
    -----
    The query tweet was itself part of the Doc2Vec corpus, so it normally
    appears among its own neighbours. Matches are printed as two arrays
    (train texts, then test texts) in frame order, not in similarity order.
    """
    query = test.iloc[i]
    print("Sentence:")
    print(query['text'])

    # most_similar returns (tag, similarity) pairs; tags are tweet ids here.
    neighbours = model.docvecs.most_similar([query['docvec']], topn=topn)
    neighbour_ids = [tag for tag, _similarity in neighbours]

    print("Most similar:")
    for frame in (train, test):
        print(frame.loc[frame['id'].isin(neighbour_ids), 'text'].values)

In [90]:
check(1)

Sentence:
Heard about #earthquake is different cities, stay safe everyone.
Most similar:
['@DarrylB1979 yea heard about that..not coming out until 2017 and 2019 ?????? Vampiro is bleeding'
 "i'm really sad about red 7 closing :( yuppies n tourists ruin everything"
 "I've been trying to write a theological short story about a monster living in a sinkhole. Then I heard about Brooklyn. #accidentalprophecy"]
['Heard about #earthquake is different cities, stay safe everyone.'
 '@5SOStag honestly he could say an apocalypse is coming and i would be exited hes so enthusiastic about everything']


In [91]:
check(8)

Sentence:
What a nice hat?
Most similar:
['Wait What??? http://t.co/uAVFRtlfs4 http://t.co/85G1pCcCXG'
 "'Well guess what young girls. You aren't damsels in distress. You aren't hostages to the words of your peers.' http://t.co/5XRC0a76vD"
 "@HomeworldGym @thisisperidot D: What? That's a tragedy. You have a wonderful nose"]
['What a nice hat?'
 'What it was like to survive the atomic bombing of Hiroshima http://t.co/0cvXS2E1Er']


In [92]:
check(80)

Sentence:
@margaretcho Call me a fag and I'm going to call you an ambulance :) #RainbowPower
Most similar:
['I LAVA YOU.']
["@margaretcho Call me a fag and I'm going to call you an ambulance :) #RainbowPower"
 "'I lava you' ???? @kherr122" 'I lava you ??'
 '@TheJasonTaylorR *EMS tries to stablize me and put me on a stretcher*']


In [93]:
check(800)

Sentence:
@PahandaBear @Nethaera Yup EU crashed too :P
Most similar:
["#News Bin Laden family plane crashed after 'avoiding microlight and landing too far down runway' http://t.co/x9MDHocpda"
 '@SmusX16475 Skype just crashed u host' "I'm so traumatised."]
['@PahandaBear @Nethaera Yup EU crashed too :P'
 'Holy crap! BRAVO Sir! Amazing! Dramatic Video Shows Plane Landing During Violent Storm http://t.co/xB0bw8h8Ur']


In [95]:
check(3000)

Sentence:
RT MMDA: ADVISORY: Stalled Bus at EDSA Service Road Cubao SB due to mechanical trouble as of 7:53 AM. 1 lane occupied. MMDA T/C on site. TÛ_
Most similar:
['Consent Order on cleanup underway at CSX derailment site - Knoxville News Sentinel http://t.co/GieSoMgWTR http://t.co/NMFsgKf1Za'
 'Consent Order on cleanup underway at CSX derailment site - Knoxville News Sentinel http://t.co/xsZx9MWXYp http://t.co/NMFsgKf1Za']
['Road Hazard @ E CONFEDERATE AVE SE / MORELAND AVE SE http://t.co/tym6tYmh4M'
 'RT MMDA: ADVISORY: Stalled Bus at EDSA Service Road Cubao SB due to mechanical trouble as of 7:53 AM. 1 lane occupied. MMDA T/C on site. T\x89Û_'
 'RT  ADVISORY: Stalled Bus at EDSA Service Road Cubao SB due to mechanical trouble as of 7:53 AM. 1 lane occupied.\x89Û_ https://t.co/HRNZKU66mm']


# Train a model using the docvecs

In [96]:
import numpy
def get_X(df, xcols=()):
    """Collect per-row feature vectors from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'docvec' column, plus every column named in ``xcols``.
    xcols : sequence of str, optional
        Extra columns whose values are appended, in order, to each row's
        docvec. Defaults to no extra columns, which reproduces the previous
        behaviour (the list of extra columns used to be a hard-coded empty
        local, so the extended features could never differ from the plain ones).

    Returns
    -------
    X : list
        The 'docvec' value of every row, in row order.
    X_ext : list
        The same vectors with the ``xcols`` values appended. With no extra
        columns it holds the very same objects as ``X``.
    """
    X = list(df['docvec'])
    X_ext = list(X)
    # Append one extra column at a time so the column order in xcols is preserved.
    for col in xcols:
        X_ext = [numpy.append(vec, value) for vec, value in zip(X_ext, df[col])]
    return X, X_ext

In [97]:
X, X_ext = get_X(train)

In [98]:
y = train['target']

In [99]:
from sklearn.linear_model import LogisticRegression
# NOTE: when the notebook is run top to bottom, this estimator is replaced by
# the RandomForestClassifier that the next cell assigns to the same name `clf`.
clf = LogisticRegression(max_iter = 1000)

In [100]:
from sklearn.ensemble import RandomForestClassifier
# random_state pins the forest's own randomness so scores are repeatable for a
# given set of document vectors; n_jobs=-2 uses all CPU cores but one.
clf = RandomForestClassifier(n_estimators=500, n_jobs=-2, random_state=42)

In [101]:
from sklearn.model_selection import cross_validate
# 3-fold cross-validation scored by F1. Train scores are returned as well so
# the gap between train and held-out performance is visible.
scores = cross_validate(clf, X, y, cv=3, return_train_score=True, scoring='f1')
scores

{'fit_time': array([8.4964397 , 7.73715162, 7.6328516 ]),
 'score_time': array([0.18169546, 0.1630733 , 0.16956115]),
 'test_score': array([0.65671642, 0.62714509, 0.66599395]),
 'train_score': array([1., 1., 1.])}

In [102]:
scores['test_score'].mean()

0.6499518164078221

In [103]:
def prepare_submission(model, X, y, X_test, name, ids=None):
    """Fit ``model`` on the training data and write its test predictions to a CSV.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X_test)``.
    X, y : training features and labels passed to ``model.fit``.
    X_test : features passed to ``model.predict``.
    name : str
        Output path without extension; the file written is ``name + '.csv'``.
    ids : sequence, optional
        Identifiers for the rows of ``X_test``. Defaults to the notebook-level
        ``test['id']`` column, which is what the function always used before;
        passing it explicitly removes the dependency on that global.

    The CSV has two columns, ``id`` and ``target``, and no index column.
    """
    if ids is None:
        # Backward-compatible default: fall back to the global test frame.
        ids = test['id']
    model.fit(X, y)
    pred = model.predict(X_test)
    submission = pd.DataFrame({"id": ids, "target": pred})
    submission.to_csv(name + '.csv', index=False)

In [104]:
# Build the test feature vectors, then refit clf on the training data and
# write its test predictions to simple_docvec.csv.
X_test, X_test_ext = get_X(test)
prepare_submission(clf, X, y, X_test, 'simple_docvec')

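Not so impressive: the mean cross-validated F1 is only about 0.65 on the held-out folds, against 1.0 on the training folds, so the random forest is overfitting the document vectors. Perhaps there is not enough data to train Doc2Vec well? A possible next step is to use averaged word vectors instead.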