# Doc2vec for tweets

## Loading data

In [1]:
import pandas as pd
# Load the pre-cleaned train/test tweet tables; both carry a `cleaned_text` column
# that the tokenizer below reads.
# NOTE(review): the `Unnamed: 0` column in the preview suggests the CSVs were written
# with their index included; `index_col=0` would drop it — confirm before changing.
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


## Tokenize

In [2]:
import gensim
def tokenize(df):
    """Return a Series with one token list per row of `df['cleaned_text']`.

    Every text is run through `gensim.utils.simple_preprocess`; as the preview
    of the result shows, tokens come back lower-cased and one-letter words
    such as "a" are dropped.
    """
    cleaned = df['cleaned_text']
    return cleaned.apply(gensim.utils.simple_preprocess)

# Add a `tokens` column to both frames; the Doc2Vec corpus is built from it later.
train['tokens'] = tokenize(train)
test['tokens'] = tokenize(test)

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokens
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,"[people, receive, wildfires, evacuation, order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[just, got, sent, this, photo, from, ruby, ala..."


Create one combined corpus from train and test, so that a single Doc2Vec model learns vectors for the tweets of both sets.

In [4]:
# Stack the id/token columns of train and test into one frame so a single Doc2Vec
# model sees every tweet. `DataFrame.append` was deprecated in pandas 1.4 and removed
# in 2.0; `pd.concat` with `ignore_index=True` produces the same frame.
corpus = pd.concat([train[['id', 'tokens']], test[['id', 'tokens']]], ignore_index=True)

In [5]:
# Number of distinct ids. It has to equal the row count checked in the next cell,
# because each id is used as the (unique) tag of a tweet in the Doc2Vec model.
len(corpus.id.unique())

10824

In [6]:
# Total number of tweets in the combined corpus (train rows followed by test rows).
len(corpus)

10824

In [7]:
# Wrap every tweet as a TaggedDocument whose single tag is the tweet id, so that the
# trained vector can later be fetched from the model with the id as key.
# NOTE(review): gensim treats integer tags as direct indices into its vector array,
# presumably allocating up to the largest id — confirm against the gensim docs.
corpus['doc'] = corpus.apply(lambda x:gensim.models.doc2vec.TaggedDocument(x['tokens'], [x['id']]), axis=1)

# Train a docvec model

In [8]:
# Doc2Vec configuration: 100-dimensional document vectors, words occurring fewer than
# 2 times are ignored, 40 passes over the corpus.
# NOTE(review): no worker count is set, so gensim's default multi-threaded training is
# used; results will presumably differ slightly between runs — confirm if exact
# reproducibility matters.
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=40)
# Scan the tagged tweets once to collect the vocabulary and document tags.
model.build_vocab(corpus['doc'])

In [9]:
# Train on the tagged tweets, reusing the document count and epoch count stored on
# the model so they stay consistent with the configuration above.
model.train(corpus['doc'], total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
# Look up the learned vector of every training tweet via its id tag and keep it in a
# new `docvec` column.
train['docvec'] = train['id'].apply(lambda x:model.docvecs[x])
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokens,docvec
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[our, deeds, are, the, reason, of, this, earth...","[-0.16416408, 0.2690208, 0.28227437, 0.0581573..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]","[0.016598629, 0.04688884, 0.0010997389, 0.1621..."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[all, residents, asked, to, shelter, in, place...","[-0.16598955, 0.21933176, 0.22691013, 0.321768..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,"[people, receive, wildfires, evacuation, order...","[-0.025512252, 0.13011368, -0.03433391, 0.2083..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[just, got, sent, this, photo, from, ruby, ala...","[0.38398886, -0.14622848, -0.0997502, -0.01990..."


In [11]:
# Same lookup for the test tweets: they were part of the combined corpus, so the model
# already holds a vector for each test id.
test['docvec'] = test['id'].apply(lambda x:model.docvecs[x])

In [12]:
test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text,tokens,docvec
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[just, happened, terrible, car, crash]","[0.035084017, 0.04976085, -0.054577623, 0.0054..."
1,2,,,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st...","[heard, about, earthquake, is, different, citi...","[-0.04257642, 0.13336797, 0.16396709, -0.08544..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...","[there, is, forest, fire, at, spot, pond, gees...","[-0.062548734, 0.3413463, 0.016899165, 0.11028..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires,"[apocalypse, lighting, spokane, wildfires]","[-0.0015694676, 0.1665464, 0.041762635, 0.1933..."
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills in China and Taiwan,"[typhoon, soudelor, kills, in, china, and, tai...","[0.12954275, -0.005062087, 0.072689354, -0.027..."


## Inspect the docvecs

In [13]:
def check(i, topn=5):
    """Print the test tweet at position `i` and the tweets with the closest vectors.

    Parameters
    ----------
    i : int
        Positional index into the notebook-level `test` frame.
    topn : int, default 5
        Number of nearest neighbours requested from the model.

    Notes
    -----
    Reads the notebook-level `model`, `train` and `test`. The matches are
    printed as two arrays (texts found in `train`, then texts found in `test`)
    in frame order, not in similarity order. The query tweet was itself part
    of the Doc2Vec corpus, so it usually appears among its own matches.
    """
    query = test.iloc[i]
    print("Sentence:")
    print(query.text)
    sims = model.docvecs.most_similar([query['docvec']], topn=topn)
    # Each hit is a (tag, similarity) pair; only the tag (the tweet id) is needed.
    neighbour_ids = [tag for tag, _ in sims]
    print("Most similar:")
    for frame in (train, test):
        print(frame[frame['id'].isin(neighbour_ids)].text.values)

In [14]:
check(1)

Sentence:
Heard about #earthquake is different cities, stay safe everyone.
Most similar:
['@DarrylB1979 yea heard about that..not coming out until 2017 and 2019 ?????? Vampiro is bleeding'
 'Kayla is about to electrocute herself.'
 "i'm really sad about red 7 closing :( yuppies n tourists ruin everything"]
['Heard about #earthquake is different cities, stay safe everyone.'
 '@5SOStag honestly he could say an apocalypse is coming and i would be exited hes so enthusiastic about everything']


In [15]:
check(8)

Sentence:
What a nice hat?
Most similar:
['[Comment] Deaths of older children: what do the data tell #US? http://t.co/p8Yr2po6Jn\n #nghlth'
 "What's missing in the #asae15 exhibitor emails? Value. http://t.co/r8cepRqxlE #assnchat"
 "@HomeworldGym @thisisperidot D: What? That's a tragedy. You have a wonderful nose"]
['What a nice hat?'
 'What scenes at Trent Bridge England could win the #Ashes today at this rate! #Pandemonium']


In [16]:
check(80)

Sentence:
@margaretcho Call me a fag and I'm going to call you an ambulance :) #RainbowPower
Most similar:
['Do me a favor and fall off a cliff']
["@margaretcho Call me a fag and I'm going to call you an ambulance :) #RainbowPower"
 "It's an apocalypse" '@Hazard_VN @AccuracyEsports add me'
 "'I lava you' ???? @kherr122"]


In [17]:
check(800)

Sentence:
@PahandaBear @Nethaera Yup EU crashed too :P
Most similar:
["&gt; Bin Laden family plane crashed after 'avoiding microlight and landing too far down runway... http://t.co/Tu9cgLmgVR #rochdale #heywood"
 '@olliebailey11 havnt you crashed ? ??'
 "#News Bin Laden family plane crashed after 'avoiding microlight and landing too far down runway' http://t.co/x9MDHocpda"
 '#TeamFollowBack Madhya Pradesh Train Derailment: Village Youth Saved Many Lives  #FollowBack']
['@PahandaBear @Nethaera Yup EU crashed too :P']


In [18]:
check(3000)

Sentence:
RT MMDA: ADVISORY: Stalled Bus at EDSA Service Road Cubao SB due to mechanical trouble as of 7:53 AM. 1 lane occupied. MMDA T/C on site. TÛ_
Most similar:
['Consent Order on cleanup underway at CSX derailment site - Knoxville News Sentinel http://t.co/xsZx9MWXYp http://t.co/NMFsgKf1Za'
 'KATUNews: #SR14 remains closed as brush fire burns 1700 acres: http://t.co/QposKp3MWj #LiveOnK2 http://t.co/mTQjsvupwy']
['Road Hazard @ E CONFEDERATE AVE SE / MORELAND AVE SE http://t.co/tym6tYmh4M'
 'RT MMDA: ADVISORY: Stalled Bus at EDSA Service Road Cubao SB due to mechanical trouble as of 7:53 AM. 1 lane occupied. MMDA T/C on site. T\x89Û_'
 'RT  ADVISORY: Stalled Bus at EDSA Service Road Cubao SB due to mechanical trouble as of 7:53 AM. 1 lane occupied.\x89Û_ https://t.co/HRNZKU66mm']


# Train a model using the docvecs

In [19]:
import numpy
def get_X(df, xcols=()):
    """Build feature lists from the `docvec` column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `docvec` column holding one vector per row.
    xcols : sequence of str, optional
        Names of extra columns whose values are appended to each document
        vector. Defaults to no extra columns.

    Returns
    -------
    (X, X_ext) : tuple of lists
        `X` holds the raw document vectors. `X_ext` holds, per row, the
        document vector followed by the values of `xcols`; with no extra
        columns it contains the same vectors as `X`.
    """
    xcols = list(xcols)
    X = list(df['docvec'])
    if not xcols:
        # Nothing to append: hand back a separate list over the same vectors.
        return X, list(X)

    X_ext = []
    for vec, (_, extras) in zip(X, df[xcols].iterrows()):
        # numpy.append returns a new array, so the vectors in X stay untouched.
        for value in extras:
            vec = numpy.append(vec, value)
        X_ext.append(vec)
    return X, X_ext

In [20]:
X, X_ext = get_X(train)

In [21]:
y = train['target']

In [28]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter = 1000)

In [29]:
from sklearn.model_selection import cross_validate
# 3-fold cross-validation of the logistic regression on the document vectors, scored
# by F1. Train scores are kept so the train/test gap can be inspected.
scores = cross_validate(clf, X, y, cv=3, return_train_score=True, scoring='f1')
scores

{'fit_time': array([0.1003108 , 0.07584453, 0.05727005]),
 'score_time': array([0.00460052, 0.00407934, 0.00388622]),
 'test_score': array([0.64270833, 0.63867187, 0.6146789 ]),
 'train_score': array([0.67016492, 0.68557089, 0.68225927])}

In [30]:
scores['test_score'].mean()

0.6320197024719674

In [31]:
def prepare_submission(model, X, y, X_test, name, ids=None):
    """Fit `model`, predict on `X_test` and write the predictions to `<name>.csv`.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`.
    X, y : training features and labels passed to `model.fit`.
    X_test : features to predict on.
    name : str
        Output path without the `.csv` extension.
    ids : sequence, optional
        Identifiers written to the `id` column, one per row of `X_test`.
        Defaults to `test['id']` from the notebook-level `test` frame, which
        is what the function always used before this parameter existed.

    The file has the two columns `id` and `target` and no index column.
    """
    if ids is None:
        # Backward-compatible default: ids of the notebook's global test frame.
        ids = test['id']
    model.fit(X, y)
    pred = model.predict(X_test)
    submission = pd.DataFrame({"id": ids, "target": pred})
    submission.to_csv(name + '.csv', index=False)

In [32]:
# Build the test features and write the logistic-regression predictions to
# 'simple_docvec.csv' (the model is refit on the full training set inside the helper).
X_test, X_test_ext = get_X(test)
prepare_submission(clf, X, y, X_test, 'simple_docvec')

## Try with an SVM
The logistic regression seems somewhat underfitted, so let's try a slightly more complex model.

In [34]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# RBF-kernel SVM whose regularisation strength C is chosen by a 5-fold grid search
# scored on F1.
svm = SVC(kernel="rbf")
params = {'C': [0.05, 0.1, 0.2, 0.5, 0.75, 1, 1.5, 2, 4]}
# NOTE: this rebinds `clf`, which held the logistic regression in the cells above.
# n_jobs=-2 presumably leaves one CPU core free (the log below reports 3 workers).
clf = GridSearchCV(svm, params, scoring="f1", verbose=1, n_jobs=-2, cv=5)

In [35]:
from sklearn.model_selection import cross_validate
# Nested cross-validation: each of the 3 outer folds reruns the whole inner grid search
# (9 candidates x 5 folds = 45 fits, as the log below shows), so the reported test
# scores are not biased by the choice of C.
scores = cross_validate(clf, X, y, cv=3, return_train_score=True, scoring='f1')
scores

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:   43.6s finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:   45.9s finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:   49.5s finished


{'fit_time': array([46.09391379, 48.54659986, 52.2460258 ]),
 'score_time': array([0.85731983, 1.0179913 , 1.05004954]),
 'test_score': array([0.65558442, 0.62817322, 0.64300626]),
 'train_score': array([0.83130773, 0.83518656, 0.77546883])}

In [36]:
scores['test_score'].mean()

0.6422546330467159

In [37]:
# `clf` is the grid-search wrapper here, so fitting inside the helper reruns the search
# over C on the full training set before predicting; output goes to 'svm_docvec.csv'.
X_test, X_test_ext = get_X(test)
prepare_submission(clf, X, y, X_test, 'svm_docvec')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:  1.9min finished


Still not so impressive... Perhaps not enough data? Use averaged wordvecs instead?