In [1]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

A word embedding is a mapping from words to vectors in which words with similar meanings have similar representations. Word2vec is a group of related models used to produce word embeddings. These models are shallow, two-layer neural networks trained to reconstruct the linguistic contexts of words. There are two Word2vec architectures: skip-gram and Continuous Bag of Words (CBOW).

Skip-gram feeds a one-hot vector, whose size equals the vocabulary size of the corpus, into a shallow two-layer neural network. The network has a hidden layer of n dimensions, where n is smaller than the vocabulary size. After the softmax, k output vectors are generated, each representing a probability distribution over the words of the vocabulary. Here k equals the window size. The hidden-layer activation for a word is used as the vector representation of that word.

CBOW swaps the input and output of the skip-gram model: the context words surrounding a target word are passed as input, their hidden-layer representations are averaged, and the network predicts the target word from that average.

Skip-gram and CBOW perform similarly overall, with skip-gram doing slightly better on rare words.

In [2]:
# Load the tweet table from disk; the first CSV column becomes the row index.
# NOTE(review): the name `csv` is also the name of a standard-library module.
csv = 'df.csv'
df = pd.read_csv(csv, index_col=0)
# Preview the first rows (columns shown in the output: text, target).
df.head()

  mask |= (ar1 == a)


Unnamed: 0,text,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [3]:
df.tail()

Unnamed: 0,text,target
1596036,just woke up having no school is the best feel...,4
1596037,thewdb com very cool to hear old walt interviews,4
1596038,are you ready for your mojo makeover ask me fo...,4
1596039,happy th birthday to my boo of alll time tupac...,4
1596040,happy charitytuesday,4


In [4]:
# Relabel target 4 as 1 so the labels are 0/1; the split report further down
# counts 0 as negative and 1 as positive. This modifies df in place.
df.loc[df.target == 4, 'target'] = 1

In [5]:
df.tail()

Unnamed: 0,text,target
1596036,just woke up having no school is the best feel...,1
1596037,thewdb com very cool to hear old walt interviews,1
1596038,are you ready for your mojo makeover ask me fo...,1
1596039,happy th birthday to my boo of alll time tupac...,1
1596040,happy charitytuesday,1


In [6]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `sklearn.model_selection` is its replacement. Fall back to the old
# module only on installations that predate the new one.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

x = df.text
y = df.target
SEED = 2000

# Hold out 2% of the rows, then cut that hold-out in half to get the validation
# and test sets (roughly 98% / 1% / 1%). Both splits use the same seed.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)


def _describe_split(name, labels):
    """Return a one-line summary of a split's size and class balance.

    Parameters
    ----------
    name : str
        Human-readable split name placed at the start of the line.
    labels : pandas.Series
        Target values of the split; 0 is counted as negative, 1 as positive.

    Returns
    -------
    str
        ``"<name> set has total <n> entries with <p>% negative, <q>% positive"``
        with both percentages formatted to two decimals.
    """
    total = len(labels)
    negative_pct = (labels == 0).sum() / float(total) * 100
    positive_pct = (labels == 1).sum() / float(total) * 100
    return "{0} set has total {1} entries with {2:.2f}% negative, {3:.2f}% positive".format(
        name, total, negative_pct, positive_pct)


# One report line per split, in the same order and wording as before.
for split_name, split_labels in (("Train", y_train), ("Validation", y_validation), ("Test", y_test)):
    print(_describe_split(split_name, split_labels))



Train set has total 1564120 entries with 50.02% negative, 49.98% positive
Validation set has total 15960 entries with 49.45% negative, 50.55% positive
Test set has total 15961 entries with 49.68% negative, 50.32% positive


In [7]:
df.head()

Unnamed: 0,text,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [8]:
# Recombine the three splits into one Series so that every tweet (train,
# validation and test) is part of the corpus handed to the Doc2Vec models.
all_x = pd.concat([x_train, x_validation, x_test])
all_x.tail()

1286274    signed up for stumbleupon because uses it if s...
50691                         aww can not join need to study
1271252                         why do you dance really good
634177     hope get to sleep in tomorrow working in my da...
413732     just biked home froze my ass off but got good ...
Name: text, dtype: object

In [11]:
from tqdm import tqdm # makes the loops show a smart progress meter
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec # open source vector space modelling and topic modelling toolkit in python 
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils

def labelize_tweets_ug(tweets, label):
    """Wrap each tweet as a tagged, whitespace-tokenised document for Doc2Vec.

    Parameters
    ----------
    tweets : pandas.Series
        Tweet texts; every value must be a string. The Series index supplies
        the per-document identifier.
    label : str
        Prefix for the document tags.

    Returns
    -------
    list
        One ``TaggedDocument(words, tags)`` per tweet, in input order, where
        ``words`` is ``tweet.split()`` and ``tags`` is ``['<label>_<index>']``.
    """
    # LabeledSentence is deprecated (gensim warns on every use and drops it in
    # 4.0); TaggedDocument is the class it aliases. Imported locally so this
    # function does not depend on which names the import cell provides.
    from gensim.models.doc2vec import TaggedDocument

    return [
        TaggedDocument(text.split(), ['%s_%s' % (label, idx)])
        for idx, text in zip(tweets.index, tweets)
    ]
  
# Tag every tweet in all_x with the 'all' prefix, giving tags of the form 'all_<index>'.
all_x_w2v = labelize_tweets_ug(all_x, 'all')

  if sys.path[0] == '':


In [12]:
# Distributed Bag Of Words (DBOW)
from sklearn.linear_model import LogisticRegression

cores = multiprocessing.cpu_count()
# alpha and min_alpha start out equal; the loop below lowers both together.
model_ug_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
# The list comprehension only routes the corpus through tqdm to show a progress bar.
model_ug_dbow.build_vocab([x for x in tqdm(all_x_w2v)])

# 30 single-epoch passes, each over a freshly shuffled copy of the corpus.
# After every pass alpha is lowered by 0.002 (from 0.065 down to roughly 0.005
# after the last decrement) and min_alpha is set to the same value.
# NOTE(review): utils.shuffle is called without random_state, so the document
# order presumably differs between runs -- confirm whether that is intended.
for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha
    
def get_vectors(model, corpus, size, prefix='all'):
    """Collect the trained document vector of every row in ``corpus``.

    Parameters
    ----------
    model : object
        Trained model whose ``docvecs`` attribute can be indexed by document tag.
    corpus : pandas.Series
        Documents to look up; only ``corpus.index`` is read.
    size : int
        Dimensionality of each document vector (number of output columns).
    prefix : str, optional
        Tag prefix that was used when the documents were labelled. Defaults to
        ``'all'``, the value that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(corpus), size)``; row ``k`` holds the
        vector stored under the tag ``'<prefix>_<corpus.index[k]>'``.
    """
    vecs = np.zeros((len(corpus), size))
    # enumerate supplies the output row, replacing the hand-maintained counter.
    for row, idx in enumerate(corpus.index):
        vecs[row] = model.docvecs[prefix + '_' + str(idx)]
    return vecs
  
# Look up the DBOW document vectors (100 columns) for the train and validation tweets.
train_vecs_dbow = get_vectors(model_ug_dbow, x_train, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, x_validation, 100)

# Fit a logistic regression on the train vectors; the value displayed by the
# cell is clf.score on the validation vectors.
clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_train)
clf.score(validation_vecs_dbow, y_validation)

100%|██████████| 1596041/1596041 [00:00<00:00, 2188582.63it/s]
100%|██████████| 1596041/1596041 [00:01<00:00, 1294546.74it/s]
100%|██████████| 1596041/1596041 [00:01<00:00, 1294820.67it/s]
100%|██████████| 1596041/1596041 [00:01<00:00, 1324005.13it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2195970.18it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2191790.71it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 1913173.78it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2194809.58it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2194382.22it/s]
100%|██████████| 1596041/1596041 [00:01<00:00, 1407547.06it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2206682.78it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2207229.20it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2211783.46it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2194303.82it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2193724.96it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2199621.

0.7407894736842106

In [13]:
# Distributed Memory Concatenation (DMC)

cores = multiprocessing.cpu_count()
# dm=1 with dm_concat=1 and window=2; alpha and min_alpha start out equal.
model_ug_dmc = Doc2Vec(dm=1, dm_concat=1, size=100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
# The list comprehension only routes the corpus through tqdm to show a progress bar.
model_ug_dmc.build_vocab([x for x in tqdm(all_x_w2v)])

# 30 single-epoch passes over a reshuffled corpus; alpha drops by 0.002 after
# each pass and min_alpha is kept equal to it.
for epoch in range(30):
    model_ug_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmc.alpha -= 0.002
    model_ug_dmc.min_alpha = model_ug_dmc.alpha
   
# Document vectors for the train/validation tweets, then a logistic-regression
# check; the displayed value is clf.score on the validation vectors.
train_vecs_dmc = get_vectors(model_ug_dmc, x_train, 100)
validation_vecs_dmc = get_vectors(model_ug_dmc, x_validation, 100)

clf = LogisticRegression()
clf.fit(train_vecs_dmc, y_train)
clf.score(validation_vecs_dmc, y_validation)

100%|██████████| 1596041/1596041 [00:00<00:00, 2113690.27it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2186402.46it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2193402.23it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2192162.50it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2185535.18it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2183696.54it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2193969.41it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2193750.12it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2214622.49it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2190714.09it/s]
100%|██████████| 1596041/1596041 [00:01<00:00, 1383265.98it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2191772.77it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2088941.08it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2185190.60it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2194223.26it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2190548.

0.668984962406015

In [14]:
# Distributed Memory Mean (DMM)

cores = multiprocessing.cpu_count()
# dm=1 with dm_concat=0 and window=2; alpha and min_alpha start out equal.
model_ug_dmm = Doc2Vec(dm=1, dm_concat=0, size=100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
# The list comprehension only routes the corpus through tqdm to show a progress bar.
model_ug_dmm.build_vocab([x for x in tqdm(all_x_w2v)])

# 30 single-epoch passes over a reshuffled corpus; alpha drops by 0.002 after
# each pass and min_alpha is kept equal to it.
for epoch in range(30):
    model_ug_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmm.alpha -= 0.002
    model_ug_dmm.min_alpha = model_ug_dmm.alpha
    
# Document vectors for the train/validation tweets, then a logistic-regression
# check; the displayed value is clf.score on the validation vectors.
train_vecs_dmm = get_vectors(model_ug_dmm, x_train, 100)
validation_vecs_dmm = get_vectors(model_ug_dmm, x_validation, 100)

clf = LogisticRegression()
clf.fit(train_vecs_dmm, y_train)
clf.score(validation_vecs_dmm, y_validation)

100%|██████████| 1596041/1596041 [00:00<00:00, 2145670.33it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2202716.11it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2201510.72it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2201828.60it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2190371.46it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2206384.59it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2214341.19it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2217681.86it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2218063.95it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2213027.94it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2216621.50it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2211573.02it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2219174.99it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2217457.07it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2206332.95it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2222379.

0.7335839598997493

In [15]:
# Distributed Bag Of Words (DBOW) + Distributed Memory Concatenation (DMC)

def get_concat_vectors(model1,model2, corpus, size, prefix='all'):
    """Build one feature row per document by joining its vectors from two models.

    Parameters
    ----------
    model1, model2 : object
        Trained models whose ``docvecs`` attribute can be indexed by document
        tag. Both must hold a vector for every document in ``corpus``.
    corpus : pandas.Series
        Documents to look up; only ``corpus.index`` is read.
    size : int
        Width of the combined row, i.e. the length of a ``model1`` vector plus
        the length of a ``model2`` vector.
    prefix : str, optional
        Tag prefix that was used when the documents were labelled. Defaults to
        ``'all'``, the value that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(corpus), size)``; row ``k`` is the
        ``model1`` vector followed by the ``model2`` vector for the tag
        ``'<prefix>_<corpus.index[k]>'``.
    """
    vecs = np.zeros((len(corpus), size))
    # enumerate supplies the output row, replacing the hand-maintained counter.
    for row, idx in enumerate(corpus.index):
        tag = prefix + '_' + str(idx)
        vecs[row] = np.append(model1.docvecs[tag], model2.docvecs[tag])
    return vecs

# Join the DBOW and DMC vectors of each tweet into one 200-column feature row.
train_vecs_dbow_dmc = get_concat_vectors(model_ug_dbow, model_ug_dmc, x_train, 200)
validation_vecs_dbow_dmc = get_concat_vectors(model_ug_dbow, model_ug_dmc, x_validation, 200)

# Logistic regression on the combined features; the displayed value is
# clf.score on the validation rows.
clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmc, y_train)
clf.score(validation_vecs_dbow_dmc, y_validation)

0.7472431077694236

In [16]:
# Distributed Bag Of Words (DBOW) + Distributed Memory Mean (DMM)

# Join the DBOW and DMM vectors of each tweet into one 200-column feature row.
train_vecs_dbow_dmm = get_concat_vectors(model_ug_dbow, model_ug_dmm, x_train, 200)
validation_vecs_dbow_dmm = get_concat_vectors(model_ug_dbow, model_ug_dmm, x_validation, 200)

# Logistic regression on the combined features; the displayed value is
# clf.score on the validation rows.
clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmm, y_train)
clf.score(validation_vecs_dbow_dmm, y_validation)

0.7548872180451128

In [17]:
# Phrase Modelling
from gensim.models.phrases import Phrases, Phraser

# Fit the phrase model on the whitespace-tokenised training tweets only;
# validation and test tweets are not passed to Phrases.
tokenized_train = [t.split() for t in x_train]
phrases = Phrases(tokenized_train)
# Phraser wraps the fitted Phrases model. Later cells index it with a token
# list to get the tokens back with detected pairs merged
# (e.g. 'northern_california').
bigram = Phraser(phrases)

In [18]:
tokenized_train[0]

['your', 'not', 'pregnant', 'oh', 'no', 'what', 'shame']

In [19]:
x_train.index

Int64Index([ 288048,  357753,  420123,  348643, 1195630,  424869,  675535,
             475529,  682421,  434584,
            ...
            1010148, 1070094, 1278135,  223204, 1521275,  689718,  856013,
             324814,  430920,  879542],
           dtype='int64', length=1564120)

In [20]:
all_x.head()

288048                    your not pregnant oh no what shame
357753                                 cleaning the bathroom
420123     feeling left out you never recommend anything ...
348643     home sick what the hell wonder if it ll mutate...
1195630        your tweet reminded me that game was the shit
Name: text, dtype: object

In [21]:
# x_train carries the shuffled integer index of df, so this selects the tweet
# whose index label is 100 rather than the row at position 100.
# NOTE(review): presumably this fails if label 100 lands in the validation or
# test split under a different seed -- confirm.
x_train[100]

'body of missing northern calif girl found police have found the remains of missing northern california girl'

In [22]:
# Run the same tweet's tokens through the bigram phraser to see which pairs get merged.
bigram[x_train[100].split()]

['body',
 'of',
 'missing',
 'northern',
 'calif',
 'girl',
 'found',
 'police',
 'have',
 'found',
 'the',
 'remains',
 'of',
 'missing',
 'northern_california',
 'girl']

In [23]:
def labelize_tweets_bg(tweets, label, phraser=None):
    """Wrap each tweet as a tagged document whose tokens went through a phraser.

    Parameters
    ----------
    tweets : pandas.Series
        Tweet texts; every value must be a string. The Series index supplies
        the per-document identifier.
    label : str
        Prefix for the document tags.
    phraser : object, optional
        Anything that can be indexed with a token list and returns the
        transformed token list. Defaults to the notebook-level ``bigram``
        object, which is what this function always used before.

    Returns
    -------
    list
        One ``TaggedDocument(words, tags)`` per tweet, in input order, where
        ``words`` is ``phraser[tweet.split()]`` and ``tags`` is
        ``['<label>_<index>']``.
    """
    # LabeledSentence is deprecated (gensim warns on every use and drops it in
    # 4.0); TaggedDocument is the class it aliases. Imported locally so this
    # function does not depend on which names the import cell provides.
    from gensim.models.doc2vec import TaggedDocument

    if phraser is None:
        # Resolved at call time, so the notebook-level `bigram` must exist by then.
        phraser = bigram

    return [
        TaggedDocument(phraser[text.split()], ['%s_%s' % (label, idx)])
        for idx, text in zip(tweets.index, tweets)
    ]
  
# Rebuild the combined train + validation + test Series and tag every tweet
# with the 'all' prefix after passing its tokens through the bigram phraser.
all_x = pd.concat([x_train, x_validation, x_test])
all_x_w2v_bg = labelize_tweets_bg(all_x, 'all')

  """


In [24]:
# DBOW with bigram

cores = multiprocessing.cpu_count()
# Same constructor arguments as the unigram DBOW model; only the corpus differs.
model_bg_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
# The list comprehension only routes the corpus through tqdm to show a progress bar.
model_bg_dbow.build_vocab([x for x in tqdm(all_x_w2v_bg)])

# 30 single-epoch passes over a reshuffled corpus; alpha drops by 0.002 after
# each pass and min_alpha is kept equal to it.
for epoch in range(30):
    model_bg_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dbow.alpha -= 0.002
    model_bg_dbow.min_alpha = model_bg_dbow.alpha
    
# Document vectors for the train/validation tweets, then a logistic-regression
# check; the displayed value is clf.score on the validation vectors.
train_vecs_dbow_bg = get_vectors(model_bg_dbow, x_train, 100)
validation_vecs_dbow_bg = get_vectors(model_bg_dbow, x_validation, 100)

clf = LogisticRegression()
clf.fit(train_vecs_dbow_bg, y_train)
clf.score(validation_vecs_dbow_bg, y_validation)

100%|██████████| 1596041/1596041 [00:00<00:00, 2199820.04it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2205556.61it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2210389.30it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2202511.02it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2206002.87it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2216660.40it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2209661.88it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2209222.89it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2216025.67it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 1985959.18it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2218683.67it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2215782.15it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2205834.95it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2217502.61it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2220043.41it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2221597.

0.7402255639097745

In [25]:
# DMC with bigram

cores = multiprocessing.cpu_count()
# Same constructor arguments as the unigram DMC model; only the corpus differs.
model_bg_dmc = Doc2Vec(dm=1, dm_concat=1, size=100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
# The list comprehension only routes the corpus through tqdm to show a progress bar.
model_bg_dmc.build_vocab([x for x in tqdm(all_x_w2v_bg)])

# 30 single-epoch passes over a reshuffled corpus; alpha drops by 0.002 after
# each pass and min_alpha is kept equal to it.
for epoch in range(30):
    model_bg_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dmc.alpha -= 0.002
    model_bg_dmc.min_alpha = model_bg_dmc.alpha
    
# Document vectors for the train/validation tweets, then a logistic-regression
# check; the displayed value is clf.score on the validation vectors.
train_vecs_dmc_bg = get_vectors(model_bg_dmc, x_train, 100)
validation_vecs_dmc_bg = get_vectors(model_bg_dmc, x_validation, 100)

clf = LogisticRegression()
clf.fit(train_vecs_dmc_bg, y_train)
clf.score(validation_vecs_dmc_bg, y_validation)

100%|██████████| 1596041/1596041 [00:00<00:00, 2153388.26it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2207608.43it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2196827.75it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2199979.81it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2211301.26it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2191323.64it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2195309.09it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2216562.05it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2215155.26it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2216526.08it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2211411.56it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2213968.43it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2214743.39it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2213404.77it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2214983.02it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2212287.

0.6622180451127819

In [27]:
# DMM with bigram

cores = multiprocessing.cpu_count()
# NOTE(review): this model is built with dm_mean=1 and window=4, whereas the
# unigram DMM model uses dm_concat=0 and window=2, so the two DMM scores are
# not a like-for-like comparison -- confirm whether the difference is intended.
model_bg_dmm = Doc2Vec(dm=1, dm_mean=1, size=100, window=4, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
# The list comprehension only routes the corpus through tqdm to show a progress bar.
model_bg_dmm.build_vocab([x for x in tqdm(all_x_w2v_bg)])

# 30 single-epoch passes over a reshuffled corpus; alpha drops by 0.002 after
# each pass and min_alpha is kept equal to it.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so
# the saved run did not produce a validation score for this model.
for epoch in range(30):
    model_bg_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dmm.alpha -= 0.002
    model_bg_dmm.min_alpha = model_bg_dmm.alpha
    
# Document vectors for the train/validation tweets, then a logistic-regression
# check; the displayed value is clf.score on the validation vectors.
train_vecs_dmm_bg = get_vectors(model_bg_dmm, x_train, 100)
validation_vecs_dmm_bg = get_vectors(model_bg_dmm, x_validation, 100)

clf = LogisticRegression()
clf.fit(train_vecs_dmm_bg, y_train)
clf.score(validation_vecs_dmm_bg, y_validation)

100%|██████████| 1596041/1596041 [00:00<00:00, 2209602.07it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2198186.08it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2210006.20it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2209213.41it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2205611.11it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2207947.01it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2205170.82it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2210834.60it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2212211.78it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2210758.67it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2218838.84it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2212501.31it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2214090.72it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2220308.49it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2222181.29it/s]
100%|██████████| 1596041/1596041 [00:00<00:00, 2216195.

KeyboardInterrupt: 

In [28]:
# Trigram

# Fit a second Phrases model on the bigram-transformed training tokens;
# presumably so that an already merged pair can be joined with a neighbouring
# token to form a three-word phrase.
tg_phrases = Phrases(bigram[tokenized_train])
trigram = Phraser(tg_phrases)