In [4]:
import os
import sys
import gensim
import pandas as pd
import numpy as np

In [5]:
train_path = 'dataset/drop_dup.tsv'
test_path = 'dataset/test.tsv'

data = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path, sep='\t', index_col=0)
data.head()

Unnamed: 0_level_0,SentenceId,Phrase,Sentiment,word_count,new_phrase,new_word_count
PhraseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,a series of escapades demonstrating the adage ...,1,37,series escapades demonstrating adage good goos...,15
2,1,a series of escapades demonstrating the adage ...,2,14,series escapades demonstrating adage good goose,6
5,1,series,2,1,series,1
8,1,escapades demonstrating the adage that what is...,2,11,escapades demonstrating adage good goose,5
9,1,escapades,2,1,escapades,1


# one-hot-encode output

In [3]:
from keras.utils.np_utils import to_categorical

labels = to_categorical(data.Sentiment, num_classes=5)
labels

Using TensorFlow backend.


array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [4]:
from sklearn.model_selection import train_test_split
SEED = 3000


X_train, X_test, y_train, y_test = train_test_split(
    data.Phrase, 
    labels, 
    test_size = 0.2, 
    random_state=SEED
                                                   )

print("X_train:",np.shape(X_train))
print("y_train:",np.shape(y_train))
print("X_test:",np.shape(X_train))
print("y_test:",np.shape(y_train))

X_train: (77135,)
y_train: (77135, 5)
X_test: (77135,)
y_test: (77135, 5)


### Labelize data

In [5]:
from gensim.models.doc2vec import LabeledSentence

def labelize_text(text,label):
    result = []
    prefix = label
    for i, t in zip(text.index, text):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result
  
all_x = pd.concat([X_train,X_test])

X_train_pd = X_train
X_test_pd = X_test


all_x_w2v = labelize_text(all_x, 'ALL')
X_train = labelize_text(X_train, 'TRAIN')
X_test = labelize_text(X_test, 'TEST')

  import sys


In [6]:
all_x_w2v

[LabeledSentence(words=['shapely', 'than', 'the', 'two-hour', 'version', 'released', 'here', 'in', '1990'], tags=['ALL_8945']),
 LabeledSentence(words=['a', 'screenplay', 'more', 'ingeniously', 'constructed', 'than', '``', 'memento', "''"], tags=['ALL_134034']),
 LabeledSentence(words=['the', 'sights', 'and', 'sounds', 'of', 'the', 'wondrous', 'beats'], tags=['ALL_47469']),
 LabeledSentence(words=['greengrass', 'had', 'gone', 'a', 'tad', 'less', 'for', 'grit', 'and', 'a', 'lot', 'more', 'for', 'intelligibility'], tags=['ALL_74745']),
 LabeledSentence(words=['inspired'], tags=['ALL_473']),
 LabeledSentence(words=['love', 'him'], tags=['ALL_88602']),
 LabeledSentence(words=['the', 'more', 'daring', 'and', 'surprising', 'american', 'movies', 'of', 'the', 'year'], tags=['ALL_18196']),
 LabeledSentence(words=['of', 'solid', 'performances'], tags=['ALL_142991']),
 LabeledSentence(words=['recipe'], tags=['ALL_28016']),
 LabeledSentence(words=['mount'], tags=['ALL_71987']),
 LabeledSentence(wo

In [7]:
# create word_vector

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in all_x_w2v])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

def build_Word_Vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

# DBOW - Distributed Bag-of-Words

In [8]:
from gensim.models import Doc2Vec
import multiprocessing
from tqdm import tqdm
from sklearn import utils
from sklearn.preprocessing import scale
from keras.models import Sequential
from keras.layers import Dense

In [9]:
cores = multiprocessing.cpu_count()
model_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_x_w2v)])
model_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)


100%|██████████| 96419/96419 [00:00<00:00, 2587929.69it/s]
100%|██████████| 96419/96419 [00:00<00:00, 2563649.60it/s]
W0811 10:30:17.384974 140399816836864 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


In [10]:
def build_doc_Vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_dbow[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

train_vecs_dbow = np.concatenate([build_doc_Vector(z, 100) for z in tqdm(map(lambda x: x.words, X_train))])
train_vecs_dbow = scale(train_vecs_dbow)
val_vecs_dbow = np.concatenate([build_doc_Vector(z, 100) for z in tqdm(map(lambda x: x.words, X_test))])
val_vecs_dbow = scale(val_vecs_dbow)

modelDoc2Vec_dbow = Sequential()
modelDoc2Vec_dbow.add(Dense(64, activation='relu', input_dim=100))
modelDoc2Vec_dbow.add(Dense(64, activation='relu'))
modelDoc2Vec_dbow.add(Dense(5, activation='sigmoid'))
modelDoc2Vec_dbow.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

modelDoc2Vec_dbow.fit(train_vecs_dbow, y_train, epochs=20, batch_size=32, verbose=2)
score = modelDoc2Vec_dbow.evaluate(val_vecs_dbow, y_test, batch_size=128, verbose=2)

print(score[1])

77135it [00:04, 16663.52it/s]
19284it [00:01, 16171.95it/s]
W0811 10:30:23.725247 140399816836864 deprecation_wrapper.py:119] From /home/gian_stk/anaconda3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0811 10:30:23.744752 140399816836864 deprecation_wrapper.py:119] From /home/gian_stk/anaconda3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0811 10:30:23.748630 140399816836864 deprecation_wrapper.py:119] From /home/gian_stk/anaconda3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0811 10:30:23.792340 140399816836864 deprecation_wrapper.py:119] From /home/gian_stk/anaconda3/lib/python3.5/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is de

Epoch 1/20
 - 4s - loss: 0.4022 - acc: 0.8278
Epoch 2/20
 - 4s - loss: 0.3809 - acc: 0.8380
Epoch 3/20
 - 4s - loss: 0.3719 - acc: 0.8411
Epoch 4/20
 - 4s - loss: 0.3654 - acc: 0.8434
Epoch 5/20
 - 4s - loss: 0.3605 - acc: 0.8454
Epoch 6/20
 - 4s - loss: 0.3566 - acc: 0.8465
Epoch 7/20
 - 4s - loss: 0.3531 - acc: 0.8479
Epoch 8/20
 - 4s - loss: 0.3505 - acc: 0.8489
Epoch 9/20
 - 4s - loss: 0.3481 - acc: 0.8498
Epoch 10/20
 - 4s - loss: 0.3460 - acc: 0.8506
Epoch 11/20
 - 4s - loss: 0.3438 - acc: 0.8514
Epoch 12/20
 - 4s - loss: 0.3422 - acc: 0.8520
Epoch 13/20
 - 4s - loss: 0.3407 - acc: 0.8529
Epoch 14/20
 - 4s - loss: 0.3390 - acc: 0.8531
Epoch 15/20
 - 4s - loss: 0.3377 - acc: 0.8538
Epoch 16/20
 - 4s - loss: 0.3367 - acc: 0.8545
Epoch 17/20
 - 4s - loss: 0.3356 - acc: 0.8547
Epoch 18/20
 - 5s - loss: 0.3344 - acc: 0.8546
Epoch 19/20
 - 4s - loss: 0.3338 - acc: 0.8555
Epoch 20/20
 - 4s - loss: 0.3328 - acc: 0.8559
0.8386331343151331


# DMC - Distributed Memory Concatenation

In [11]:
cores = multiprocessing.cpu_count()
model_dmc = Doc2Vec(dm=1, dm_concat=1, size=100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_dmc.build_vocab([x for x in tqdm(all_x_w2v)])
model_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)

def build_doc_Vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_dmc[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

  
train_vecs_dmc = np.concatenate([build_doc_Vector(z, 100) for z in tqdm(map(lambda x: x.words, X_train))])
train_vecs_dmc = scale(train_vecs_dmc)


val_vecs_dmc = np.concatenate([build_doc_Vector(z, 100) for z in tqdm(map(lambda x: x.words, X_test))])
val_vecs_dmc = scale(val_vecs_dmc)

modelDoc2Vec_dmc = Sequential()
modelDoc2Vec_dmc.add(Dense(64, activation='relu', input_dim=100))
modelDoc2Vec_dmc.add(Dense(64, activation='relu'))
modelDoc2Vec_dmc.add(Dense(5, activation='sigmoid'))
modelDoc2Vec_dmc.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
modelDoc2Vec_dmc.fit(train_vecs_dmc, y_train, epochs=20, batch_size=32, verbose=2)
score = modelDoc2Vec_dmc.evaluate(val_vecs_dmc, y_test, batch_size=128, verbose=2)

print(score[1])

100%|██████████| 96419/96419 [00:00<00:00, 2674797.10it/s]
100%|██████████| 96419/96419 [00:00<00:00, 2763254.58it/s]
W0811 10:31:54.936285 140399816836864 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
77135it [00:04, 16526.51it/s]
19284it [00:01, 17013.82it/s]


Epoch 1/20
 - 4s - loss: 0.3794 - acc: 0.8393
Epoch 2/20
 - 4s - loss: 0.3680 - acc: 0.8442
Epoch 3/20
 - 4s - loss: 0.3638 - acc: 0.8457
Epoch 4/20
 - 4s - loss: 0.3607 - acc: 0.8461
Epoch 5/20
 - 4s - loss: 0.3584 - acc: 0.8470
Epoch 6/20
 - 4s - loss: 0.3563 - acc: 0.8473
Epoch 7/20
 - 4s - loss: 0.3544 - acc: 0.8480
Epoch 8/20
 - 4s - loss: 0.3526 - acc: 0.8485
Epoch 9/20
 - 4s - loss: 0.3515 - acc: 0.8490
Epoch 10/20
 - 4s - loss: 0.3502 - acc: 0.8496
Epoch 11/20
 - 4s - loss: 0.3490 - acc: 0.8497
Epoch 12/20
 - 4s - loss: 0.3480 - acc: 0.8502
Epoch 13/20
 - 5s - loss: 0.3471 - acc: 0.8501
Epoch 14/20
 - 4s - loss: 0.3461 - acc: 0.8509
Epoch 15/20
 - 4s - loss: 0.3454 - acc: 0.8508
Epoch 16/20
 - 4s - loss: 0.3445 - acc: 0.8514
Epoch 17/20
 - 4s - loss: 0.3438 - acc: 0.8516
Epoch 18/20
 - 4s - loss: 0.3431 - acc: 0.8517
Epoch 19/20
 - 4s - loss: 0.3423 - acc: 0.8521
Epoch 20/20
 - 4s - loss: 0.3417 - acc: 0.8523
0.8453329822701108


# DMM - Distributed Memory Mean

In [12]:
cores = multiprocessing.cpu_count()
model_dmm = Doc2Vec(dm=1, dm_mean=1, size=100, window=4, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(all_x_w2v)])
model_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)

def build_doc_Vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_dmm[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

  
train_vecs_dmm = np.concatenate([build_doc_Vector(z, 100) for z in tqdm(map(lambda x: x.words, X_train))])
train_vecs_dmm = scale(train_vecs_dmm)

val_vecs_dmm = np.concatenate([build_doc_Vector(z, 100) for z in tqdm(map(lambda x: x.words, X_test))])
val_vecs_dmm = scale(val_vecs_dmm)

modelDoc2Vec_dmm = Sequential()
modelDoc2Vec_dmm.add(Dense(64, activation='relu', input_dim=100))
modelDoc2Vec_dmm.add(Dense(64, activation='relu'))
modelDoc2Vec_dmm.add(Dense(5, activation='sigmoid'))
modelDoc2Vec_dmm.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
modelDoc2Vec_dmm.fit(train_vecs_dmm, y_train, epochs=20, batch_size=32, verbose=2)
score = modelDoc2Vec_dmm.evaluate(val_vecs_dmm, y_test, batch_size=128, verbose=2)
print (score[1])

100%|██████████| 96419/96419 [00:00<00:00, 2883621.38it/s]
100%|██████████| 96419/96419 [00:00<00:00, 2817426.60it/s]
W0811 10:33:45.815652 140399816836864 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay
77135it [00:04, 16505.53it/s]
19284it [00:01, 16918.97it/s]


Epoch 1/20
 - 5s - loss: 0.3847 - acc: 0.8367
Epoch 2/20
 - 5s - loss: 0.3750 - acc: 0.8411
Epoch 3/20
 - 4s - loss: 0.3718 - acc: 0.8418
Epoch 4/20
 - 4s - loss: 0.3697 - acc: 0.8429
Epoch 5/20
 - 4s - loss: 0.3680 - acc: 0.8435
Epoch 6/20
 - 4s - loss: 0.3667 - acc: 0.8436
Epoch 7/20
 - 4s - loss: 0.3656 - acc: 0.8441
Epoch 8/20
 - 4s - loss: 0.3647 - acc: 0.8442
Epoch 9/20
 - 4s - loss: 0.3636 - acc: 0.8448
Epoch 10/20
 - 4s - loss: 0.3626 - acc: 0.8451
Epoch 11/20
 - 4s - loss: 0.3618 - acc: 0.8451
Epoch 12/20
 - 4s - loss: 0.3612 - acc: 0.8451
Epoch 13/20
 - 4s - loss: 0.3605 - acc: 0.8458
Epoch 14/20
 - 4s - loss: 0.3597 - acc: 0.8459
Epoch 15/20
 - 4s - loss: 0.3591 - acc: 0.8459
Epoch 16/20
 - 4s - loss: 0.3583 - acc: 0.8465
Epoch 17/20
 - 4s - loss: 0.3578 - acc: 0.8463
Epoch 18/20
 - 4s - loss: 0.3572 - acc: 0.8464
Epoch 19/20
 - 4s - loss: 0.3570 - acc: 0.8466
Epoch 20/20
 - 4s - loss: 0.3564 - acc: 0.8466
0.8453537304479041


# Combined model

In [13]:
def get_concat_vectors(model1,model2, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.tags:
        prefix = 'all_' + str(i)
        vecs[n] = np.append(model1.docvecs[prefix],model2.docvecs[prefix])
        n += 1
    return vecs

### DBOW + DMC

In [14]:
def build_doc_Vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += np.append(model_dbow[word] * tfidf[word], model_dmc[word] * tfidf[word])
            count += 1
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

In [15]:

train_vecs_dbow_dmc = np.concatenate([build_doc_Vector(z, 200) for z in tqdm(map(lambda x: x.words, X_train))])
train_vecs_dbow_dmc = scale(train_vecs_dbow_dmc)

val_vecs_dbow_dmc = np.concatenate([build_doc_Vector(z, 200) for z in tqdm(map(lambda x: x.words, X_test))])
val_vecs_dbow_dmc = scale(val_vecs_dbow_dmc)
                             
modelDBOW_DMC = Sequential()
modelDBOW_DMC.add(Dense(64, activation='relu', input_dim=200))
modelDBOW_DMC.add(Dense(64, activation='relu'))
modelDBOW_DMC.add(Dense(5, activation='sigmoid'))
modelDBOW_DMC.compile(
    optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy']
)

modelDBOW_DMC.fit(train_vecs_dbow_dmc, y_train, epochs=20, batch_size=32, verbose=2)
score = modelDBOW_DMC.evaluate(val_vecs_dbow_dmc, y_test, batch_size=128, verbose=2)

# y_pred= modelDBOW_DMC.predict(val_vecs_dbow_dmc).ravel()

print (score[1])

77135it [00:09, 8276.69it/s]
19284it [00:02, 8337.52it/s]


Epoch 1/20
 - 5s - loss: 0.3787 - acc: 0.8394
Epoch 2/20
 - 5s - loss: 0.3611 - acc: 0.8470
Epoch 3/20
 - 5s - loss: 0.3532 - acc: 0.8490
Epoch 4/20
 - 5s - loss: 0.3469 - acc: 0.8512
Epoch 5/20
 - 5s - loss: 0.3421 - acc: 0.8525
Epoch 6/20
 - 4s - loss: 0.3377 - acc: 0.8544
Epoch 7/20
 - 4s - loss: 0.3340 - acc: 0.8553
Epoch 8/20
 - 4s - loss: 0.3308 - acc: 0.8564
Epoch 9/20
 - 4s - loss: 0.3279 - acc: 0.8575
Epoch 10/20
 - 4s - loss: 0.3255 - acc: 0.8587
Epoch 11/20
 - 4s - loss: 0.3231 - acc: 0.8599
Epoch 12/20
 - 4s - loss: 0.3212 - acc: 0.8604
Epoch 13/20
 - 4s - loss: 0.3190 - acc: 0.8612
Epoch 14/20
 - 4s - loss: 0.3172 - acc: 0.8621
Epoch 15/20
 - 4s - loss: 0.3155 - acc: 0.8634
Epoch 16/20
 - 4s - loss: 0.3137 - acc: 0.8639
Epoch 17/20
 - 5s - loss: 0.3123 - acc: 0.8645
Epoch 18/20
 - 5s - loss: 0.3108 - acc: 0.8655
Epoch 19/20
 - 5s - loss: 0.3096 - acc: 0.8660
Epoch 20/20
 - 5s - loss: 0.3085 - acc: 0.8665
0.8443891931880784


### DBOW + DMC

In [16]:
def build_doc_Vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += np.append(model_dbow[word] * tfidf[word], model_dmm[word] * tfidf[word])
            count += 1
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

train_vecs_dbow_dmm = np.concatenate([build_doc_Vector(z, 200) for z in tqdm(map(lambda x: x.words, X_train))])
train_vecs_dbow_dmm = scale(train_vecs_dbow_dmm)

val_vecs_dbow_dmm = np.concatenate([build_doc_Vector(z, 200) for z in tqdm(map(lambda x: x.words, X_test))])
val_vecs_dbow_dmm = scale(val_vecs_dbow_dmm)

modelDBOW_DMM = Sequential()
modelDBOW_DMM.add(Dense(64, activation='relu', input_dim=200))
modelDBOW_DMM.add(Dense(64, activation='relu'))
modelDBOW_DMM.add(Dense(5, activation='sigmoid'))
modelDBOW_DMM.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

modelDBOW_DMM.fit(train_vecs_dbow_dmm, y_train, epochs=20, batch_size=32, verbose=2)
score = modelDBOW_DMM.evaluate(val_vecs_dbow_dmm, y_test, batch_size=128, verbose=2)

y_pred= modelDBOW_DMM.predict(val_vecs_dbow_dmm).ravel()

print (score[1])

77135it [00:09, 8243.44it/s]
19284it [00:02, 8220.82it/s]


Epoch 1/20
 - 5s - loss: 0.3824 - acc: 0.8371
Epoch 2/20
 - 4s - loss: 0.3668 - acc: 0.8443
Epoch 3/20
 - 4s - loss: 0.3597 - acc: 0.8463
Epoch 4/20
 - 4s - loss: 0.3542 - acc: 0.8481
Epoch 5/20
 - 4s - loss: 0.3492 - acc: 0.8492
Epoch 6/20
 - 4s - loss: 0.3451 - acc: 0.8506
Epoch 7/20
 - 4s - loss: 0.3417 - acc: 0.8522
Epoch 8/20
 - 4s - loss: 0.3387 - acc: 0.8532
Epoch 9/20
 - 4s - loss: 0.3358 - acc: 0.8544
Epoch 10/20
 - 4s - loss: 0.3335 - acc: 0.8548
Epoch 11/20
 - 4s - loss: 0.3314 - acc: 0.8560
Epoch 12/20
 - 5s - loss: 0.3295 - acc: 0.8564
Epoch 13/20
 - 4s - loss: 0.3278 - acc: 0.8579
Epoch 14/20
 - 4s - loss: 0.3265 - acc: 0.8578
Epoch 15/20
 - 4s - loss: 0.3250 - acc: 0.8586
Epoch 16/20
 - 4s - loss: 0.3235 - acc: 0.8594
Epoch 17/20
 - 4s - loss: 0.3226 - acc: 0.8600
Epoch 18/20
 - 4s - loss: 0.3213 - acc: 0.8604
Epoch 19/20
 - 4s - loss: 0.3205 - acc: 0.8608
Epoch 20/20
 - 4s - loss: 0.3193 - acc: 0.8613
0.8442025105133761


### Test with splitted tesing data 

In [17]:
print(np.shape(X_test))
print(np.shape(y_test))

(19284, 2)
(19284, 5)


In [18]:
pred_DBOW = modelDoc2Vec_dbow.evaluate(val_vecs_dbow, y_test)
pred_DMM = modelDoc2Vec_dmm.evaluate(val_vecs_dmc, y_test)
pred_DBOW_DMM = modelDBOW_DMC.evaluate(val_vecs_dbow_dmc, y_test)
pred_DBOW_DMM = modelDBOW_DMM.evaluate(val_vecs_dbow_dmm, y_test)



In [19]:
print("DBOW:",pred_DBOW)
print("DMM:",pred_DMM)
print("DBOW+DMM:",pred_DBOW_DMM)
print("DBOW_DMM:",pred_DBOW_DMM)

DBOW: [0.38418233780983546, 0.8386330585760636]
DMM: [0.44551995434322883, 0.8208462937420716]
DBOW+DMM: [0.3690496341516012, 0.8442024425757028]
DBOW_DMM: [0.3690496341516012, 0.8442024425757028]


# Test with real testig data

In [20]:
test_path = 'test.tsv'
sub = 'sampleSubmission.csv'

data_sub = pd.read_csv(test_path, sep='\t')
sub_file = pd.read_csv(sub)

data_sub.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [21]:
X_sub = data_sub.Phrase
X_sub = labelize_text(X_sub, 'TEST')
X_sub

  import sys


[LabeledSentence(words=['An', 'intermittently', 'pleasing', 'but', 'mostly', 'routine', 'effort', '.'], tags=['TEST_0']),
 LabeledSentence(words=['An', 'intermittently', 'pleasing', 'but', 'mostly', 'routine', 'effort'], tags=['TEST_1']),
 LabeledSentence(words=['An'], tags=['TEST_2']),
 LabeledSentence(words=['intermittently', 'pleasing', 'but', 'mostly', 'routine', 'effort'], tags=['TEST_3']),
 LabeledSentence(words=['intermittently', 'pleasing', 'but', 'mostly', 'routine'], tags=['TEST_4']),
 LabeledSentence(words=['intermittently', 'pleasing', 'but'], tags=['TEST_5']),
 LabeledSentence(words=['intermittently', 'pleasing'], tags=['TEST_6']),
 LabeledSentence(words=['intermittently'], tags=['TEST_7']),
 LabeledSentence(words=['pleasing'], tags=['TEST_8']),
 LabeledSentence(words=['but'], tags=['TEST_9']),
 LabeledSentence(words=['mostly', 'routine'], tags=['TEST_10']),
 LabeledSentence(words=['mostly'], tags=['TEST_11']),
 LabeledSentence(words=['routine'], tags=['TEST_12']),
 Labele

In [22]:
def build_doc_Vector_DBOW(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_dbow[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

def build_doc_Vector_DMC(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_dmc[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

def build_doc_Vector_SINGLE(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_dmm[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec

def build_doc_Vector_COMBINE(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += np.append(model_dbow[word] * tfidf[word], model_dmc[word] * tfidf[word])
            count += 1
        except KeyError: 
            continue
    if count != 0:
        vec /= count
    return vec


In [23]:
X_sub

[LabeledSentence(words=['An', 'intermittently', 'pleasing', 'but', 'mostly', 'routine', 'effort', '.'], tags=['TEST_0']),
 LabeledSentence(words=['An', 'intermittently', 'pleasing', 'but', 'mostly', 'routine', 'effort'], tags=['TEST_1']),
 LabeledSentence(words=['An'], tags=['TEST_2']),
 LabeledSentence(words=['intermittently', 'pleasing', 'but', 'mostly', 'routine', 'effort'], tags=['TEST_3']),
 LabeledSentence(words=['intermittently', 'pleasing', 'but', 'mostly', 'routine'], tags=['TEST_4']),
 LabeledSentence(words=['intermittently', 'pleasing', 'but'], tags=['TEST_5']),
 LabeledSentence(words=['intermittently', 'pleasing'], tags=['TEST_6']),
 LabeledSentence(words=['intermittently'], tags=['TEST_7']),
 LabeledSentence(words=['pleasing'], tags=['TEST_8']),
 LabeledSentence(words=['but'], tags=['TEST_9']),
 LabeledSentence(words=['mostly', 'routine'], tags=['TEST_10']),
 LabeledSentence(words=['mostly'], tags=['TEST_11']),
 LabeledSentence(words=['routine'], tags=['TEST_12']),
 Labele

In [24]:
test_vecs_each = np.concatenate([build_doc_Vector_DBOW(z, 100) for z in tqdm(map(lambda x: x.words, X_sub))])
test_vecs_each = scale(test_vecs_each)

test_vecs_dbow_combine = np.concatenate([build_doc_Vector_COMBINE(z, 200) for z in tqdm(map(lambda x: x.words, X_sub))])
test_vecs_dbow_combine = scale(test_vecs_dbow_combine)

66292it [00:03, 18469.69it/s]
66292it [00:07, 9065.04it/s] 


In [25]:
pred_dbow = modelDoc2Vec_dbow.predict(test_vecs_each)
pred_dmc = modelDoc2Vec_dmc.predict(test_vecs_each)
pred_dmm = modelDoc2Vec_dmm.predict(test_vecs_each)
pred_dbow_dmc = modelDBOW_DMC.predict(test_vecs_dbow_combine)
pred_dbow_dmm = modelDBOW_DMM.predict(test_vecs_dbow_combine)

### decode one-hot-encode sentiment back

In [26]:
decode_pred_dbow = np.round(np.argmax(pred_dbow,axis=1)).astype(int)
decode_pred_dmc = np.round(np.argmax(pred_dmc,axis=1)).astype(int)
decode_pred_dmm = np.round(np.argmax(pred_dmm,axis=1)).astype(int)
decode_pred_dbow_dmc = np.round(np.argmax(pred_dbow_dmc,axis=1)).astype(int)
decode_pred_dbow_dmm = np.round(np.argmax(pred_dbow_dmm,axis=1)).astype(int)

In [27]:
sub_path = 'sampleSubmission.csv'
sub_dbow = pd.read_csv(sub_path)
sub_dmc = pd.read_csv(sub_path)
sub_dmm = pd.read_csv(sub_path)
sub_dbow_dmc = pd.read_csv(sub_path)
sub_dbow_dmm = pd.read_csv(sub_path)

sub_dbow['Sentiment'] = decode_pred_dbow
sub_dmc['Sentiment'] = decode_pred_dmc
sub_dmm['Sentiment'] = decode_pred_dmm
sub_dbow_dmc['Sentiment'] = decode_pred_dbow_dmc
sub_dbow_dmm['Sentiment'] = decode_pred_dbow_dmm

sub_dbow.to_csv('dbow.csv', index = False)
sub_dmc.to_csv('dmc.csv', index = False)
sub_dmm.to_csv('dmm.csv', index = False)
sub_dbow_dmc.to_csv('dbowDmc.csv', index = False)
sub_dbow_dmm.to_csv('dbowDmm.csv', index = False)
# sub_file['Sentiment'] = decode_pred_dbow_dmc
# sub_file.to_csv("output_DBOW_DMC.csv", index = False)

## Visualized DBOW

In [28]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

word_vec = [model_dbow[w] for w in list(model_dbow.wv.vocab.keys())[:5000]]

from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vec)

tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(model_dbow.wv.vocab.keys())[:5000]

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.013s...
[t-SNE] Computed neighbors for 5000 samples in 5.716s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.006969
[t-SNE] KL divergence after 250 iterations with early exaggeration: 88.303116
[t-SNE] KL divergence after 1000 iterations: 4.122997


## Visualize DMC

In [29]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

word_vec = [model_dmc[w] for w in list(model_dmc.wv.vocab.keys())[:5000]]

from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vec)

tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(model_dmc.wv.vocab.keys())[:5000]

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.015s...
[t-SNE] Computed neighbors for 5000 samples in 5.845s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.055154
[t-SNE] KL divergence after 250 iterations with early exaggeration: 80.944168
[t-SNE] KL divergence after 1000 iterations: 2.328247


## Visualized DMM

In [30]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

word_vec = [model_dmm[w] for w in list(model_dmm.wv.vocab.keys())[:5000]]

from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vec)

tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(model_dmm.wv.vocab.keys())[:5000]

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.014s...
[t-SNE] Computed neighbors for 5000 samples in 3.913s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.054695
[t-SNE] KL divergence after 250 iterations with early exaggeration: 75.178467
[t-SNE] KL divergence after 1000 iterations: 1.820361
