### Creating Doc2Vec using Gensim

In [1]:
import glob
import os

import pandas as pd
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [2]:
# Register tqdm's progress-bar variants of the pandas apply methods
# (e.g. ``Series.progress_apply``) so long-running steps can report progress.
tqdm.pandas()

In [3]:
# List the files of cross-validation fold 1 to confirm that the expected
# train.csv / test.csv pair is present before loading them.
glob.glob("../data/cross_validation_data/1/*")

['../data/cross_validation_data/1\\test.csv',
 '../data/cross_validation_data/1\\train.csv']

In [4]:
# Stage log: the train/test CSV files are read in the next cell.
print("reading csv")

reading csv


In [5]:
# Load the train and test splits of cross-validation fold 1.
# Both files are tab-separated despite their .csv extension.
d_train, d_test = (
    pd.read_csv(csv_path, sep="\t")
    for csv_path in (
        "../data/cross_validation_data/1/train.csv",
        "../data/cross_validation_data/1/test.csv",
    )
)

In [6]:
def remove_row_nan(df):
    """Return ``df`` without the rows that contain at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; ``dropna`` returns a new object.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that have no NaN/None in any column, with their
        original index labels preserved.
    """
    return df.dropna(axis="index")

In [7]:
# Stage log: rows with missing values are dropped in the next cell.
print("removing nan")

removing nan


In [8]:
# Drop every row with a missing value from both splits.
d_train = remove_row_nan(d_train)
d_test = remove_row_nan(d_test)

In [9]:
# Stage log: the question columns are lower-cased in the next cell.
print("lower string")

lower string


In [10]:
# Normalise case: lower-case both question columns in the train and test splits.
for frame in (d_train, d_test):
    for column in ("question1", "question2"):
        frame[column] = frame[column].str.lower()

In [11]:
# Stage log: word tokenization of the training questions follows.
print("tokenizing word")

tokenizing word


In [12]:
# Tokenize both question columns of the training split with NLTK.
# ``progress_apply`` is the tqdm-instrumented counterpart of ``apply`` that was
# registered by the earlier ``tqdm.pandas()`` call: the resulting columns are
# the same, but this long-running step now shows a progress bar.
d_train["token_q1"] = d_train.question1.progress_apply(word_tokenize)
d_train["token_q2"] = d_train.question2.progress_apply(word_tokenize)

In [13]:
# Stage log: the training corpus is assembled in the next cell.
print("creating corpus")

creating corpus


In [14]:
# Training corpus: every tokenized question1 followed by every tokenized
# question2 of the training split, as one flat list of token lists.
corpus = [*d_train.token_q1, *d_train.token_q2]

In [15]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [16]:
# Report the corpus size: one entry per question1 plus one per question2 of d_train.
print("length corpus:", len(corpus))

length corpus: 646856


In [17]:
# Stage log: the corpus is wrapped into TaggedDocument objects in the next cell.
print("creating tagged document")

creating tagged document


In [18]:
# Wrap each token list in a TaggedDocument whose single tag is the document's
# position in ``corpus``.
documents = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(corpus)
]

In [19]:
# Stage log: Doc2Vec training starts in the next cell.
print("building doc2vec")

building doc2vec


In [None]:
# Train Doc2Vec on the tagged corpus. Supplying the documents to the
# constructor builds the vocabulary and trains in a single call.
model = Doc2Vec(
    documents=documents,
    vector_size=128,  # dimensionality of the document embeddings
    window=2,         # context window size
    min_count=10,     # ignore words that occur fewer than 10 times
    workers=4,        # number of training threads
)

In [None]:
# Stage log: the trained model is written to disk in the next cell.
print("save model")

In [None]:
# Persist the trained model under the resources directory, i.e. at the exact
# path it is later reloaded from with ``Doc2Vec.load``. Saving it as a bare
# "docvec" in the working directory would leave that load pointing at a
# different (or missing) file after a fresh Restart & Run All.
os.makedirs("../data/resources", exist_ok=True)  # make sure the target directory exists
model.save("../data/resources/docvec")

In [None]:
# Stage log: printed once the model has been saved.
print("done")

In [10]:
import numpy as np
import gensim
import scipy

# Record the library versions this notebook was run with (numpy, gensim, scipy,
# in that order) so the environment can be reproduced.
for library in (np, gensim, scipy):
    print(library.__version__)

1.17.4
3.8.0
1.4.1


In [3]:
# Load the persisted Doc2Vec model from the resources directory; this rebinds
# ``model`` to the object read from disk.
model = Doc2Vec.load("../data/resources/docvec")

In [4]:
# Smoke test: infer an embedding for a short, already tokenized and lower-cased
# query. As the cell's last expression, the resulting array is displayed.
model.infer_vector(["i", "want"])

array([ 3.0079054e-02, -8.1883557e-04, -3.4103159e-02,  5.6203216e-02,
        1.3287067e-02,  9.5929578e-03, -1.0291860e-02,  7.5030187e-03,
       -1.9734222e-03,  6.0621705e-03,  9.1177635e-03, -2.1957539e-02,
       -5.3026428e-04,  1.3007862e-02,  2.3858637e-02,  4.4290600e-03,
        8.9559885e-04,  3.3677801e-02, -1.6240969e-02, -9.0737436e-03,
        1.0757735e-02, -7.2356923e-03,  1.3366420e-04,  1.7594529e-02,
        3.0987356e-02, -2.4459340e-02, -4.8730206e-02,  3.7617337e-02,
        3.8791586e-02,  2.6294166e-02,  2.0356379e-02, -2.2179957e-02,
        4.7264598e-02,  3.7191764e-02,  1.5811352e-02,  3.0164875e-02,
       -6.4158058e-03, -5.0540995e-03,  7.4361794e-02, -3.2045614e-02,
        2.8220035e-02,  5.8442163e-03,  1.4935698e-03,  2.8745953e-02,
        3.1649310e-02,  2.9034408e-02,  6.6511948e-03,  6.6295345e-03,
       -1.7518697e-02,  2.0652642e-02,  1.6402680e-02,  3.9119287e-03,
       -3.5273898e-02, -4.9561206e-02,  1.2328478e-02, -1.9460229e-02,
      