# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [11]:
!pip install gensim -U
!pip install pyreadline -U

Requirement already up-to-date: gensim in /opt/conda/lib/python3.8/site-packages (3.8.3)
Collecting pyreadline
  Downloading pyreadline-2.1.zip (109 kB)
[K     |████████████████████████████████| 109 kB 6.3 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyreadline
  Building wheel for pyreadline (setup.py) ... [?25ldone
[?25h  Created wheel for pyreadline: filename=pyreadline-2.1-py3-none-any.whl size=93834 sha256=3f84e790c28c8ed972421d1c176c2f0bf862af591ab404a0b22090abd6a80909
  Stored in directory: /root/.cache/pip/wheels/0e/6e/9d/402aa64e362e59c7032231a2e6942e647a6c12508d2c77fc4d
Successfully built pyreadline
Installing collected packages: pyreadline
Successfully installed pyreadline-2.1


In [12]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2)

In [14]:
%config Completer.use_jedi = False

In [15]:
# What does a document vector look like again?
d2v_model.infer_vector(['convert', 'words', 'to', 'vectors'])
d2v_model.

SyntaxError: invalid syntax (<ipython-input-15-ffcd093c7159>, line 3)

In [6]:
# How do we prepare these vectors to be used in a machine learning model?
vects=[[d2v_model.infer_vector(words)] for words in X_test]

In [10]:
vects[0]

[array([-4.12203372e-02,  1.11947395e-02, -3.46425846e-02,  5.54681085e-02,
        -8.12416710e-03, -8.20988603e-03,  1.16405170e-02,  8.85554496e-03,
        -2.33952925e-02,  1.61229055e-02, -3.88768571e-03, -2.06756368e-02,
         4.37365510e-02, -2.20154226e-02, -6.58262521e-02,  7.46201491e-03,
        -3.16491202e-02,  5.11832586e-05,  6.29147654e-03, -1.15191350e-02,
        -3.19484696e-02, -7.92766921e-04,  4.94552515e-02, -3.52812707e-02,
        -2.74179823e-04,  4.59239818e-02, -3.56174558e-02,  2.73412131e-02,
         4.07809727e-02,  2.01806035e-02,  3.15069333e-02, -1.29060745e-02,
         1.54101374e-02,  2.17919797e-02, -1.16179278e-02,  4.60356474e-03,
        -1.37527450e-03, -1.41135361e-02, -1.44341914e-02, -2.79032029e-02,
         3.24353874e-02,  4.66170646e-02, -1.81810781e-02, -2.06516478e-02,
         3.23860459e-02, -1.18665313e-02, -1.37402676e-02,  1.14969946e-02,
        -2.61908416e-02, -1.77049101e-03], dtype=float32)]