# Procesamiento de textos con doc2vec
## Entrenamiento

En primer lugar definiremos una función para la limpieza y tokenizado de los textos

In [1]:
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join

def nlp_clean(data):
   new_data = []
   for d in data:
      new_str = d.lower()
      dlist = tokenizer.tokenize(new_str)
      new_data.append(dlist)
   return new_data



La siguiente función genera un iterador a los documentos que posteriormente le pasaremos al modelo para su procesamiento

In [2]:
class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
              yield gensim.models.doc2vec.TaggedDocument(doc,    
[self.labels_list[idx]])

Realizamos la configuración de los hyperparametros del modelo. En este caso los fijamos pero normalmente es necesario obtener los valores óptimos para estos hyperparámetros, como se realizo en el caso del KNN.

In [3]:
__path_documentos = './datasets/comentarios/'
__longitud_vector = 100
__ventana = 10
__frecuencia_minima = 5
__learning_rate = 0.025
__min_learning_rate = 0.025
__epoch = 100

Cargamos los ficheros que usaremso posteriormente

In [4]:
docLabels = []
docLabels = [f for f in listdir(__path_documentos) if 
 f.endswith('.txt')]
#Se guardan los nombres de los ficheros
data = []
for doc in docLabels:
    data.append(open(__path_documentos + doc).read())

Usamos la función que creamos anterirormente para tokenizar y limpiar los datos

In [5]:
tokenizer = RegexpTokenizer(r'\w+')    
print (data[0])
data = nlp_clean(data)
print (data[0])

#Creación del iterador
it = LabeledLineSentence(data, docLabels)

"BellÃ¬sima, imperdible,tiene detalles de azulejos decorados muy lindos, espaciosa, con un puente y un no se bien si es un pequeÃ±o lago artificial por donde se puede pasear en pequeÃ±as canoas simulando Venecia.Con escudos de todas las regiones de EspaÃ±a.Paseo recomendable."

['bellã', 'sima', 'imperdible', 'tiene', 'detalles', 'de', 'azulejos', 'decorados', 'muy', 'lindos', 'espaciosa', 'con', 'un', 'puente', 'y', 'un', 'no', 'se', 'bien', 'si', 'es', 'un', 'pequeã', 'o', 'lago', 'artificial', 'por', 'donde', 'se', 'puede', 'pasear', 'en', 'pequeã', 'as', 'canoas', 'simulando', 'venecia', 'con', 'escudos', 'de', 'todas', 'las', 'regiones', 'de', 'espaã', 'a', 'paseo', 'recomendable']


Con los textos cargados y la configuración establecida de los distintos hyperparamtros realizamos el entrenamiento del modelos

In [6]:
model = gensim.models.Doc2Vec(vector_size=__longitud_vector, window=__ventana, min_count=__frecuencia_minima, workers=11,alpha=__learning_rate, min_alpha=__min_learning_rate, epochs=__epoch)
model.build_vocab(it)
model.train(it, epochs=__epoch, total_examples=model.corpus_count)
model.save('./salidas/doc2vec/doc2vec.model')

## Transformar texto en vector
En primer lugar es necesario realizar la carga de un modelo, en ese caso no lo necesitamos por que ya lo tenemos en memoria, en caso de ser necesario la intruccion es: "model = gensim.models.doc2vec.Doc2Vec.load('path_al_modelo')".

Recorremos todos los documentos con los que se han entrenado el modelo para obtener sus vectores

In [7]:
file = open('./salidas/doc2vec/vectores.csv', 'w')
#Recorrido de todos los documentos con los que se ha entrenado
for i in range(len(model.dv)):
    vector = model.dv[i]
    file.writelines(";".join(str(x) for x in vector) + '\n')
    print (vector)

file.close()


[-2.5724804   0.03413742  0.05262691  0.5349716   0.913096   -0.2724024
  1.184056    0.9820937  -1.3885313   0.35857904  0.1611301  -1.1851412
  1.1584435  -0.47604236  1.2367635  -1.1148194   0.36548322  0.23400158
 -1.7035693  -1.5518163   1.8596222  -0.1963985  -0.09830236 -0.11678183
  0.39173508  0.37889135 -1.6994028  -0.20764326 -0.912357   -0.24214049
  1.7659042  -1.3358272   0.9854524  -1.618077   -0.887223    0.9537405
 -0.8368295   1.7182888   1.1988566  -2.5765326  -1.080147   -2.8157275
 -0.96205884 -0.8821649   0.841701   -0.05143828 -1.5774008  -0.48053327
  1.4531404   1.1126586   0.34315598 -0.34920105  0.03081594  0.6109789
  1.432348    0.06694332  0.05259216  0.27661267 -0.01288063  1.6116434
  0.8510626   0.26743248  1.7602674   0.5604014   0.1850686   2.586327
 -1.1219043   0.9522673  -0.48314217  0.6315145  -0.2896266  -1.2236437
  0.8640289   1.5898964   0.96348    -1.1135198   0.5903114   0.36478704
  0.68858397  0.6760677  -2.8471715   0.5289701  -0.00659686

[-0.5419023   0.29296196 -1.2279124  -0.34313235  0.04932047  0.46222982
  0.835161    1.168327    1.3090422  -0.46910858 -1.1306407  -0.8221331
 -0.60165864  0.618673    0.48150766 -0.6955907   0.2675295  -0.6913962
  0.73899716 -0.75988376  0.32166308 -0.14874476  0.17101364  0.41690183
  0.7609631   0.61502683 -0.5650681  -0.33904904 -0.6497057  -1.2488904
  1.7236085  -1.2387544   0.813711   -1.8127643  -0.5645258   0.55858517
 -0.566412   -0.2329905  -0.59222305 -0.4574488  -0.72596085 -0.9711194
 -0.7333203   0.7675686   1.1766843  -0.035547   -0.1288717   0.6198059
 -0.19018847  0.49878496 -0.8246781  -1.2560339   0.55401355 -1.1215079
 -0.5976559   0.03853428 -0.2884627  -1.1314317  -1.4891553  -0.22483476
 -0.7628189  -0.29233995 -0.42040408 -0.04732823 -1.5391767   1.0214521
 -1.0107154  -0.05388394 -0.42233342  0.5906176  -0.7182711  -0.19667433
  0.67101693 -0.02996024  0.6533763  -0.20516972 -0.5373607  -0.8542162
 -0.6811403  -0.9901958  -2.2307365   0.21286727 -2.1103144

[-6.3642390e-02  3.7822214e-01 -2.0174444e+00 -1.2078184e-01
 -3.9308602e-01 -1.0922360e+00 -2.0348498e-01  9.0998781e-01
 -2.7173266e-01 -3.4920070e-01  3.7563812e-02 -7.6631510e-01
 -1.2875307e+00  1.4684876e+00  1.4340483e-01  5.5817468e-04
  7.2826660e-01 -1.1634760e+00  1.7662461e-01 -6.3765997e-01
 -3.4881207e-01  1.2936955e-02  1.3027030e+00 -1.3216523e+00
  7.5822812e-01  2.2001994e+00 -6.6042441e-01  1.7734498e+00
 -6.9054484e-01  5.6019694e-01  1.1445565e-01  9.0730041e-02
  1.1202228e+00 -2.2638097e+00  3.2259935e-01  1.6885779e+00
  1.3255317e+00 -7.8543478e-01  1.4362330e+00  1.6205640e-01
 -4.5739436e-01 -3.0534220e-01 -6.6011602e-01  2.1428581e-01
  1.4162495e+00  3.0862185e-01  1.0960993e+00  9.1080397e-01
  2.6365486e-01 -4.1069698e-01  4.9858883e-01 -7.6258093e-01
  1.0709205e+00 -1.0758543e+00 -2.4831158e-01  9.4004077e-01
  9.2396855e-01 -2.4485449e-01  1.8931837e+00 -4.8904234e-01
  4.0686092e-01 -6.6968375e-01 -5.2548987e-01  6.4482057e-01
 -3.4592652e-01  9.44433

[-6.53339326e-01  3.48423451e-01 -4.49728638e-01  9.22045588e-01
  7.82959700e-01 -7.18654454e-01  9.82576311e-01 -4.97053117e-02
  8.14163014e-02 -8.61047566e-01 -6.46523535e-02 -1.06566787e+00
 -8.63056540e-01  3.82873535e-01 -8.50620329e-01  7.03221202e-01
 -7.37437606e-01  8.87365863e-02  3.47511321e-02 -1.42042589e+00
 -2.41646633e-01  5.95995188e-01  1.08120024e+00 -4.06332374e-01
  7.59694397e-01  4.82763588e-01 -1.11615217e+00 -9.73374426e-01
  1.73519284e-01 -2.07394555e-01  3.21434915e-01  8.75330865e-01
  5.77492893e-01  1.01784341e-01 -2.92150676e-01 -3.40774804e-01
  3.67619276e-01  2.68873721e-01  4.74539429e-01  1.49114519e-01
  1.13140047e+00 -1.24505222e+00 -2.32587129e-01 -5.71704924e-01
  4.15252537e-01  3.84332150e-01  3.16492826e-01 -6.47172689e-01
  1.22056186e-01  7.39792362e-02 -2.97783494e-01  2.53744513e-01
  4.19759005e-01  1.20408845e-03 -1.28571248e+00  1.05422652e+00
 -3.08276385e-01 -8.67095113e-01 -1.93422914e-01  4.62869927e-02
  2.65860140e-01  4.82966

[-1.9476489e+00  7.7219939e-01  2.7353245e-01 -3.6340028e-01
 -1.4107288e+00  4.8537731e-01  2.2087742e-01  6.6562593e-01
 -1.6078489e+00 -1.6023926e-01 -1.5374923e-01 -4.1483119e-01
 -6.3935781e-01  6.0151100e-01  6.0773039e-01  2.4892499e-01
 -4.6840185e-01 -2.0191557e+00 -1.0286249e+00 -2.3405530e+00
  6.5374917e-01  4.8679989e-02 -1.1829882e+00 -3.7167603e-01
  9.0607697e-01  1.9141589e-01 -5.6274343e-01 -5.8951452e-03
  6.2875092e-01 -4.5533338e-01 -1.0205344e+00  9.9830359e-01
  1.7825474e+00 -1.2394814e+00 -1.2685248e+00  1.4459807e+00
  2.6035741e-02 -1.1602465e+00 -7.1055174e-01  2.5229371e-01
 -1.4533314e-01 -9.9478030e-01 -2.0286644e+00  1.2366862e+00
 -2.1489619e-01  1.2248459e-01  1.1948920e+00 -9.6081984e-01
 -9.7460192e-01  6.3735169e-01  7.8458303e-01  1.2535207e+00
  5.5440363e-02  4.8256108e-01  5.5160981e-02  1.3283186e-01
  4.5176145e-01 -3.5343462e-01  4.1198671e-02  9.3951809e-01
  6.4234585e-01  1.1862373e+00  4.5012814e-01  7.1787930e-01
 -3.5368356e-01 -8.97957

[ 1.22749180e-01  1.47640872e+00  2.74451315e-01  1.12197459e+00
  1.53831196e+00 -1.63600600e+00  2.47850657e-01  7.25291789e-01
 -1.66804922e+00  1.61590827e+00 -8.02583635e-01 -1.93452394e+00
  2.07968056e-02 -2.78193682e-01  4.74385053e-01  1.04531753e+00
  1.98875749e+00 -1.10177040e-01 -5.49213111e-01 -2.33344030e+00
 -5.93639851e-01  1.17844474e+00  1.35911906e+00  6.18256986e-01
  4.94969971e-02  5.72702944e-01 -1.87023199e+00 -2.53057098e+00
 -3.12108725e-01 -1.47094536e+00  1.98073313e-01 -1.44215417e+00
  2.22050712e-01  5.90800345e-01 -2.52563071e+00  9.03733313e-01
 -2.00548196e+00  1.49076402e+00 -8.81685913e-01 -6.90714121e-01
  1.41494834e+00 -5.37841380e-01  7.57416368e-01  2.67792866e-03
 -6.66417032e-02 -1.17113374e-01 -1.51440215e+00 -5.06772324e-02
  1.76418626e+00 -1.10633385e+00  1.58630744e-01  4.81041223e-01
  9.37797785e-01 -1.09480716e-01 -1.55985260e+00  6.86516047e-01
  1.94906497e+00 -4.40234661e-01 -2.74656057e-01  2.11170244e+00
  3.55614483e-01 -5.24113

## Inferir vector para un nuevo texto
Al igual que en el punto anterior sería necesario cargar el modelo.

Una vez tenemos el modelo podemos inferir el vector para un nuevo texto

In [12]:
text = tokenizer.tokenize("playas con piedras")
print(text)

#Obtener el texto de un nuevo vector
vector = model.infer_vector(text)
print (vector)


['playas', 'con', 'piedras']
[-0.0774515   0.22507215 -0.1421558  -0.06062747  0.10542841 -0.07129125
  0.05995749  0.22907458 -0.06479006 -0.07194302 -0.19709404 -0.0374286
 -0.06840803  0.22289981 -0.05783862  0.05196756  0.05924656  0.08559593
 -0.02598655 -0.17833993  0.02904763  0.01813033 -0.06838774 -0.14607859
  0.06145636  0.12569863 -0.16453227 -0.09992886 -0.25344688 -0.06568827
  0.30239725  0.00646117  0.15687157 -0.06449677 -0.09471638  0.18602133
  0.05274434 -0.39327946  0.03139547 -0.07199967 -0.06095112 -0.04359394
 -0.18178512 -0.05870978  0.30393004  0.11667494 -0.05589957 -0.02132001
  0.00958883  0.06817779  0.16959104 -0.05963292  0.04342848 -0.10940745
  0.00772045  0.00968339  0.00503153 -0.00591306  0.03665687  0.13698179
 -0.06053478  0.15533552 -0.07889783 -0.02348473  0.08705897  0.2221147
  0.09020102  0.31617737 -0.22726671  0.29905796 -0.04074181 -0.13751768
  0.20917499 -0.11545394  0.14414811 -0.18234785  0.032892    0.13251741
 -0.05301579  0.1544303 