<a href="https://www.inove.com.ar"><img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center"></a>


# Procesamiento de lenguaje natural
## Word2vect


In [None]:
import numpy as np

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    The similarity is ``dot(a, b) / (||a|| * ||b||)``.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of numbers with the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``, or ``0.0`` when
        either vector has zero norm (the cosine is undefined there).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the division: a zero vector would otherwise produce NaN and a
    # RuntimeWarning, which then poisons any ranking built on the result.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

### Datos

In [None]:
# Toy corpus: three short documents, one string per document.
corpus = np.array([
    'que dia es hoy',
    'martes el dia de hoy es martes',
    'martes muchas gracias',
])

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [None]:
# Tokenize: each document becomes the list of its space-separated terms.
new_corpus = [document.split(" ") for document in corpus]

# Assign every distinct term an integer index, in order of first appearance.
vocabulary = {}
index = 0
for terms in new_corpus:
  for term in terms:
    if term in vocabulary:
      continue  # already indexed by an earlier occurrence
    vocabulary[term] = index
    index += 1

In [None]:
# Display the term -> index mapping.
vocabulary

{'de': 6,
 'dia': 1,
 'el': 5,
 'es': 2,
 'gracias': 8,
 'hoy': 3,
 'martes': 4,
 'muchas': 7,
 'que': 0}

### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación one-hot encoding de estos

In [None]:
# Row i of the identity matrix is the one-hot vector of the term with index i.
term2onehot = np.eye(len(vocabulary))

# One matrix per document: row k is the one-hot vector of its k-th term.
onehot_corpus = []
for document in new_corpus:
  term_indices = [vocabulary[term] for term in document]
  # Indexing with a list of row numbers copies those rows into a new array.
  onehot_corpus.append(term2onehot[term_indices])


In [None]:
# Display the list of per-document one-hot matrices.
onehot_corpus

[array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.]]),
 array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.]])]

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [None]:
# Summing a document's one-hot rows counts how often each term occurs in it.
freq_corpus = [onehot_doc.sum(axis=0) for onehot_doc in onehot_corpus]

In [None]:
# Stack the per-document count vectors into one matrix for display.
np.array(freq_corpus)

array([[1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 2., 1., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 1., 1.]])

### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TF-IDF

In [None]:
# Document frequency: in how many documents each vocabulary term appears.
df_vocabulary = np.zeros(len(vocabulary))
for term, term_index in vocabulary.items():
  df_vocabulary[term_index] = sum(term in document for document in new_corpus)

df_vocabulary

array([1., 2., 2., 2., 2., 1., 1., 1., 1.])

In [None]:
# Number of documents in the corpus.
n_doc = len(corpus)
n_doc

3

In [None]:
# Inverse document frequency: log10(n_doc / df) for each vocabulary term.
idf = np.log10(n_doc/df_vocabulary)
idf

array([0.47712125, 0.17609126, 0.17609126, 0.17609126, 0.17609126,
       0.47712125, 0.47712125, 0.47712125, 0.47712125])

In [None]:
# TF-IDF: weight each document's term counts by the per-term IDF.
tf_idf = np.zeros((n_doc, len(vocabulary)))

# Iterating a 2-D array yields row views, so writing into `row` fills tf_idf.
for row, term_counts in zip(tf_idf, freq_corpus):
  row[:] = idf * term_counts

tf_idf

array([[0.47712125, 0.17609126, 0.17609126, 0.17609126, 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.17609126, 0.17609126, 0.17609126, 0.35218252,
        0.47712125, 0.47712125, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.17609126,
        0.        , 0.        , 0.47712125, 0.47712125]])

### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados de mayor a menor similitud coseno respecto de ese documento

In [None]:
def compare_documents(corpus, idx):
  """Rank the documents of ``corpus`` by similarity to the one at ``idx``.

  Each document is split on single spaces and turned into a TF-IDF vector
  (raw term count times ``log10(n_doc / document_frequency)``). Every vector
  is then compared with the vector of document ``idx`` using
  ``cosine_similarity``.

  Args:
    corpus: Sequence of document strings (list, tuple or 1-D NumPy array).
    idx: Position in ``corpus`` of the reference document.

  Returns:
    NumPy array holding the documents of ``corpus`` ordered from most to
    least similar to ``corpus[idx]``; documents with equal similarity keep
    their original relative order.
  """
  # The final step indexes the corpus with an array of positions, which a
  # plain list does not support; asarray leaves an ndarray input untouched.
  corpus = np.asarray(corpus)

  # Split the documents into terms
  tokenized = [document.split(" ") for document in corpus]

  # Build the vocabulary: term -> index, in order of first appearance
  vocabulary = {}
  for terms in tokenized:
    for term in terms:
      vocabulary.setdefault(term, len(vocabulary))

  n_doc = len(tokenized)

  # Term frequency: count each term directly instead of materializing a
  # one-hot matrix per document and summing it
  freq_corpus = np.zeros((n_doc, len(vocabulary)))
  for row, terms in zip(freq_corpus, tokenized):
    for term in terms:
      row[vocabulary[term]] += 1

  # Document frequency: number of documents in which each term occurs.
  # Every vocabulary term occurs at least once, so this is never zero.
  df_vocabulary = (freq_corpus > 0).sum(axis=0)

  # Inverse document frequency of each term
  idf = np.log10(n_doc / df_vocabulary)

  # TF-IDF: idf (one value per term) broadcasts across the document rows
  tf_idf = freq_corpus * idf

  similarity = np.array(
      [cosine_similarity(doc_vector, tf_idf[idx]) for doc_vector in tf_idf])

  # Negate to sort in descending order; a stable sort makes ties deterministic
  return corpus[np.argsort(-similarity, kind="stable")]

In [None]:
# Rank the corpus against its first document (index 0).
compare_documents(corpus, 0)

array(['que dia es hoy', 'martes el dia de hoy es martes',
       'martes muchas gracias'], dtype='<U30')