# Clonamos el repositorio para obtener los dataSet

In [None]:
# Clone the course repository so its datasets are available locally.
!git clone https://github.com/joanby/tensorflow.git

# Damos acceso a nuestro Drive

In [None]:
# Mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Test it

In [None]:
# List the Drive root folder to confirm that the mount worked.
!ls '/content/drive/My Drive' 

# Google colab tools

In [None]:
from google.colab import files # File helpers, e.g. for exporting files to your browser
import glob # Find files whose paths match a wildcard pattern
from google.colab import drive # Mount your Google Drive

## Especificando la versión de TensorFlow

Ejecutar `import tensorflow` importará la versión por defecto (actualmente la 2.x). Puedes usar la 1.x ejecutando una celda con el comando mágico `%tensorflow_version 1.x` **antes** de ejecutar `import tensorflow`.

### Si no funciona hacer el pip install


In [None]:
# Fallback if the magic below does not work: uncomment to pin TensorFlow 1.14 with pip.
#!pip install tensorflow==1.14
# Colab magic that selects TensorFlow 1.x; it must run before the first `import tensorflow`.
%tensorflow_version 1.x

# Importar Tensorflow

In [None]:
import tensorflow as tf
# Print the loaded version to check which TensorFlow release is active.
print(tf.__version__)
import matplotlib.pyplot as plt

In [None]:
# Open the session used to run the graph (`tf.Session` is the TensorFlow 1.x API).
session = tf.Session()

# Bag of Words Continuo (CBOW) o Skip-Gram

- King - man + woman = Queen
- Indian Pale Ale - hops + malt = Stout

- Si queremos predecir una palabra objetivo a partir de un contexto (palabras que la rodean): Continuous Bag of Words
- Si queremos predecir las palabras que rodean (contexto) a una palabra objetivo: Skip-Grama

## Descarga de datos

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import string
import requests
import collections
import io
import tarfile
import urllib.request
import nltk
from nltk.corpus import stopwords
# Rebind `session` to a fresh tf.Session().
# NOTE(review): an earlier cell already created a session that is never closed — confirm the duplicate is intended.
session = tf.Session()

In [None]:
# Model and batch dimensions.
batch_size = 50                  # (input, label) pairs per training step
embedding_size = 200             # length of each word vector
vocabulary_size = 10000          # size of the vocabulary, rare-word slot included
window_size = 2                  # context words taken on each side of the input word
num_sampled = batch_size // 2    # `num_sampled` argument of the NCE loss

# Training length and reporting cadence, in iterations.
generations = 50000
print_loss_every = 500
print_valid_every = 2000

# English stop words from NLTK.
nltk.download('stopwords')
stops = stopwords.words('english')

# Probe words whose nearest neighbours are printed during training.
valid_words = ['cliche', 'love', 'hate', 'silly', 'sad']

In [None]:
# Display the stop-word list.
stops

In [None]:
def load_movies_data():
    """Load the sentence-polarity movie review dataset.

    The two review files are read from ``../../datasets/movies_data`` when both
    are present there. Otherwise the archive is downloaded, and the cleaned
    reviews are cached in that folder for later calls.

    In both cases every character outside ASCII is dropped, so the cached and
    the freshly downloaded data are identical.

    Returns:
        tuple: ``(texts, target)``. ``texts`` is the list of review lines
        (positive reviews first, line endings kept); ``target`` holds ``1``
        for each positive review and ``0`` for each negative one.

    Raises:
        urllib.error.URLError: if the files are not cached and the download
            fails.
    """
    save_folder_name = "../../datasets/movies_data"
    pos_file = os.path.join(save_folder_name, 'rt-polarity.pos')
    neg_file = os.path.join(save_folder_name, 'rt-polarity.neg')

    def to_ascii(text):
        # Drop every non-ASCII character instead of failing on it.
        return text.encode('ascii', errors='ignore').decode()

    def read_cached(path):
        # ISO-8859-1 can decode any byte, so hand-copied raw files load too.
        with open(path, 'r', encoding='ISO-8859-1') as cached:
            return [to_ascii(row) for row in cached]

    def write_cache(path, rows):
        with open(path, 'w', encoding='ascii') as cached:
            cached.write(''.join(rows))

    # Check the files themselves: the folder may exist without them (for
    # example after an interrupted first run).
    if os.path.isfile(pos_file) and os.path.isfile(neg_file):
        pos_data = read_cached(pos_file)
        neg_data = read_cached(neg_file)
    else:
        # NOTE(review): the archive is fetched over plain HTTP.
        url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz"
        tmp = io.BytesIO()
        # Context managers close the stream and the archive even on error.
        with urllib.request.urlopen(url) as stream_data:
            for chunk in iter(lambda: stream_data.read(16384), b''):
                tmp.write(chunk)
        tmp.seek(0)

        with tarfile.open(fileobj=tmp, mode='r:gz') as tar_file:
            pos = tar_file.extractfile('rt-polaritydata/rt-polarity.pos')
            neg = tar_file.extractfile('rt-polaritydata/rt-polarity.neg')
            pos_data = [to_ascii(line.decode('ISO-8859-1')) for line in pos]
            neg_data = [to_ascii(line.decode('ISO-8859-1')) for line in neg]

        os.makedirs(save_folder_name, exist_ok=True)
        write_cache(pos_file, pos_data)
        write_cache(neg_file, neg_data)

    texts = pos_data + neg_data
    target = [1]*len(pos_data) + [0]*len(neg_data)
    return (texts, target)

In [None]:
# Load the review lines and their 1/0 polarity labels.
texts, target = load_movies_data()

In [None]:
# Peek at the first review...
texts[0]

In [None]:
# ...and at its label.
target[0]

## Limpieza de datos

In [None]:
def normalize_text(texts, stops):
    """Lower-case sentences, strip punctuation and digits, drop stop words.

    Args:
        texts: Iterable of raw sentences (strings).
        stops: Iterable of stop words to remove; matched against the
            lower-cased, punctuation-free words.

    Returns:
        list: One cleaned string per input sentence, with the remaining words
        separated by single spaces.
    """
    # One translation table deletes punctuation and digits in a single pass.
    unwanted = str.maketrans('', '', string.punctuation + '0123456789')
    # Set membership is O(1); a list would be rescanned for every word.
    stop_set = set(stops)
    # NOTE: punctuation is removed before stop words are matched, so a
    # contraction such as "don't" becomes "dont" and is not filtered by the
    # stop word "don't". This mirrors the original behaviour.
    return [
        ' '.join(
            word
            for word in text.lower().translate(unwanted).split()
            if word not in stop_set
        )
        for text in texts
    ]

In [None]:
# Clean every review in place of the raw text.
texts = normalize_text(texts, stops)

In [None]:
# First review after cleaning.
texts[0]

In [None]:
# Number of reviews before short ones are dropped.
len(texts)

In [None]:
# Keep only the sentences that still have more than two words, and filter the
# labels in lock-step. Both right-hand sides are evaluated against the
# unfiltered `texts` before either name is rebound.
target, texts = (
    [target[position] for position, sentence in enumerate(texts)
     if len(sentence.split()) > 2],
    [sentence for sentence in texts if len(sentence.split()) > 2],
)

In [None]:
# Number of reviews left after dropping the short ones.
len(texts)

In [None]:
def build_dictionary(sentences, vocabulary_size):
    """Map the most frequent words to consecutive integer ids.

    Id 0 is reserved for the placeholder 'RARE'; the ``vocabulary_size - 1``
    most common words of ``sentences`` follow in order of decreasing
    frequency.

    Args:
        sentences: Iterable of whitespace-separated strings.
        vocabulary_size: Total number of entries wanted, 'RARE' included.

    Returns:
        dict: word -> id.
    """
    frequencies = collections.Counter(
        token for sentence in sentences for token in sentence.split()
    )
    vocabulary = ['RARE']
    vocabulary += [token for token, _ in frequencies.most_common(vocabulary_size - 1)]

    word_dict = {}
    for token in vocabulary:
        # The next free id is the current size of the dictionary.
        word_dict[token] = len(word_dict)
    return word_dict

In [None]:
# Build the word -> id vocabulary from the cleaned reviews.
word_dict = build_dictionary(texts, vocabulary_size)

In [None]:
# Display the vocabulary.
word_dict

In [None]:
def text_to_numbers(sentences, word_dict):
    """Convert sentences into lists of word ids.

    Args:
        sentences: Iterable of sentences. Each one is either a
            whitespace-separated string or an already tokenised sequence of
            words.
        word_dict: Mapping word -> id, where id 0 stands for RARE.

    Returns:
        list: One list of ids per sentence; words missing from ``word_dict``
        get id 0.
    """
    rare_ix = 0  # id of the RARE placeholder
    data = []
    for sentence in sentences:
        # Iterating a str yields single characters, so split it into words.
        words = sentence.split() if isinstance(sentence, str) else sentence
        data.append([word_dict.get(word, rare_ix) for word in words])
    return data

In [None]:
# Reverse lookup table: id -> word.
word_dict_rev = {index: word for word, index in word_dict.items()}
word_dict_rev

In [None]:
# Encode every cleaned review as a list of ids.
text_data = text_to_numbers(texts, word_dict)

In [None]:
# First encoded review.
text_data[0]

In [None]:
# Ids of the probe words; raises KeyError if a probe word is not in word_dict.
valid_examples = [word_dict[x] for x in valid_words]

In [None]:
# Display the probe-word ids.
valid_examples

In [None]:
def generate_batch_data(sentences, batch_size, window_size, method = 'skip_gram'):
    '''
    Build one random batch of (input word, context word) id pairs.

    Sentences are drawn at random, with replacement, until enough pairs have
    been collected. For every position of a drawn sentence, the word at that
    position is the input and each word up to ``window_size`` positions to
    its left or right is one label.

    Example with window_size=2 and the input word "come" in
    "Mi perro come su comida":
        (come, Mi), (come, perro), (come, su), (come, comida)

    Args:
        sentences: List of sentences, each a sequence of word ids.
        batch_size: Number of pairs to return.
        window_size: Context words taken on each side of the input word.
        method: Only 'skip_gram' is implemented.

    Returns:
        tuple: ``(batch_data, label_data)``, NumPy arrays of shape
        ``(batch_size,)`` and ``(batch_size, 1)``.

    Raises:
        ValueError: if ``method`` is not implemented, or if no pair can ever
            be produced (``window_size`` below 1, or no sentence with at
            least two words).
    '''
    if method != 'skip_gram':
        raise ValueError("Método {} no implementado".format(method))

    if batch_size > 0:
        # Without these guards the sampling loop below could never finish.
        if window_size < 1:
            raise ValueError("window_size must be at least 1")
        if not any(len(sentence) > 1 for sentence in sentences):
            raise ValueError("sentences must contain a sentence with at least two words")

    batch_data = []
    label_data = []

    while len(batch_data) < batch_size:
        # Draw by index: np.random.choice cannot sample from a list of
        # lists of unequal (or all equal) length.
        sentence = list(sentences[np.random.randint(len(sentences))])
        for ix, input_word in enumerate(sentence):
            # Neighbours on both sides, clipped at the sentence start.
            context = (sentence[max(ix - window_size, 0):ix]
                       + sentence[ix + 1:ix + window_size + 1])
            # A one-word sentence has no context and simply adds nothing.
            batch_data.extend([input_word] * len(context))
            label_data.extend(context)

    # The last sentence may overshoot; trim to the requested size.
    batch_data = np.array(batch_data[:batch_size])
    label_data = np.array(label_data[:batch_size]).reshape(-1, 1)

    return (batch_data, label_data)

## Entrenar con los Skip Grams

In [None]:
# Embedding matrix: one row of `embedding_size` values per vocabulary id,
# initialised from a uniform distribution between -1 and 1.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1, 1))
# Placeholders fed at every step: input word ids and one label id per input.
x_inputs = tf.placeholder(tf.int32, shape =[batch_size])
y_target = tf.placeholder(tf.int32, shape = [batch_size,1])
# Ids of the probe words as a constant tensor.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [None]:
# Rows of `embeddings` selected by the input word ids.
embed = tf.nn.embedding_lookup(embeddings, x_inputs)

In [None]:
# Weights and biases of the NCE output layer, one row / entry per vocabulary id.
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], 
                                              stddev=1.0/np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [None]:
# Mean NCE loss over the batch, using `num_sampled` sampled classes.
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                                    inputs=embed, labels=y_target, 
                                     num_sampled = num_sampled, num_classes=vocabulary_size))

In [None]:
# Divide every embedding row by its L2 norm, then multiply the probe rows by
# all rows: entry (j, k) is the cosine similarity of probe word j and word k.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),1, keepdims=True))
normalized_embeddings = embeddings/norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [None]:
# Plain gradient descent on the NCE loss with learning rate 1.0.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

In [None]:
# Initialise every variable defined so far in the session.
init = tf.global_variables_initializer()
session.run(init)

In [None]:
# Train the embeddings, recording the loss periodically and printing the
# nearest neighbours of the probe words from time to time.
loss_vect = []      # recorded loss values
loss_x_vect = []    # iteration number of each recorded loss
top_k = 10          # neighbours listed per probe word
for i in range(generations):
    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size)
    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}
    session.run(optimizer, feed_dict=feed_dict)

    if (i+1) % print_loss_every == 0:
        loss_val = session.run(loss, feed_dict=feed_dict)
        loss_vect.append(loss_val)
        loss_x_vect.append(i+1)
        # The second placeholder was missing, so the loss was never shown.
        print("Iteración {}, Pérdida: {}".format(i+1, loss_val))

    # Nearest words to each of the selected probe words.
    if (i+1) % print_valid_every == 0:
        sim = session.run(similarity, feed_dict=feed_dict)
        for j in range(len(valid_words)):
            valid_word = word_dict_rev[valid_examples[j]]
            # Sort by decreasing similarity and skip position 0, which is
            # expected to be the probe word itself.
            nearest = (-sim[j,:]).argsort()[1:top_k+1]
            log_string = "Palabras cercanas a {}:".format(valid_word)
            for neighbour_ix in nearest:
                close_word = word_dict_rev[neighbour_ix]
                log_string = "%s %s, "%(log_string, close_word)
            print(log_string)

In [None]:
# Plot the recorded loss values against their iteration numbers.
plt.plot(loss_x_vect, loss_vect, 'k-', label = "Función de pérdidas")

In [None]:
# First cleaned review again.
texts[0]

In [None]:
# Vocabulary ids of five words (presumably taken from the review above — verify).
word_dict["rock"], word_dict["destined"], word_dict["st"], word_dict["new"], word_dict["conan"]

- Si tomamos la palabra "destined" como objetivo, es la número 2457
- Con una ventana de dimensión 1 -> 535 y 2251

In [None]:
# Evaluate the embedding matrix in the session to get its current values.
M = session.run(embeddings)

In [None]:
# Embedding row 2457, the id the notes above give for "destined".
# NOTE(review): hard-coded id; it is only right if word_dict["destined"] == 2457 in this run.
M[2457]

In [None]:
# Embedding row 535, one of the context ids quoted in the notes above.
M[535]