# Proyecto Sistemas Computacionales A-2018: Preprocesamiento del dataset
---

INFO


## Instalación de algunos módulos requeridos

In [16]:
!pip install gensim  # For the Word2Vec model
!pip install tqdm    # Just for using a progress bar
!pip install bokeh   # For graphs



## Importación de todos los módulos requeridos

In [64]:
import pandas as pd
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle
import io
import csv

import gensim
from gensim.models.word2vec import Word2Vec
from gensim.utils import simple_preprocess

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale

from google.colab import files

# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

pd.options.mode.chained_assignment = None
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Subida de dataset al servidor

In [20]:
uploaded = files.upload()

Saving labeled_training_data.csv to labeled_training_data.csv


## Carga del dataset

In [0]:
stopWords = nltk.corpus.stopwords.words('spanish')

def clean_sentence(sentence):
    tokens = [word for word in simple_preprocess(sentence)
              if word not in stopWords]
    
    return tokens 
        

In [57]:
def ingest(datasetFileName):    
    data = pd.read_csv(datasetFileName, header = None)
    data.columns = ['sentence', 'sentiment']
    data['sentiment'] = data['sentiment'].map({
                                                'positivo': 1,
                                                'neutral': 0,
                                                'negativo': -1
                                              })
    data['tokens'] = data['sentence'].progress_map(clean_sentence)
    data.reset_index(inplace = True)
    data.drop('index', axis=1, inplace=True)
    print 'dataset loaded with shape', data.shape    
    return data

data = ingest('labeled_training_data.csv')
data.head(5)


progress-bar:   0%|          | 0/186 [00:00<?, ?it/s][A
progress-bar: 100%|██████████| 186/186 [00:00<00:00, 14051.52it/s][A

dataset loaded with shape (186, 3)


Unnamed: 0,sentence,sentiment,tokens
0,Me siento muy afortunado de tenerte aquí justo...,1,"[siento, afortunado, tenerte, aquí, justo, lado]"
1,Odio a aquellos profesores que creen sabérsela...,-1,"[odio, aquellos, profesores, creen, sabérselas..."
2,El día de hoy se puede ir a hacer compras en e...,0,"[día, hoy, puede, ir, hacer, compras, supermer..."
3,"Hoy quiero conquistar el mundo, tengo mucha en...",1,"[hoy, quiero, conquistar, mundo, mucha, energí..."
4,"Ya no vale la pena seguir intentando, puesto q...",-1,"[vale, pena, seguir, intentando, puesto, enten..."


In [0]:
trainSentences, testSentences, trainLabels, testLabels = \
    train_test_split(np.array(data.tokens),
                     np.array(data.sentiment), test_size = 0.3)

In [59]:
vectorDimension = 200
wordsModel = Word2Vec(trainSentences, size = vectorDimension, min_count = 2,
                      window = 10)
wordsModel.train([train for train in tqdm(trainSentences)],
                 total_examples = len(trainSentences),
                 epochs = 10)


  0%|          | 0/130 [00:00<?, ?it/s][A
100%|██████████| 130/130 [00:00<00:00, 49300.14it/s][A

(1041, 6500)

In [60]:
wordsModel['bien']

  """Entry point for launching an IPython kernel.


array([ 1.0232498e-03,  6.9398759e-04, -1.8653905e-04, -3.6000181e-04,
        2.0625854e-03, -5.3168408e-04, -1.6409608e-03,  1.8015403e-04,
       -1.8961824e-03, -2.1026081e-03, -5.7109806e-04, -2.3602729e-03,
        2.3274859e-03,  2.9960211e-04, -2.0583575e-03,  3.8964325e-05,
       -9.8593638e-04,  1.2868971e-03, -8.7206409e-04,  1.5252185e-03,
       -4.1969289e-04,  8.1185449e-04, -1.7734757e-04, -2.0375694e-04,
        2.1645408e-03,  1.6580237e-04, -3.7743168e-05, -1.9019807e-03,
       -1.1907440e-03,  6.8818836e-04,  1.8039216e-04,  1.0422438e-03,
        7.2743371e-04, -2.1839109e-03,  1.8181158e-03, -1.9909102e-03,
        1.3116513e-03,  2.4253884e-03, -1.4481627e-03, -1.0643147e-03,
        1.1686032e-03, -2.3981811e-04, -3.0734329e-04,  5.0251384e-04,
        7.9056838e-07,  2.4301114e-03, -1.8641945e-03, -9.6055458e-04,
       -1.8880868e-03,  9.7708160e-04,  1.2691772e-04, -2.2908957e-03,
        8.5528554e-05,  1.1422606e-03, -2.8600832e-04,  1.1409007e-03,
      

In [61]:
wordsModel.most_similar('bien')

  """Entry point for launching an IPython kernel.


[(u'depende', 0.20033536851406097),
 (u'hijos', 0.1715807169675827),
 (u'salir', 0.14998483657836914),
 (u'hacer', 0.11825676262378693),
 (u'facultad', 0.11707353591918945),
 (u'pasando', 0.11033233255147934),
 (u'ayer', 0.10704310983419418),
 (u'bueno', 0.09780880808830261),
 (u'necesario', 0.09175516664981842),
 (u'casa', 0.09088844060897827)]

In [62]:
# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width = 700, plot_height = 600,
                       title = 'Map of words vectors',
                       tools = 'pan, wheel_zoom, box_zoom, reset, hover, '
                               'previewsave',
                       x_axis_type = None, y_axis_type = None, min_border = 1)

# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [wordsModel[w] for w in wordsModel.wv.vocab.keys()[:5000]]

# dimensionality reduction. converting the vectors to 2d vectors

tsne_model = TSNE(n_components = 2, verbose = 1, random_state = 0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns = ['x', 'y'])
tsne_df['words'] = wordsModel.wv.vocab.keys()[:5000]

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x = 'x', y = 'y', source = tsne_df)
hover = plot_tfidf.select(dict(type = HoverTool))
hover.tooltips = {"word": "@words"}
show(plot_tfidf)

  if __name__ == '__main__':


[t-SNE] Computing 90 nearest neighbors...
[t-SNE] Indexed 91 samples in 0.000s...
[t-SNE] Computed neighbors for 91 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 91 / 91
[t-SNE] Mean sigma: 0.006286
[t-SNE] KL divergence after 250 iterations with early exaggeration: 67.165474
[t-SNE] Error after 1000 iterations: 0.968653


In [63]:
print 'building tf-idf matrix ...'
vectorizer = TfidfVectorizer(analyzer = lambda x: x, min_df = 10)
matrix = vectorizer.fit_transform([sentence
                                   for sentence in tqdm(trainSentences)])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print 'vocab size :', len(tfidf)


  0%|          | 0/130 [00:00<?, ?it/s][A
100%|██████████| 130/130 [00:00<00:00, 75187.47it/s][A

building tf-idf matrix ...
vocab size : 1


In [0]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += wordsModel[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [69]:
trainSentencesVectors = np.concatenate([buildWordVector(w, vectorDimension)
                                       for w in tqdm(trainSentences)])
trainSentencesVectors = scale(trainSentencesVectors)

testSentencesVectors = np.concatenate([buildWordVector(w, vectorDimension)
                                      for w in tqdm(testSentences)])
testSentencesVectors = scale(testSentencesVectors)


  

100%|██████████| 130/130 [00:00<00:00, 6230.26it/s][A
  0%|          | 0/56 [00:00<?, ?it/s][A
100%|██████████| 56/56 [00:00<00:00, 8314.67it/s][A

In [71]:
print(len(trainSentences))
print(len(trainSentencesVectors))
print(len(testSentences))
print(len(testSentencesVectors))
trainSentencesVectors[0]

130
130
56
56


array([ 0.28867513, -0.28867513, -0.28867513,  0.28867513, -0.28867513,
       -0.28867513,  0.28867513,  0.28867513,  0.28867513, -0.28867513,
        0.28867513,  0.28867513, -0.28867513, -0.28867513,  0.28867513,
        0.28867513,  0.28867513,  0.28867513, -0.28867513,  0.28867513,
        0.28867513, -0.28867513, -0.28867513,  0.28867513,  0.28867513,
        0.28867513,  0.28867513,  0.28867513,  0.28867513, -0.28867513,
       -0.28867513,  0.28867513,  0.28867513,  0.28867513, -0.28867513,
        0.28867513, -0.28867513, -0.28867513, -0.28867513,  0.28867513,
       -0.28867513,  0.28867513, -0.28867513, -0.28867513, -0.28867513,
       -0.28867513, -0.28867513,  0.28867513,  0.28867513, -0.28867513,
       -0.28867513,  0.28867513, -0.28867513,  0.28867513,  0.28867513,
       -0.28867513, -0.28867513,  0.28867513, -0.28867513,  0.28867513,
        0.28867513,  0.28867513,  0.28867513,  0.28867513, -0.28867513,
        0.28867513,  0.28867513, -0.28867513,  0.28867513,  0.28