# Preprocesado
Como vimos en la exploración visual del dataset, nuestros datos vienen con bastante ruido, así que vamos a realizar una limpieza del dataset.

In [1]:
!pip install num2words
!pip install unidecode
!pip install nltk



In [2]:
#Importamos las librerías a utilizar
import pandas as pd
import string
from num2words import num2words
import  re
from unidecode import unidecode
from collections import Counter
import numpy as np
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
import nltk

In [3]:
# Download the WordNet corpus used by WordNetLemmatizer; without it the
# lemmatizer fails on a fresh kernel. Re-running is harmless when the package
# is already up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the data; the first CSV column is used as the DataFrame index
data = pd.read_csv('./data.csv', sep = ",", index_col=0 )


In [5]:
data

Unnamed: 0,Comentario,Valoracion,Sentiment
164848,There isn't much to this set and although it l...,2,0
5130,This is often the game my kids pick when they ...,5,1
83939,These were a huge hit at our carnival themed b...,5,1
61318,My 3 yr old great-niece is in love. This is j...,5,1
134000,I got this for $20 cheaper on Amazon than at T...,3,0
...,...,...,...
81120,"I bought this toy as a gift, and I was quite e...",4,1
81732,My 1 year old grandson loves this little wagon...,4,1
39790,The fumes are really bad from this toy. Wow. M...,1,0
119895,"A big fan of Ravensburger puzzles and games, I...",2,0


### Preprocesamiento
- Limpiaremos las frases (convertiremos a minúsculas y quitaremos signos de puntuación, espacios en blanco y tildes)
- Transformaremos los dígitos a texto
- Quitaremos las stopwords
- Realizaremos una lematización para reducir la variabilidad de las palabras por su derivación

In [47]:
# Test sentence (mixes punctuation, digits, an accent and upper case) used to
# try out each preprocessing step
x = "Hi, this is prueba2 or 3 because I try to work my préproCEsing"

In [6]:
# Sentence-cleaning function
def clean_sentence(sentence):
  """Split a sentence into cleaned, lower-case ASCII tokens.

  The text is transliterated to ASCII *before* lower-casing and punctuation
  removal, so that anything unidecode produces (for example an ASCII
  apostrophe from a curly quote, or upper-case letters from a currency
  symbol) goes through the same clean-up as the rest of the text.

  Args:
    sentence: Text to clean.

  Returns:
    List of non-empty tokens in their original order, lower-cased and with
    every character of ``string.punctuation`` removed.
  """
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = unidecode(sentence).lower()  # Drop accents first, then lower-case
  list_words = []
  for word in text.split():
    # Punctuation is deleted, not replaced by a space: "isn't" -> "isnt"
    word = re.sub(punctuation_pattern, '', word)
    if word != '':  # Tokens made only of punctuation disappear
      list_words.append(word)
  return list_words


In [48]:
clean_sentence(x)

['hi',
 'this',
 'is',
 'prueba2',
 'or',
 '3',
 'because',
 'i',
 'try',
 'to',
 'work',
 'my',
 'preprocesing']

In [7]:
# Function that turns digits into number words
def clean_digit(word_list):
  """Replace the digits inside each token with English number words.

  For every token, each run of consecutive ASCII digits (0-9) is spelled out
  with num2words and the spelled-out text is passed through clean_sentence,
  which may yield several tokens. The number words of a token are emitted
  first (left to right), followed by what is left of the token once its
  digits are removed, e.g. "prueba2" -> ["two", "prueba"].

  Only ASCII digits are treated as numbers: characters such as superscripts
  satisfy ``str.isdigit()`` but cannot be parsed as a number, so they are
  left untouched inside the token.

  Args:
    word_list: Iterable of tokens (strings).

  Returns:
    New list of tokens that contain no ASCII digits.
  """
  list_words = []
  for word in word_list:
    # Words and numbers may be glued together, so look for digit runs anywhere
    for digits in re.findall('[0-9]+', word):
      number = num2words(int(digits), lang = 'en', ordinal = False)
      list_words.extend(clean_sentence(number))

    # Remove the digits from the token and keep the remainder, if any
    word = re.sub('[0-9]+', '', word)
    if word != '':
      list_words.append(word)
  return list_words

In [49]:
clean_digit(clean_sentence(x))

['hi',
 'this',
 'is',
 'two',
 'prueba',
 'or',
 'three',
 'because',
 'i',
 'try',
 'to',
 'work',
 'my',
 'preprocesing']

In [8]:
# Inspect the stop words: print how many there are, then display the set
print(len(STOP_WORDS))
STOP_WORDS

326


{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

Excluimos las palabras 'not' y 'cannot' de la lista de stopwords (es decir, las conservamos en el texto), ya que tienen una carga negativa que puede afectar al sentido de la frase y al sentimiento.

In [27]:
# Stop-word removal function
def clean_stopwords(sentence, list_stopwords, keep_words=('not', 'cannot')):
  """Remove stop words from a list of tokens.

  Args:
    sentence: Iterable of tokens (words).
    list_stopwords: Collection of stop words to drop.
    keep_words: Iterable of words that are always kept, even when they appear
      in list_stopwords. Defaults to the negations 'not' and 'cannot', whose
      removal could change the sentiment of the sentence.

  Returns:
    New list with the surviving tokens in their original order.
  """
  protected = frozenset(keep_words)
  return [word for word in sentence
          if word in protected or word not in list_stopwords]

In [51]:
clean_stopwords(clean_digit(clean_sentence(x)), STOP_WORDS)

['hi', 'prueba', 'try', 'work', 'preprocesing']

In [54]:
# Lemmatizer function
def lemmatice(sentence):
  """Lemmatize every token with NLTK's WordNetLemmatizer.

  Args:
    sentence: Iterable of tokens (words).

  Returns:
    List with the lemma of each token, in the same order as the input.
  """
  wordnet = WordNetLemmatizer()
  # lemmatize() is called without a part-of-speech tag, so NLTK's default applies
  return [wordnet.lemmatize(token) for token in sentence]

In [55]:
lemmatice(clean_stopwords(clean_digit(clean_sentence(x)), STOP_WORDS))

['hi', 'prueba', 'try', 'work', 'preprocesing']

### Pipeline final

In [56]:
def preprocessing(sentence):
  """Run the whole cleaning pipeline on one raw sentence.

  Steps, in order: clean_sentence -> clean_digit -> clean_stopwords (using
  STOP_WORDS) -> lemmatice.

  Args:
    sentence: Raw text to preprocess.

  Returns:
    The processed tokens joined into a single string with one space between
    them.
  """
  tokens = clean_sentence(sentence)
  tokens = clean_digit(tokens)
  tokens = clean_stopwords(tokens, list_stopwords=STOP_WORDS)
  return " ".join(lemmatice(tokens))

In [57]:
# Missing comments are replaced by an empty string before cleaning; casting NaN
# with str() would otherwise produce the literal token "nan".
data['Comentario_Limpio'] = data['Comentario'].fillna('').astype(str).apply(preprocessing)

In [60]:
# Compare the original comments with their cleaned version side by side
data[['Comentario', 'Comentario_Limpio']]

Unnamed: 0,Comentario,Comentario_Limpio
164848,There isn't much to this set and although it l...,isnt set look like connect monster jam hot whe...
5130,This is often the game my kids pick when they ...,game kid pick want play family board game son ...
83939,These were a huge hit at our carnival themed b...,huge hit carnival themed birthday party played...
61318,My 3 yr old great-niece is in love. This is j...,yr old greatniece love right size perfect pet ...
134000,I got this for $20 cheaper on Amazon than at T...,got cheaper amazon toy r glad defintely not wo...
...,...,...
81120,"I bought this toy as a gift, and I was quite e...",bought toy gift embarrassed packagebeing extre...
81732,My 1 year old grandson loves this little wagon...,year old grandson love little wagon pull walk ...
39790,The fumes are really bad from this toy. Wow. M...,fume bad toy wow daughter nose started run lik...
119895,"A big fan of Ravensburger puzzles and games, I...",big fan ravensburger puzzle game not believe a...


### Preparando los datos para el modelado

Para finalizar esta parte, vamos a crear un dataset con los datos limpios y preparados para el modelado.

In [62]:
# Keep only the cleaned text and the sentiment label, then write them to a CSV
# file for the modelling step
data_model = data[['Comentario_Limpio', 'Sentiment']]
data_model.to_csv('./data_model.csv', sep = ",")