In [23]:
%config IPCompleter.greedy = True
%autosave 60

Autosaving every 60 seconds


## 1. NLP

### Cargar archivo

In [24]:
import pandas as pd
import os
ruta_archivo = os.path.join("imdb_dataset.csv")
df_criticas = pd.read_csv(ruta_archivo, encoding='iso-8859-2').sample(100, replace=False)
# df_criticas = pd.read_csv(ruta_archivo, encoding='iso-8859-2')
df_criticas

Unnamed: 0,Review,Label
21899,"If you enjoy riddles and suspense, you will en...",pos
34483,"<br /><br />Back in his youth, the old man had...",neg
1072,The trailer for this film promised a new twist...,neg
38622,- Having grown tired of the rat race and cramp...,pos
34495,"From the opening dialog and scenes, I knew I k...",neg
...,...,...
29975,"I've seen other Guinea Pig films, like Mermaid...",neg
40324,"Preston Waters, a 11 years old boy,has problem...",pos
645,Here's a spoof that's guaranteed to entertain ...,neg
20629,"Have just seen the last episode, No 32, (thoug...",pos


### Palabras de parada

In [25]:
import nltk
import string
# nltk.download('punkt')
# nltk.download('stopwords')
palabras_de_parada_ingles = set(nltk.corpus.stopwords.words('english') + list(string.punctuation) + ['...'])
palabras_de_parada_ingles

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '...',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself'

### Normalizar el texto

In [26]:
# pip install pycontractions

In [27]:
import re

def clean_html(raw_html):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def normalizar_critica(critica):
    tokens = nltk.tokenize.casual.casual_tokenize(critica, "english")
    tokens_normalizados = [token.lower() for token in tokens if token not in palabras_de_parada_ingles]
    return " ".join(tokens_normalizados)

def normalizar_fila(fila):
    nueva_fila = fila
    critica_normalizada = clean_html(fila['Review'])
    critica_normalizada = normalizar_critica(critica_normalizada)
    nueva_fila['Review'] = critica_normalizada
    return nueva_fila

In [28]:
df_criticas_normalizadas = df_criticas.apply(normalizar_fila, axis=1)
df_criticas_normalizadas

Unnamed: 0,Review,Label
21899,if enjoy riddles suspense enjoy movie truth to...,pos
34483,back youth old man wanted marry first cousin f...,neg
1072,the trailer film promised new twist zombie gen...,neg
38622,having grown tired rat race cramped living con...,pos
34495,from opening dialog scenes i knew i knew i tra...,neg
...,...,...
29975,i've seen guinea pig films like mermaid manhol...,neg
40324,preston waters 11 years old boy problems paren...,pos
645,here's spoof that's guaranteed entertain folks...,neg
20629,have seen last episode no 32 though site says ...,pos


### Obtener el vocabulario del problema

In [29]:
def obtener_vocabulario_problema():
    cuerpo_texto_normalizado = ' '.join(df_criticas_normalizadas['Review'].tolist())
    vocabulario_problema = cuerpo_texto_normalizado.split()
    vocabulario_ordenado = sorted(set(vocabulario_problema))
    return vocabulario_ordenado

def obtener_vocabulario_problema_y_posicion(vocabulario_del_problema_ordenado):
    vocabulario_y_posicion = {}
    for i, token in enumerate(vocabulario_del_problema_ordenado):
        vocabulario_y_posicion[token] = i
    return vocabulario_y_posicion

In [30]:
vocabulario_problema = obtener_vocabulario_problema_y_posicion(obtener_vocabulario_problema())
#vocabulario_problema.keys()

### One Hot Encoding

In [31]:
import numpy as np

def vector_one_hot(critica, vocabulario_problema_y_posicion):
    one_hot_vector_vector = np.zeros(len(vocabulario_problema_y_posicion), dtype=int)
    for token in critica.split():
        one_hot_vector_vector[vocabulario_problema_y_posicion[token]] = 1
    return one_hot_vector_vector

In [47]:
one_hots = []
for index, row in df_criticas_normalizadas.iterrows():
    one_hots.append(vector_one_hot(row['Review'], vocabulario_problema))
    #one_hots.append(Series(index=row.index, data=vector_one_hot(row['Review'], vocabulario_problema))


##one_hots = pd.Series(data=vector_one_hot(), index=df_criticas)

## 2. Crear y entrenar el modelo predictivo

In [48]:
df_change_name = pd.DataFrame(columns=vocabulario_problema.keys(), data=one_hots, index=df_criticas.index)
df_change_name["target"] = df_criticas["Label"]
df_change_name
#df_criticas["Label"]
#df_change_name["target"]

Unnamed: 0,):,);,.,..,0/5,1,1-10,1.5,1/10,10,...,zombie-baby,zombie-hunting,zombie.ok,zombies,zooms,zu,¨,×,â,target
21899,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
34483,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neg
1072,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,neg
38622,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
34495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29975,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,neg
40324,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,neg
20629,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos


AttributeError: module 'pandas' has no attribute 'set_printoptions'