In [23]:
%config IPCompleter.greedy = True
%autosave 60

Autosaving every 60 seconds


## 1. NLP

### Cargar archivo

In [1]:
import pandas as pd
import os
ruta_archivo = os.path.join("imdb_dataset.csv")
df_criticas = pd.read_csv(ruta_archivo, encoding='iso-8859-2').sample(100, replace=False)
# df_criticas = pd.read_csv(ruta_archivo, encoding='iso-8859-2')
df_criticas

Unnamed: 0,Review,Label
40945,Saw this film in August at the 27th Annual Nat...,pos
29766,It's impossible for me to objectively consider...,neg
6559,Pretty bad. This film about a grizzled(and fra...,neg
2366,I loved this movie 10 years ago when I was abo...,neg
46335,"Even though there are no new episodes, and it ...",pos
...,...,...
5803,...in a TV-movie 70's kind of way. It's one of...,neg
37516,"The Night Listener held my attention, with Rob...",pos
32925,I am a big movie fan. I like movies of all typ...,neg
41117,"Almost too well done... ""John Carpenter's Vamp...",pos


### Palabras de parada

In [2]:
import nltk
import string
# nltk.download('punkt')
# nltk.download('stopwords')
palabras_de_parada_ingles = set(nltk.corpus.stopwords.words('english') + list(string.punctuation) + ['...'])
palabras_de_parada_ingles

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '...',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself'

### Normalizar el texto

In [26]:
# pip install pycontractions

In [3]:
import re

def clean_html(raw_html):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def normalizar_critica(critica):
    tokens = nltk.tokenize.casual.casual_tokenize(critica, "english")
    tokens_normalizados = [token.lower() for token in tokens if token not in palabras_de_parada_ingles]
    return " ".join(tokens_normalizados)

def normalizar_fila(fila):
    nueva_fila = fila
    critica_normalizada = clean_html(fila['Review'])
    critica_normalizada = normalizar_critica(critica_normalizada)
    nueva_fila['Review'] = critica_normalizada
    return nueva_fila

In [4]:
df_criticas_normalizadas = df_criticas.apply(normalizar_fila, axis=1)
df_criticas_normalizadas

Unnamed: 0,Review,Label
40945,saw film august 27th annual national associati...,pos
29766,it's impossible objectively consider movie not...,neg
6559,pretty bad this film grizzled frankly rock stu...,neg
2366,i loved movie 10 years ago i 16 years old my b...,neg
46335,even though new episodes rarely showed i used ...,pos
...,...,...
5803,tv-movie 70 kind way it's one movies show wee ...,neg
37516,the night listener held attention robin willia...,pos
32925,i big movie fan i like movies types this argua...,neg
41117,almost well done john carpenter's vampires ent...,pos


### Obtener el vocabulario del problema

In [5]:
def obtener_vocabulario_problema():
    cuerpo_texto_normalizado = ' '.join(df_criticas_normalizadas['Review'].tolist())
    vocabulario_problema = cuerpo_texto_normalizado.split()
    vocabulario_ordenado = sorted(set(vocabulario_problema))
    return vocabulario_ordenado

def obtener_vocabulario_problema_y_posicion(vocabulario_del_problema_ordenado):
    vocabulario_y_posicion = {}
    for i, token in enumerate(vocabulario_del_problema_ordenado):
        vocabulario_y_posicion[token] = i
    return vocabulario_y_posicion

In [6]:
vocabulario_problema = obtener_vocabulario_problema_y_posicion(obtener_vocabulario_problema())
#vocabulario_problema.keys()

### One Hot Encoding

In [20]:
import numpy as np

def obtener_one_hot_vector(critica, vocabulario_problema_y_posicion):
    one_hot_vector_vector = np.zeros(len(vocabulario_problema_y_posicion), dtype=int)
    for token in critica.split():
        one_hot_vector_vector[vocabulario_problema_y_posicion[token]] = 1
    return one_hot_vector_vector

In [21]:
one_hots = []
indices = []
for index, row in df_criticas_normalizadas.iterrows():
    one_hot = obtener_one_hot_vector(row['Review'], vocabulario_problema)
    indices.append(index)
    one_hots.append(one_hot)

## 2. Crear y entrenar el modelo predictivo

In [29]:
df_criticas_one_hot_vector = pd.DataFrame(columns=vocabulario_problema.keys(), data=one_hots, index=indices)
df_criticas_one_hot_vector['Target'] = df_criticas_normalizadas['Label']
X = df_criticas_one_hot_vector.drop('Target', axis=1).to_numpy()
Y = df_criticas_one_hot_vector['Target'].ravel()

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(70, 4639) (70,)
(30, 4639) (30,)
