## Continuando con caso de estudio: Amazon-Yelp-Imdb

In [21]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

### **1. Cargar datos**

In [22]:
# Load the three review files into two-column frames (review, label).
# The Amazon and Yelp files are tab-separated; the IMDB file is split on a
# run of 3 or more whitespace characters instead, which is a regex separator
# and therefore needs the python parsing engine.
shared_read_opts = dict(names=['review','label'], header=None, encoding='utf-8')

dfa = pd.read_csv('amazon5.txt', sep='\t', **shared_read_opts)
dfi = pd.read_csv('imdb5.txt', sep=r'\s{3,}', engine='python', **shared_read_opts)
dfy = pd.read_csv('yelp5.txt', sep='\t', **shared_read_opts)

# Report the (rows, columns) shape of each frame.
for caption, frame in (
    ('Total de registros Amazon: ', dfa),
    ('Total de registros IMDB: ', dfi),
    ('Total de registros Yelp: ', dfy),
):
  print(caption, frame.shape)

Total de registros Amazon:  (1000, 2)
Total de registros IMDB:  (1000, 2)
Total de registros Yelp:  (1000, 2)


In [23]:
# Stack the Amazon, IMDB and Yelp frames into a single frame;
# ignore_index=True renumbers the rows instead of keeping each frame's own index.
df = pd.concat(
    objs=(dfa, dfi, dfy),
    ignore_index=True,
)

# Print the column / non-null / dtype summary of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3000 non-null   object
 1   label   3000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB


### **2. Limpieza y lematización**

In [24]:
# Stop-word sets keyed by language, filled on first use so the NLTK list is
# fetched once instead of being rebuilt for every token.
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
  """Return the NLTK stop words for `language` as a frozenset, fetched only once."""
  if language not in _STOPWORD_CACHE:
    _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
  return _STOPWORD_CACHE[language]


def clean_tok(doc, stop_words=None):
  """Clean one raw review and split it into lowercase word tokens.

  Steps:
    1. Every character that is not an ASCII letter or whitespace
       (punctuation, digits, special characters) is replaced by a space.
    2. The text is lowercased and split on whitespace.
    3. Stop words and tokens of length 1 are dropped.

  Args:
    doc: the review text (str).
    stop_words: optional iterable of lowercase words to drop. When None
      (the default) the NLTK English stop-word list is used.

  Returns:
    List of the remaining tokens, in their original order.
  """
  excluded = _stopword_set() if stop_words is None else frozenset(stop_words)

  # Only letters and whitespace survive this substitution, so a plain
  # whitespace split afterwards yields exactly the runs of letters.
  letters_only = re.sub(r'[^A-Za-z\s]', ' ', doc)
  words = letters_only.lower().split()

  # Keep tokens longer than one character that are not stop words.
  return [w for w in words if len(w) > 1 and w not in excluded]

In [25]:
# Pull the review text and its label out of df as two separate Series.
X, Y = df['review'], df['label']

In [26]:
# Run clean_tok over every review in X, producing one token list per review.
Xcleantok = list(map(clean_tok, X))

Observamos los primeros registros:

In [27]:
# Show the token lists of the first ten reviews, one list per line.
for tokens in Xcleantok[:10]:
  print(tokens)

['way', 'plug', 'us', 'unless', 'go', 'converter']
['good', 'case', 'excellent', 'value']
['great', 'jawbone']
['tied', 'charger', 'conversations', 'lasting', 'minutes', 'major', 'problems']
['mic', 'great']
['jiggle', 'plug', 'get', 'line', 'right', 'get', 'decent', 'volume']
['several', 'dozen', 'several', 'hundred', 'contacts', 'imagine', 'fun', 'sending', 'one', 'one']
['razr', 'owner', 'must']
['needless', 'say', 'wasted', 'money']
['waste', 'money', 'time']


### Lematización

In [28]:
# Module-level WordNet lemmatizer instance; lemmatizer() below looks it up by this name.
wnl = WordNetLemmatizer() 

def lemmatizer(doc):
  """Lemmatize every token of `doc` with the module-level lemmatizer `wnl`.

  Each token goes through `wnl.lemmatize` four times in a row, the result of
  one pass feeding the next, using the part-of-speech codes 'v' (verb),
  'a' (adjective), 'n' (noun) and 'r' (adverb), in that order. No POS tagging
  is performed: every token is tried under all four codes.

  NOTE(review): the cell outputs in this notebook show 'us' becoming 'u' and
  'needless' becoming 'needle', which looks like a consequence of treating
  every token as a noun. POS-aware lemmatization would probably avoid it.

  Args:
    doc: iterable of word tokens (str).

  Returns:
    List of lemmatized tokens, one per input token, in the same order.
  """
  lemmas = []
  for token in doc:
    # Chain the four POS passes on this token: verb, adjective, noun, adverb.
    for pos in ('v', 'a', 'n', 'r'):
      token = wnl.lemmatize(token, pos)
    lemmas.append(token)
  return lemmas

In [29]:
# Run lemmatizer over every token list in Xcleantok.
Xclean = list(map(lemmatizer, Xcleantok))

In [30]:
# Show the lemmatized token lists of the first ten reviews, one list per line.
for lemmas in Xclean[:10]:
    print(lemmas)

['way', 'plug', 'u', 'unless', 'go', 'converter']
['good', 'case', 'excellent', 'value']
['great', 'jawbone']
['tie', 'charger', 'conversation', 'last', 'minute', 'major', 'problem']
['mic', 'great']
['jiggle', 'plug', 'get', 'line', 'right', 'get', 'decent', 'volume']
['several', 'dozen', 'several', 'hundred', 'contact', 'imagine', 'fun', 'send', 'one', 'one']
['razr', 'owner', 'must']
['needle', 'say', 'waste', 'money']
['waste', 'money', 'time']


### **3. Train-Validation-Test split**

In [34]:
# train_test_split is imported in the notebook's import cell at the top.

# Split parameters, named so the resulting 70 / 15 / 15 partition is explicit.
HOLDOUT_FRACTION = 0.30        # share of all rows kept out of the training set
TEST_SHARE_OF_HOLDOUT = 0.50   # the hold-out is halved into validation and test
SPLIT_SEED = 42                # fixed seed so the shuffled split is reproducible

# First cut: training set vs. a temporary hold-out (validation + test together).
x_train, x_val_test, y_train, y_val_test = train_test_split(
    Xclean, Y, test_size=HOLDOUT_FRACTION, shuffle=True, random_state=SPLIT_SEED)

# Second cut: divide the hold-out into validation and test.
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test, y_val_test, test_size=TEST_SHARE_OF_HOLDOUT, shuffle=True, random_state=SPLIT_SEED)

# Sanity checks: the three parts cover every row, and features/labels stay paired.
assert len(x_train) + len(x_val) + len(x_test) == len(Xclean), 'split lost or duplicated rows'
assert (len(x_train), len(x_val), len(x_test)) == (len(y_train), len(y_val), len(y_test)), \
    'features and labels have different lengths'

# Report the size of the training, validation and test sets.
print('X,y Train:', len(x_train), len(y_train))
print('X,y Val:', len(x_val), len(y_val))
print('X,y Test', len(x_test), len(y_test))

X,y Train: 2100 2100
X,y Val: 450 450
X,y Test 450 450
