# Excercise 5
## NLP with Keras

Use keras framework to solve the below exercises.


In [1]:
import numpy as np
import keras
import pandas as pd
import matplotlib.pyplot as plt

## 5.1 Predict rating of a movie using Keras

**Exercise:** Use keras framework to predict rating.

In [2]:
dataTraining = pd.read_csv('https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)

In [6]:
dataTraining.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7895 entries, 3107 to 215
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    7895 non-null   int64  
 1   title   7895 non-null   object 
 2   plot    7895 non-null   object 
 3   genres  7895 non-null   object 
 4   rating  7895 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 370.1+ KB


In [3]:
plots = dataTraining['plot']
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

In [4]:
plots

Unnamed: 0,plot
3107,most is the story of a single father who takes...
900,a serial killer decides to teach the secrets o...
6724,"in sweden , a female blackmailer with a disfi..."
4704,"in a friday afternoon in new york , the presi..."
2582,"in los angeles , the editor of a publishing h..."
...,...
8417,""" our marriage , their wedding . "" it ' s l..."
1592,"the wandering barbarian , conan , alongside ..."
1723,"like a tale spun by scheherazade , kismet fol..."
7605,"mrs . brisby , a widowed mouse , lives in a..."


In [5]:
y

Unnamed: 0,rating
3107,1
900,0
6724,1
4704,1
2582,1
...,...
8417,0
1592,0
1723,0
7605,1


## Data Precosessing

- Remove stopwords
- Lowercase
- split the text in words
- pad_sequences

In [8]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from nltk.corpus import stopwords

# Cargar los datos
dataTraining = pd.read_csv('https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)

# Preprocesamiento
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar puntuaciones y números
    text = re.sub(r'[^a-z\s]', '', text)
    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Aplicar el procesamiento al plot
dataTraining['processed_plot'] = dataTraining['plot'].apply(preprocess_text)

# Obtener X e y
X = dataTraining['processed_plot']
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

# Dividir el conjunto de datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenización
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding de las secuencias
max_length = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Build Model

Create a neural network to predict the rating of a movie, calculate the testing set accuracy.

In [9]:
# Definir el modelo
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compilar el modelo
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Entrenar el modelo
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluar el modelo
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Accuracy: {accuracy:.4f}')




Epoch 1/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m365s[0m 2s/step - accuracy: 0.5284 - loss: 0.6871 - val_accuracy: 0.6187 - val_loss: 0.6501
Epoch 2/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 2s/step - accuracy: 0.8258 - loss: 0.4080 - val_accuracy: 0.6155 - val_loss: 0.7433
Epoch 3/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 2s/step - accuracy: 0.9686 - loss: 0.1000 - val_accuracy: 0.6084 - val_loss: 1.1655
Epoch 4/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 2s/step - accuracy: 0.9934 - loss: 0.0283 - val_accuracy: 0.6123 - val_loss: 1.6025
Epoch 5/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 2s/step - accuracy: 0.9989 - loss: 0.0058 - val_accuracy: 0.6116 - val_loss: 2.1472
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 570ms/step - accuracy: 0.5759 - loss: 2.3447
Accuracy: 0.5826


In [11]:
pip install pandas numpy tensorflow nltk




In [14]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Descargar stopwords
nltk.download('stopwords')

# Cargar los datos
dataTraining = pd.read_csv('https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)

# Preprocesar el texto
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convertir a minúsculas
    words = text.split()  # Dividir en palabras
    words = [word for word in words if word not in stop_words]  # Eliminar stopwords
    return ' '.join(words)

# Aplicar el preprocesamiento
dataTraining['processed_plot'] = dataTraining['plot'].apply(preprocess_text)

# Tokenización
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataTraining['processed_plot'])
sequences = tokenizer.texts_to_sequences(dataTraining['processed_plot'])

# Pad sequences
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Preparar la variable dependiente
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

# 2. Crear la red neuronal
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))  # Usar sigmoid para clasificación binaria

# Compilar el modelo
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Entrenar el modelo
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluar el modelo
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy en el conjunto de prueba: {accuracy:.4f}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/5




[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 694ms/step - accuracy: 0.4941 - loss: 0.6945 - val_accuracy: 0.5435 - val_loss: 0.6901
Epoch 2/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 708ms/step - accuracy: 0.5222 - loss: 0.6929 - val_accuracy: 0.5435 - val_loss: 0.6900
Epoch 3/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 701ms/step - accuracy: 0.5326 - loss: 0.6912 - val_accuracy: 0.5435 - val_loss: 0.6920
Epoch 4/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 697ms/step - accuracy: 0.5097 - loss: 0.6929 - val_accuracy: 0.5435 - val_loss: 0.6905
Epoch 5/5
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 708ms/step - accuracy: 0.5241 - loss: 0.6922 - val_accuracy: 0.5435 - val_loss: 0.6907
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 160ms/step - accuracy: 0.5616 - loss: 0.6894
Accuracy en el conjunto de prueba: 0.5478
