# Excercise 5
## Neural Networks in Keras

Use keras framework to solve the below exercises.


In [36]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras import backend as K
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [37]:
# Descargar los recursos necesarios de NLTK
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 5.1 Predict rating of a movie using Keras

**Exercise:** Use keras framework to predict rating.

In [38]:
dataTraining = pd.read_csv('https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)

In [39]:
plots = dataTraining['plot']
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

In [40]:
plots

Unnamed: 0,plot
3107,most is the story of a single father who takes...
900,a serial killer decides to teach the secrets o...
6724,"in sweden , a female blackmailer with a disfi..."
4704,"in a friday afternoon in new york , the presi..."
2582,"in los angeles , the editor of a publishing h..."
...,...
8417,""" our marriage , their wedding . "" it ' s l..."
1592,"the wandering barbarian , conan , alongside ..."
1723,"like a tale spun by scheherazade , kismet fol..."
7605,"mrs . brisby , a widowed mouse , lives in a..."


In [41]:
y

Unnamed: 0,rating
3107,1
900,0
6724,1
4704,1
2582,1
...,...
8417,0
1592,0
1723,0
7605,1


## Data Precosessing

- Remove stopwords
- Lowercase
- split the text in words
- pad_sequences

### **Remove stopwords & Lowercase**

In [42]:
# Preprocesamiento de texto
def preprocess_text(texts):
    stop_words = set(stopwords.words('english'))
    processed_texts = []

    for text in texts:
        # Convertir a minúsculas, tokenizar y eliminar palabras vacías
        tokens = word_tokenize(text.lower())
        filtered_words = [word for word in tokens if word.isalnum() and word not in stop_words]
        processed_texts.append(" ".join(filtered_words))

    return processed_texts

plots_processed = preprocess_text(dataTraining['plot'].values)
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int).values


### **split the text in words & pad_sequences**

In [43]:
# Tokenización y secuencias con relleno
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(plots_processed)
X = tokenizer.texts_to_sequences(plots_processed)
X_padded = pad_sequences(X, maxlen=200)

In [44]:
# Separar datos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)


### **Build Model**

Create a neural network to predict the rating of a movie, calculate the testing set accuracy.

In [45]:
# Construir el modelo
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(units=64))
model.add(Dense(1, activation='sigmoid'))



In [46]:
# Compilar el modelo
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Entrenar el modelo
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.5389 - loss: 0.6874 - val_accuracy: 0.5991 - val_loss: 0.6562
Epoch 2/5
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.7256 - loss: 0.5374 - val_accuracy: 0.6257 - val_loss: 0.6840
Epoch 3/5
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8598 - loss: 0.3455 - val_accuracy: 0.6143 - val_loss: 0.8010
Epoch 4/5
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9294 - loss: 0.2019 - val_accuracy: 0.5883 - val_loss: 1.0836
Epoch 5/5
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9698 - loss: 0.0989 - val_accuracy: 0.5871 - val_loss: 1.4421


<keras.src.callbacks.history.History at 0x7e371010fd90>

In [47]:
# Evaluar el modelo
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f"Test Accuracy: {test_accuracy}")

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5884 - loss: 1.4679
Test Accuracy: 0.5870804190635681
