## Imports

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GRU, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import Callback, EarlyStopping

## Data Loading

In [16]:
train_df = pd.read_csv('../data/processed/train_data_processed.csv', index_col='id')
train_df.head()

Unnamed: 0_level_0,title,genre,description,year,processed_description,processed_description_string,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Oscar et la dame rose,drama,Listening in to a conversation between his do...,2009,"['listen', 'convers', 'doctor', 'parent', '10-...",listen convers doctor parent 10-year-old oscar...,English
2,Cupid,thriller,A brother and sister with a past incestuous r...,1997,"['brother', 'sister', 'past', 'incestu', 'rela...",brother sister past incestu relationship curre...,English
3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fie...,1980,"['bu', 'empti', 'student', 'field', 'trip', 'm...",bu empti student field trip museum natur histo...,English
4,The Secret Sin,drama,To help their unemployed father make ends mee...,1915,"['help', 'unemploy', 'father', 'make', 'end', ...",help unemploy father make end meet edith twin ...,English
5,The Unrecovered,drama,The film's title refers not only to the un-re...,2007,"['film', 'titl', 'refer', 'un-recov', 'bodi', ...",film titl refer un-recov bodi ground zero also...,English


In [17]:
# Filter out non-English languages
print(train_df.shape)
train_df = train_df[train_df['language'] == 'English'].reset_index(drop=True)
print(train_df.shape)

(54214, 7)
(53831, 7)


In [18]:
unique_genres = train_df['genre'].unique()
genre_encoding = dict([(unique_genres[i], i) for i in range(unique_genres.shape[0])])
train_df['genre'] = train_df['genre'].map(genre_encoding)
train_df = train_df.sample(frac=1)
train_df

Unnamed: 0,title,genre,description,year,processed_description,processed_description_string,language
29380,Daku Sultana,10,"Sultana, a student, is raped by Ratan Singh a...",2000,"['sultana', 'student', 'rape', 'ratan', 'singh...",sultana student rape ratan singh thakur vikram...,English
15320,Lots & Lots of Toy Trains: Model Railroading ...,18,Huge collection of the best non-stop model ra...,2003,"['huge', 'collect', 'best', 'non-stop', 'model...",huge collect best non-stop model railroad acti...,English
38682,"""Empress""",11,Empress will tell the storied tale of the onl...,2017,"['empress', 'tell', 'stori', 'tale', 'femal', ...",empress tell stori tale femal empress histori ...,English
11221,The New Interns,0,Playboy Alec Considine returns to New North H...,1964,"['playboy', 'alec', 'considin', 'return', 'new...",playboy alec considin return new north hospit ...,English
36297,Nightmare,7,"A mental-patient, who is troubled with horrib...",1981,"['mental-pati', 'troubl', 'horribl', 'nightmar...",mental-pati troubl horribl nightmar escap hosp...,English
...,...,...,...,...,...,...,...
47576,Quitters,4,"In the pilot, Chad tries to use subliminal me...",2008,"['pilot', 'chad', 'tri', 'use', 'sublimin', 'm...",pilot chad tri use sublimin messag get fiancé ...,English
47827,Honolulu: 100 Years in the Making,3,Honolulu is one of the greatest cities in the...,2006,"['honolulu', 'one', 'greatest', 'citi', 'world...",honolulu one greatest citi world celebr 100th ...,English
42593,Cenizas del cielo,0,"Environmental drama set in Asturias, Spain. A...",2008,"['environment', 'drama', 'set', 'asturia', 'sp...",environment drama set asturia spain scottish t...,English
2156,Two Hats,3,"In 1998, Brad and Deborah Wells left the comf...",2012,"['1998', 'brad', 'deborah', 'well', 'left', 'c...",1998 brad deborah well left comfort familiar h...,English


In [19]:
num_labels = train_df['genre'].nunique()
num_labels

27

In [20]:
x = train_df['processed_description_string'].values
y = pd.get_dummies(train_df['genre']).values

In [21]:
train_set_size = 45000
val_set_size = 3000
test_set_size = 5000

x_train = x[:train_set_size]
x_val = x[train_set_size:train_set_size+val_set_size]
x_test = x[train_set_size+val_set_size:train_set_size+val_set_size+test_set_size]

y_train = y[:train_set_size]
y_val = y[train_set_size:train_set_size+val_set_size]
y_test = y[train_set_size+val_set_size:train_set_size+val_set_size+test_set_size]

## Tokenization

In [22]:
num_words = 2**9
max_len = 2**5

tokenizer = Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(x_train, maxlen=max_len)

x_val = tokenizer.texts_to_sequences(x_val)
x_val = pad_sequences(x_val, maxlen=max_len)

x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=max_len)

## NN Training

In [23]:
# model = load_model('../models/nn_model')
model = Sequential()
model.add(Dense(128))
model.add(Dropout(0.1))
model.add(Dense(256))
model.add(Dropout(0.1))
model.add(Dense(256))
model.add(Dropout(0.1))
model.add(Dense(128))
model.add(Dropout(0.1))
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
my_callbacks  = [EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=5,
                              mode='auto')]

history = model.fit(x_train, y_train,
                    epochs=50, batch_size=32,
                    validation_data=(x_val, y_val),
                    callbacks=my_callbacks,
                    verbose=1
                   )

Epoch 1/50
   1/1407 [..............................] - ETA: 11:17 - loss: 299.9420 - accuracy: 0.0312

2022-11-05 12:03:24.718551: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-11-05 12:03:38.863642: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50


## NN Evaluation

In [25]:
model.evaluate(x_test, y_test)



[2.3809874057769775, 0.2396000176668167]

In [26]:
model.save('../models/nn_model')

INFO:tensorflow:Assets written to: ../models/nn_model/assets


## GRU Training

In [27]:
EMBEDDING_DIM = 2**5

# model = load_model('../models/gru_model')
model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, input_length=x_train.shape[1]))
model.add(GRU(256, dropout=0.1))
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
my_callbacks  = [EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=5,
                              mode='auto')]

history = model.fit(x_train, y_train,
                    epochs=20, batch_size=32,
                    validation_data=(x_val, y_val),
                    callbacks=my_callbacks,
                    verbose=1
                   )

Epoch 1/20


2022-11-05 12:07:31.553806: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-05 12:07:31.724471: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


   1/1407 [..............................] - ETA: 31:15 - loss: 3.2947 - accuracy: 0.0938

2022-11-05 12:07:31.879906: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-11-05 12:08:02.621399: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-05 12:08:02.674566: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


## GRU Evaluation

In [29]:
model.evaluate(x_test, y_test)



[1.7046022415161133, 0.4936000108718872]

In [30]:
model.save('../models/gru_model')



INFO:tensorflow:Assets written to: ../models/gru_model/assets


INFO:tensorflow:Assets written to: ../models/gru_model/assets


## LSTM Training

In [31]:
EMBEDDING_DIM = 2**5

# model = load_model('../models/lstm_model')
model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, input_length=x_train.shape[1]))
model.add(LSTM(64, dropout=0.1))
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
my_callbacks  = [EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=3,
                              mode='auto')]

history = model.fit(x_train, y_train,
                    epochs=10, batch_size=32,
                    validation_data=(x_val, y_val),
                    callbacks=my_callbacks,
                    verbose=1
                   )

Epoch 1/10


2022-11-05 12:13:35.971314: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-05 12:13:36.112186: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


   1/1407 [..............................] - ETA: 34:08 - loss: 3.2961 - accuracy: 0.0625

2022-11-05 12:13:36.241207: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-11-05 12:14:04.328096: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-05 12:14:04.385657: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## LSTM Evaluation

In [33]:
model.evaluate(x_test, y_test)



[1.6311513185501099, 0.5042000412940979]

In [34]:
model.save('../models/lstm_model')



INFO:tensorflow:Assets written to: ../models/lstm_model/assets


INFO:tensorflow:Assets written to: ../models/lstm_model/assets
