## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GRU, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import Callback, EarlyStopping

In [3]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

2022-11-29 21:58:46.816980: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


## Data Loading

In [4]:
train_df = pd.read_csv('../data/processed/train_data_processed.csv', index_col='id')
train_df.head()

Unnamed: 0_level_0,title,genre,description,year,processed_description,processed_description_string,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Oscar et la dame rose,drama,Listening in to a conversation between his do...,2009,"['listen', 'convers', 'doctor', 'parent', '10-...",listen convers doctor parent 10-year-old oscar...,English
2,Cupid,thriller,A brother and sister with a past incestuous r...,1997,"['brother', 'sister', 'past', 'incestu', 'rela...",brother sister past incestu relationship curre...,English
3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fie...,1980,"['bu', 'empti', 'student', 'field', 'trip', 'm...",bu empti student field trip museum natur histo...,English
4,The Secret Sin,drama,To help their unemployed father make ends mee...,1915,"['help', 'unemploy', 'father', 'make', 'end', ...",help unemploy father make end meet edith twin ...,English
5,The Unrecovered,drama,The film's title refers not only to the un-re...,2007,"['film', 'titl', 'refer', 'un-recov', 'bodi', ...",film titl refer un-recov bodi ground zero also...,English


In [5]:
# Filter out non-English languages
print(train_df.shape)
train_df = train_df[train_df['language'] == 'English'].reset_index(drop=True)
print(train_df.shape)

(54214, 7)
(53831, 7)


In [6]:
unique_genres = train_df['genre'].unique()
genre_encoding = dict([(unique_genres[i], i) for i in range(unique_genres.shape[0])])
train_df['genre'] = train_df['genre'].map(genre_encoding)
train_df = train_df.sample(frac=1)
train_df

Unnamed: 0,title,genre,description,year,processed_description,processed_description_string,language
27882,Hee,12,HEE is a short experimental and subjective fi...,2009,"['hee', 'short', 'experiment', 'subject', 'fil...",hee short experiment subject film explor creat...,English
358,Hit (20,0,Baji is a top notch underground fighting dire...,1/II,"['baji', 'top', 'notch', 'underground', 'fight...",baji top notch underground fight director know...,English
51765,A1C: Agents 1st Class,10,Charlize is a world-class scientist in charge...,2016,"['charliz', 'world-class', 'scientist', 'charg...",charliz world-class scientist charg three worl...,English
20198,La fleur d'oranger,4,"Not content to be an austere judge, M. de Mér...",1932,"['content', 'auster', 'judg', 'de', 'méricourt...",content auster judg de méricourt also domest t...,English
7736,Morning Echo,12,The Moffatts are in trouble. Their sick young...,2010,"['moffatt', 'troubl', 'sick', 'young', 'daught...",moffatt troubl sick young daughter franni almo...,English
...,...,...,...,...,...,...,...
14009,Lake Placid: An Olympic History,3,Twenty-six years after the U.S. Hockey Team s...,2006,"['twenty-six', 'year', 'us', 'hockey', 'team',...",twenty-six year us hockey team shock heavily-f...,English
31857,Townhouse,4,"Jack Madigan is, by many accounts, blessed. H...",????,"['jack', 'madigan', 'mani', 'account', 'bless'...",jack madigan mani account bless still effortle...,English
42845,Agnee,10,Tanisha (Played by Mahiya Mahi) is a incognit...,2014,"['tanisha', 'play', 'mahiya', 'mahi', 'incogni...",tanisha play mahiya mahi incognito serial kill...,English
45895,"""4Jim""",0,The 19 year old dutch Jim has been diagnosed ...,2015,"['19', 'year', 'old', 'dutch', 'jim', 'diagnos...",19 year old dutch jim diagnos cancer undergon ...,English


In [7]:
num_labels = train_df['genre'].nunique()
num_labels

27

In [24]:
x = train_df['processed_description_string'].values
y = pd.get_dummies(train_df['genre']).values

In [25]:
train_set_size = 45000
val_set_size = 3000
test_set_size = 5000

x_train = x[:train_set_size]
x_val = x[train_set_size:train_set_size+val_set_size]
x_test = x[train_set_size+val_set_size:train_set_size+val_set_size+test_set_size]

y_train = y[:train_set_size]
y_val = y[train_set_size:train_set_size+val_set_size]
y_test = y[train_set_size+val_set_size:train_set_size+val_set_size+test_set_size]

## Tokenization

In [26]:
num_words = 2**9
max_len = 2**5

tokenizer = Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(x_train, maxlen=max_len)

x_val = tokenizer.texts_to_sequences(x_val)
x_val = pad_sequences(x_val, maxlen=max_len)

x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=max_len)

## NN Training

In [27]:
# model = load_model('../models/nn_model')
model = Sequential()
model.add(Dense(128))
model.add(Dropout(0.1))
model.add(Dense(256))
model.add(Dropout(0.1))
model.add(Dense(256))
model.add(Dropout(0.1))
model.add(Dense(128))
model.add(Dropout(0.1))
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
my_callbacks  = [EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=5,
                              mode='auto')]

history = model.fit(x_train, y_train,
                    epochs=50, batch_size=32,
                    validation_data=(x_val, y_val),
                    callbacks=my_callbacks,
                    verbose=1
                   )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


## NN Evaluation

In [29]:
model.evaluate(x_test, y_test)



[2.338596820831299, 0.2648000121116638]

In [30]:
model.save('../models/nn_model')

INFO:tensorflow:Assets written to: ../models/nn_model/assets


INFO:tensorflow:Assets written to: ../models/nn_model/assets


## GRU Training

In [31]:
EMBEDDING_DIM = 2**5

# model = load_model('../models/gru_model')
model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, input_length=x_train.shape[1]))
model.add(GRU(256, dropout=0.1))
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
my_callbacks  = [EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=5,
                              mode='auto')]

history = model.fit(x_train, y_train,
                    epochs=20, batch_size=32,
                    validation_data=(x_val, y_val),
                    callbacks=my_callbacks,
                    verbose=1
                   )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


## GRU Evaluation

In [33]:
model.evaluate(x_test, y_test)



[1.649399757385254, 0.5189999938011169]

In [34]:
model.save('../models/gru_model')



INFO:tensorflow:Assets written to: ../models/gru_model/assets


INFO:tensorflow:Assets written to: ../models/gru_model/assets


## LSTM Training

In [35]:
EMBEDDING_DIM = 2**5

# model = load_model('../models/lstm_model')
model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, input_length=x_train.shape[1]))
model.add(LSTM(64, dropout=0.1))
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
my_callbacks  = [EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=3,
                              mode='auto')]

history = model.fit(x_train, y_train,
                    epochs=10, batch_size=32,
                    validation_data=(x_val, y_val),
                    callbacks=my_callbacks,
                    verbose=1
                   )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## LSTM Evaluation

In [37]:
model.evaluate(x_test, y_test)



[1.611369252204895, 0.5171999931335449]

In [38]:
model.save('../models/lstm_model')



INFO:tensorflow:Assets written to: ../models/lstm_model/assets


INFO:tensorflow:Assets written to: ../models/lstm_model/assets
