## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import Callback, EarlyStopping

## Data Loading

In [3]:
# Load the preprocessed training data; the CSV's 'id' column becomes the index.
train_csv_path = '../data/processed/train_data_processed.csv'
train_df = pd.read_csv(train_csv_path, index_col='id')
# Preview the first rows.
train_df.head()

Unnamed: 0_level_0,title,genre,description,year,description_stemmed,description_stemmed_string
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Oscar et la dame rose,drama,Listening in to a conversation between his do...,2009,"['listen', 'convers', 'doctor', 'parent', '10-...",listen convers doctor parent 10-year-old oscar...
2,Cupid,thriller,A brother and sister with a past incestuous r...,1997,"['brother', 'sister', 'past', 'incestu', 'rela...",brother sister past incestu relationship curre...
3,"Young, Wild and Wonderful",adult,As the bus empties the students for their fie...,1980,"['bu', 'empti', 'student', 'field', 'trip', 'm...",bu empti student field trip museum natur histo...
4,The Secret Sin,drama,To help their unemployed father make ends mee...,1915,"['help', 'unemploy', 'father', 'make', 'end', ...",help unemploy father make end meet edith twin ...
5,The Unrecovered,drama,The film's title refers not only to the un-re...,2007,"['film', ""'s"", 'titl', 'refer', 'un-recov', 'b...",film 's titl refer un-recov bodi ground zero a...


In [4]:
# Give every genre an integer code, numbered in order of first appearance.
unique_genres = train_df['genre'].unique()
genre_encoding = {genre: code for code, genre in enumerate(unique_genres)}

# Swap the genre strings for their codes, then shuffle all rows with a fixed
# seed so the positional train/val/test slices taken later are reproducible.
# NOTE: the 'genre' column is overwritten in place, so re-running this cell
# without reloading the data re-encodes an already-encoded column.
train_df['genre'] = train_df['genre'].map(genre_encoding)
train_df = train_df.sample(frac=1, random_state=2)
train_df

Unnamed: 0_level_0,title,genre,description,year,description_stemmed,description_stemmed_string
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
49673,The Empire Builders,0,A bizarre couple and their seventeen-year-old...,2010,"['bizarr', 'coupl', 'seventeen-year-old', 'dau...",bizarr coupl seventeen-year-old daughter trap ...
48950,The Unknown (20,12,The world has been taken over by The Unknown....,9/II,"['world', 'taken', 'unknown', 'one', 'know', '...",world taken unknown one know done new set rule...
15205,"""Nanny Mother""",0,The drama tells the story about a variety of ...,2009,"['drama', 'tell', 'stori', 'varieti', 'entangl...",drama tell stori varieti entangl sever famili ...
41475,Pan Vok odchází,4,Petr Vok ('Martin Ruzek' (qv)) changes entire...,1979,"['petr', 'vok', ""'martin"", 'ruzek', ""'"", 'qv',...",petr vok 'martin ruzek ' qv chang entir death ...
39977,Shadows (??,7,Jeremy moves his family consisting of his son...,?/IX,"['jeremi', 'move', 'famili', 'consist', 'son',...",jeremi move famili consist son ben 17 daughter...
...,...,...,...,...,...,...
44567,Codex Atlanticus,3,The true about the folios in the Codex Atlant...,2013,"['true', 'folio', 'codex', 'atlanticu', 'deal'...",true folio codex atlanticu deal variou subject...
30281,"""Undercover Boss Australia""",6,Top executives from high profile Australian b...,2010,"['top', 'execut', 'high', 'profil', 'australia...",top execut high profil australian busi go unde...
6638,Youthful Folly,0,Nancy is a restless young girl tired of livin...,1920,"['nanci', 'restless', 'young', 'girl', 'tire',...",nanci restless young girl tire live plantat th...
35344,Sekai de ichiban utsukushii yoru,0,"Its streets overrun with children, the villag...",2008,"['street', 'overrun', 'children', 'villag', 'k...",street overrun children villag kanam distingui...


In [5]:
# Number of distinct genre codes; used as the width of the softmax output layer.
num_labels = train_df.loc[:, 'genre'].nunique()
num_labels

27

In [6]:
# Model inputs: the stemmed descriptions, one string per row.
x = train_df['description_stemmed_string'].values
# One-hot targets, one column per genre code. The dtype is pinned to float32
# because pandas >= 2.0 returns boolean dummy columns by default, while the
# categorical cross-entropy loss used below works on numeric targets.
y = pd.get_dummies(train_df['genre'], dtype='float32').values

In [7]:
# Sizes of the three consecutive slices taken from the shuffled data.
train_set_size = 10000
val_set_size = 2000
test_set_size = 2000

# Cumulative end positions of each slice.
train_end = train_set_size
val_end = train_end + val_set_size
test_end = val_end + test_set_size

x_train, y_train = x[:train_end], y[:train_end]
x_val, y_val = x[train_end:val_end], y[train_end:val_end]
x_test, y_test = x[val_end:test_end], y[val_end:test_end]

## Tokenization

In [8]:
# Vocabulary cap given to the tokenizer (and reused as the embedding input
# size), and the fixed sequence length every description is padded/cut to.
num_words = 2**8
max_len = 20

# Characters the tokenizer strips from the text. Written as a raw string so
# the backslash is a literal character instead of the invalid escape sequence
# "\]", which triggers a warning on current Python versions. The resulting
# string value is unchanged.
token_filters = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=num_words, filters=token_filters, lower=True)
tokenizer.fit_on_texts(x_train)


def _texts_to_padded(texts):
    """Encode texts with the fitted tokenizer and pad/truncate them to max_len."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)


# NOTE: x_train / x_val / x_test are overwritten with integer arrays here, so
# this cell cannot be re-run without first re-running the split cell.
x_train = _texts_to_padded(x_train)
x_val = _texts_to_padded(x_val)
x_test = _texts_to_padded(x_test)

## Training

In [9]:
EMBEDDING_DIM = 2**4

# To reuse a previously saved model instead of building a new one:
# model = load_model('../models/lstm_model')

# Embedding -> single LSTM layer -> softmax over the genre codes.
model = Sequential([
    Embedding(num_words, EMBEDDING_DIM, input_length=x_train.shape[1]),
    LSTM(100, dropout=0.1, recurrent_dropout=0.2),
    Dense(num_labels, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Metal device set to: Apple M1 Pro


2022-10-23 23:36:23.092215: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-10-23 23:36:23.092325: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [11]:
# Stop once val_loss has failed to improve for 2 consecutive epochs, and roll
# the model back to the weights of the best epoch so that the model evaluated
# and saved afterwards is not the one from the later, worse epochs.
my_callbacks = [
    EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=2,
        mode='auto',
        restore_best_weights=True,
    )
]

# Train on the training slice and monitor the validation slice after each epoch.
history = model.fit(
    x_train, y_train,
    epochs=6,
    batch_size=32,
    validation_data=(x_val, y_val),
    callbacks=my_callbacks,
    verbose=1,
)



## Evaluation

In [12]:
# Score the model on the held-out test slice; displays [loss, accuracy].
model.evaluate(x=x_test, y=y_test)

2022-10-23 23:36:25.467233: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-10-23 23:36:25.621841: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




[1.979418158531189, 0.4350000321865082]

In [13]:
# Write the trained model to disk (the fitted tokenizer is not saved here).
model.save(filepath='../models/lstm_model')

INFO:tensorflow:Assets written to: ../models/lstm_model/assets
