In [114]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras

import numpy as np

print(tf.__version__)

import pandas as pd
import nltk
import string
from collections import Counter
from nltk.corpus import stopwords

1.14.0


In [0]:
DESCRIPTION_MAX_LENGTH = 324

In [0]:
with open("train_data.txt", "r") as f:
  lines = f.readlines()
  #print(lines[2])
  films = []
  for line in lines:
    arr = [el.strip() for el in line.split(':::')]
    films.append({
        'title': arr[1],
        'genre': arr[2],
        'description': arr[3],
        'index': int(arr[0])
    })
    

In [0]:
target = [item['genre'] for item in films]
data = [item['description'] for item in films]

In [0]:
train_data = data[:50000]
test_data = data[50000:]
train_target = target[:50000]
test_target = target[50000:]

In [0]:
all_genres = ['news', 'musical', 'drama', 'romance', 
              'war', 'biography', 'sci-fi', 'thriller', 
              'fantasy', 'documentary', 'reality-tv', 
              'adventure', 'mystery', 'action', 'sport', 
              'horror', 'comedy', 'short', 'western', 
              'talk-show', 'adult', 'game-show', 'music', 
              'history', 'crime', 'family', 'animation']

def encode_genres(genres):
  result = []
  for genre in genres:
    index = all_genres.index(genre)
    elem = [0] * len(all_genres)
    elem[index] = 1
    result.append(elem)
  return result

In [0]:
train_target_encoded = encode_genres(train_target)
test_target_encoded = encode_genres(test_target)

In [121]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [122]:
stop_words = set(stopwords.words('english'))
train_data_ = [' '.join([word for word in text.replace('-', ' ').translate(str.maketrans('', '', string.punctuation)).lower().split() if word not in stop_words]) for text in train_data]
test_data_ = [' '.join([word for word in text.replace('-', ' ').translate(str.maketrans('', '', string.punctuation)).lower().split() if word not in stop_words]) for text in test_data]
print(train_data_[1])
text = ' '.join(train_data_)
words = text.split()

brother sister past incestuous relationship current murderous relationship murders women reject murders women get close


In [123]:
counter = Counter(words)
#print(counter)
#print(counter.most_common())
ordered_words = [pair[0] for pair in counter.most_common()]
print(ordered_words)
VOCAB_SIZE = len(ordered_words)



In [124]:
word_index = { word: i for i, word in enumerate(ordered_words) }
word_index = {k:(v+3) for k,v in word_index.items()} 
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
print(word_index["<PAD>"])
print(reverse_word_index)

Output hidden; open in https://colab.research.google.com to view.

In [125]:
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

def encode_review(text):
    words = text.lower().split()
    words = ['<START>'] + words
    idxs = [word_index.get(word, word_index['<UNK>']) for word in words]
    return idxs

print('Example of a encoded review: \n{}'.format(encode_review(train_data_[0])))

Example of a encoded review: 
[1, 5097, 1947, 432, 122, 666, 29, 17, 2473, 338, 2215, 1316, 346, 1061, 52, 4253, 839, 1357, 750, 1783, 1500, 1352, 1290, 687, 4680, 107, 462, 8566, 755, 2602, 1290, 766, 10776, 475, 596, 9307, 1809, 4921, 3555, 1773, 2473, 52, 3, 12, 277, 244, 31, 1081, 12298, 15026, 12917, 370, 2945, 7442, 1183]


In [0]:
train_data_encoded = [encode_review(text) for text in train_data_]
test_data_encoded = [encode_review(text) for text in test_data_]

In [0]:
train_data_encoded = tf.keras.preprocessing.sequence.pad_sequences(
    train_data_encoded,
    value=word_index["<PAD>"],
    padding='post',
    maxlen=DESCRIPTION_MAX_LENGTH)

test_data_encoded = tf.keras.preprocessing.sequence.pad_sequences(
    test_data_encoded,
    value=word_index["<PAD>"],
    padding='post',
    maxlen=DESCRIPTION_MAX_LENGTH)

In [128]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=False)),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(len(all_genres), activation=tf.nn.softmax),
])

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 32)          4099168   
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 64)          16640     
_________________________________________________________________
bidirectional_5 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_9 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_10 (Dense)             (None, 27)                1755      
Total params: 4,146,555
Trainable params: 4,146,555
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [0]:
x_val = train_data_encoded[:5000]
partial_x_train = train_data_encoded[5000:]

y_val = train_target_encoded[:5000]
partial_y_train = train_target_encoded[5000:]

In [139]:
BATCH_SIZE = 512
NUM_EPOCHS = 1

history = model.fit(np.array(partial_x_train),
                    np.array(partial_y_train),
                    epochs=NUM_EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(np.array(x_val), np.array(y_val)),
                    verbose=1)

Train on 45000 samples, validate on 5000 samples


In [0]:
predictions = model.predict(test_data_encoded)


In [0]:
def get_result(predictions):
  result = []
  for i in range(len(predictions)):
    index = np.argmax(predictions[i])
    genre = all_genres[index]
    result.append(genre)
  return result

In [0]:
result_ = get_result(predictions)

In [143]:
print(result_[100:110])
print(test_target[100:110])

['drama', 'drama', 'comedy', 'comedy', 'drama', 'comedy', 'documentary', 'documentary', 'horror', 'drama']
['drama', 'short', 'horror', 'romance', 'drama', 'drama', 'family', 'documentary', 'horror', 'documentary']


In [0]:
def get_score(data, preds):
  points = 0
  for i in range(len(data)):
    if data[i] == preds[i]:
      points += 1
  return points/len(data) * 100

In [145]:
get_score(result_, test_target)

44.58946369245373

In [0]:
with open("test_data.txt", "r") as f:
  lines = f.readlines()
  val_films = []
  for line in lines:
    arr = [el.strip() for el in line.split(':::')]
    val_films.append({
        'title': arr[1],
        'description': arr[2],
        'index': int(arr[0])
    })
    
val_data = [item['description'] for item in val_films]
val_data_ = [' '.join([word for word in text.replace('-', ' ').translate(str.maketrans('', '', string.punctuation)).lower().split() if word not in stop_words]) for text in val_data]
val_data_encoded = [encode_review(text) for text in val_data_]
val_data_encoded = tf.keras.preprocessing.sequence.pad_sequences(
    val_data_encoded,
    value=word_index["<PAD>"],
    padding='post',
    maxlen=DESCRIPTION_MAX_LENGTH)

In [0]:
val_predictions = model.predict(np.array(val_data_encoded))

In [0]:
val_result_ = get_result(val_predictions)

In [0]:
i = 1    
sub_preds = []
for r in val_result_:
  sub_preds.append([str(i), r])
  i+=1
  
df = pd.DataFrame(sub_preds, columns = ['id', 'genre'])
df.to_csv('submission2.csv', index=False)