In [53]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras

import numpy as np

print(tf.__version__)

import pandas as pd
import nltk
import string



1.14.0


In [0]:
with open("train_data.txt", "r") as f:
  lines = f.readlines()
  #print(lines[2])
  films = []
  for line in lines:
    arr = [el.strip() for el in line.split(':::')]
    films.append({
        'title': arr[1],
        'genre': arr[2],
        'description': arr[3],
        'index': int(arr[0])
    })
    

In [0]:
print(films[0])

{'title': 'Oscar et la dame rose (2009)', 'genre': 'drama', 'description': 'Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.', 'index': 1}


In [0]:
target = [item['genre'] for item in films]
data = [item['description'] for item in films]

In [6]:
data[0]

'Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.'

In [7]:
target[0]

'drama'

In [8]:
len(data)

54214

In [0]:
train_data = data[:50000]
test_data = data[50000:]
train_target = target[:50000]
test_target = target[50000:]

In [0]:
all_genres = ['news', 'musical', 'drama', 'romance', 
              'war', 'biography', 'sci-fi', 'thriller', 
              'fantasy', 'documentary', 'reality-tv', 
              'adventure', 'mystery', 'action', 'sport', 
              'horror', 'comedy', 'short', 'western', 
              'talk-show', 'adult', 'game-show', 'music', 
              'history', 'crime', 'family', 'animation']

def encode_genres(genres):
  result = []
  for genre in genres:
    index = all_genres.index(genre)
    elem = [0] * len(all_genres)
    elem[index] = 1
    result.append(elem)
  return result

In [0]:
train_target_encoded = encode_genres(train_target)
test_target_encoded = encode_genres(test_target)

In [105]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def tokenize(text):
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()
    for token in nltk.word_tokenize(text):
        if token in string.punctuation: continue
        yield stem.stem(token)

In [0]:
train_data_tokenized = [list(tokenize(doc)) for doc in train_data]

In [0]:
test_data_tokenized = [list(tokenize(doc)) for doc in test_data]

In [109]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_data_tokenized)]
model = Doc2Vec(documents, vector_size=250, window=4, min_count=1, workers=4)

W0730 12:29:12.724248 139948857272192 base_any2vec.py:723] consider setting layer size to a multiple of 4 for greater performance


In [0]:
train_data_vectors = [model.infer_vector(v) for v in train_data_tokenized]

In [0]:
test_data_vectors = [model.infer_vector(v) for v in test_data_tokenized]

In [112]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=250, max_depth=30, random_state=42)
clf.fit(train_data_vectors, train_target_encoded)  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=30, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [0]:
predictions = clf.predict_proba(test_data_vectors)

In [0]:
def get_result(predictions):
  result = []
  for i in range(len(predictions[0])):
    pred_proba = [p[i][1] for p in predictions]
    index = np.argmax(pred_proba)
    genre = all_genres[index]
    result.append(genre)
  return result

In [0]:
result_ = get_result(predictions)

In [151]:
print(result_[100:110])
print(test_target[100:110])

['drama', 'drama', 'drama', 'drama', 'drama', 'drama', 'comedy', 'short', 'documentary', 'drama']
['drama', 'short', 'horror', 'romance', 'drama', 'drama', 'family', 'documentary', 'horror', 'documentary']


In [0]:
def get_score(data, preds):
  points = 0
  for i in range(len(data)):
    if data[i] == preds[i]:
      points += 1
  return points/len(data) * 100

In [156]:
get_score(result_, test_target)

43.75889890840057

In [0]:
with open("test_data.txt", "r") as f:
  lines = f.readlines()
  val_films = []
  for line in lines:
    arr = [el.strip() for el in line.split(':::')]
    val_films.append({
        'title': arr[1],
        'description': arr[2],
        'index': int(arr[0])
    })
    
val_data = [item['description'] for item in val_films]

In [0]:
val_data_tokenized = [list(tokenize(doc)) for doc in val_data]
val_data_vectors = [model.infer_vector(v) for v in val_data_tokenized]

In [0]:
val_predictions = clf.predict_proba(val_data_vectors)

In [0]:
val_result_ = get_result(val_predictions)

In [0]:
i = 1    
sub_preds = []
for r in val_result_:
  sub_preds.append([str(i), r])
  i+=1
  
df = pd.DataFrame(sub_preds, columns = ['id', 'genre'])
df.to_csv('submission.csv', index=False)