In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import numpy as np
import io
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, Flatten
from sklearn import preprocessing
import pickle
nltk.download('stopwords')
nltk.download('wordnet')

ModuleNotFoundError: No module named 'matplotlib'

### Import Metadata CSV

In [None]:
# Build the path with os.path.join instead of a hand-written backslash: the
# original literal depended on the invalid escape sequence "\m" and only
# worked with the Windows path separator.
metadata = pd.read_csv(os.path.join(os.getcwd(), 'movies_metadata.csv'))

In [None]:
# Display the column labels of the loaded metadata frame.
metadata.columns

In [None]:
# Keep only the three metadata columns used downstream, as an independent copy
# so later edits to input_data never touch `metadata`.
input_data = metadata[['id', 'title', 'genres']].copy()

In [None]:
# Add a 'genre' column initialised to the empty string, then display the frame.
input_data['genre'] = ''
input_data

In [None]:
### Just take first word
# NOTE(review): each 'genres' value is presumably a string such as
# "[{'id': 16, 'name': 'Animation'}, ...]" -- confirm against the CSV.
for i in range(input_data.shape[0]):
    # Reset for every row. Without this, a row whose genre list is empty kept
    # the previous row's genre (or raised NameError on the very first row),
    # so it was never blanked out and never dropped by the later NaN filter.
    word = ''

    json_str = input_data.iloc[i,2]
    json_str = json_str.strip('[]')
    json_str = json_str.replace('{','')
    json_str = json_str.replace('}','')
    json_list = json_str.split(',')
    # The slice selects only the second comma-separated field (index 1); the
    # text after its ':' is kept, minus surrounding spaces and single quotes.
    for j in json_list[1:3:2]:
        word = j.split(':')[1].strip(' ').strip('\'')

    # Column 3 is the 'genre' column added before this loop.
    input_data.iloc[i,3] = word

In [None]:
# The raw 'genres' strings are no longer needed once 'genre' has been filled in;
# remove the column and display the result.
del input_data['genres']
input_data

### Import Keywords CSV

In [None]:
# Build the path with os.path.join instead of a hand-written backslash: the
# original literal depended on the invalid escape sequence "\k" and only
# worked with the Windows path separator.
keywords = pd.read_csv(os.path.join(os.getcwd(), 'keywords.csv'))

In [None]:
# Display the column labels of the keywords frame.
keywords.columns

In [None]:
# Display the keywords frame as loaded.
keywords

In [None]:
# Add a 'keywords_string' column initialised to the empty string.
keywords['keywords_string'] = ''

In [None]:
# Call head(): the bare attribute only displayed the bound method, not a preview.
keywords.head()

In [None]:
import ast  # local import: only this cell needs it

# Turn each row's keyword list into one space-separated string of keyword names.
# NOTE(review): assumes every 'keywords' value is a Python-literal list of dicts
# with a 'name' key, e.g. "[{'id': 931, 'name': 'jealousy'}]" -- confirm against
# the CSV. Parsing the literal (rather than splitting on ',' and ':') keeps names
# that contain commas, colons or apostrophes intact.
for i in range(keywords.shape[0]):
    entries = ast.literal_eval(keywords.iloc[i,1])
    words = ' '.join(str(entry['name']).strip(' ') for entry in entries)
    # Column 2 is the 'keywords_string' column added before this loop.
    keywords.iloc[i,2] = words.strip(' ')

In [None]:
# Remove the raw 'keywords' column; 'keywords_string' is used from here on.
del keywords['keywords']

In [None]:
# Display the keywords frame after the raw column was removed.
keywords

In [None]:
# Display the column dtypes before the 'id' cast.
keywords.dtypes

### Concat Data

In [None]:
# Cast the keyword ids to strings ahead of the merge on 'id'.
# NOTE(review): presumably needed because the metadata 'id' column is read as
# strings -- confirm with input_data.dtypes.
keywords['id'] = keywords['id'].astype(str)

In [None]:
# Display the column dtypes again to check the 'id' cast.
keywords.dtypes

In [None]:
# Join titles/genres with the keyword strings on the shared 'id' column
# (pd.merge defaults to an inner join, so unmatched ids are dropped).
movie_data = pd.merge(input_data, keywords, on='id')

In [None]:
# Treat empty strings as missing, then discard every row that lacks a title,
# a genre or a keyword string.
nan_value = float("NaN")
movie_data = (
    movie_data
    .replace("", nan_value)
    .dropna(subset=["title", "genre", "keywords_string"])
)

### Create Test Data

In [None]:
# DataFrame[...] does not accept a (rows, columns) tuple, so the original
# `movie_data[30656:,:]` raised; positional slicing needs .iloc.
# `movie_data` is deliberately left whole here: the shuffled train/test split
# further down cuts at the same row index, and truncating to 30656 rows now
# would leave that later test set empty.
test_movie_data = movie_data.iloc[30656:, :]

### Create Input Strings

In [None]:
# Model input text: "<title> keywords <keyword names>".
title_part = movie_data['title']
keyword_part = movie_data['keywords_string']
movie_data['input'] = title_part + ' keywords ' + keyword_part

In [None]:
# Only 'genre' and 'input' are needed from here on.
movie_data = movie_data.drop(columns=['title', 'keywords_string', 'id'])

### Shuffle and Create Test Data

In [None]:
# Shuffle all rows. A fixed random_state makes the shuffle, and therefore the
# train/test split that follows, reproducible across kernel restarts.
movie_data = movie_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# First 30656 shuffled rows are kept for training/validation, the rest become
# the held-out test set. .copy() makes both independent frames, so adding
# columns to movie_data later does not trigger SettingWithCopyWarning.
movie_data, movie_data_test = movie_data.iloc[:30656,:].copy(), movie_data.iloc[30656:,:].copy()

In [None]:
# Write the held-out test rows to movie_data_test.csv in the working directory.
movie_data_test.to_csv('movie_data_test.csv')

### Preprocessing

In [None]:
# English stop words as a set, plus the WordNet lemmatizer; both are used by
# clean_text.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
    """Normalise one input string for tokenisation.

    The text is lower-cased and split on single spaces; every token is
    lemmatized with the notebook-level ``lemmatizer``; lemmas found in the
    notebook-level ``stop_words`` set are discarded; the remaining lemmas are
    joined back together with single spaces.

    Returns the cleaned string.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in text.lower().split(" "))
    return " ".join(lemma for lemma in lemmas if lemma not in stop_words)

In [None]:
# Clean every input string and store the result in its own column.
movie_data['Processed_input'] = movie_data['input'].apply(clean_text)

In [None]:
# Cleaned input texts. X is reassigned to the padded index sequences in the
# padding cell further down.
X = movie_data['Processed_input']

In [None]:
# Encode the genre strings as integer class ids.
genre_labels = movie_data['genre']
le = preprocessing.LabelEncoder()
le.fit(genre_labels)
Y = le.transform(genre_labels)
# Display the largest class id.
max(Y)

In [None]:
# Persist the fitted label encoder to Label_encoder.pickle in the working directory.
with open('Label_encoder.pickle', 'wb') as handle:
    pickle.dump(le, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Fit the tokenizer vocabulary on the cleaned texts, then encode each text as
# a list of word indices.
processed_texts = movie_data['Processed_input']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(processed_texts)
encoded_docs = tokenizer.texts_to_sequences(processed_texts)

In [None]:
# saving
# Persist the fitted tokenizer to tokenizer.pickle in the working directory.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# pad sequences
# Measure the longest *encoded* sequence instead of counting whitespace tokens:
# the Tokenizer also splits on the punctuation it filters out, so an encoded
# document can be longer than len(s.split()), and pad_sequences would then
# silently truncate it to maxlen.
max_length = max(len(seq) for seq in encoded_docs)
X = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(max_length)

### Split Data

In [None]:
# First 28656 padded sequences train the model; the remainder is the validation set.
X_train = X[:28656, :]
X_val = X[28656:, :]

In [None]:
# Split the labels at the same row index as the inputs.
Y_train = Y[:28656]
Y_val = Y[28656:]

### Build Neural Net

In [None]:
# Load the pre-trained vectors into a {word: float32 vector} dict. Each line is
# split on whitespace: the first field is the word, the rest are its components.
# The encoding is given explicitly because open() otherwise uses the platform
# default (e.g. cp1252 on Windows), which can fail on non-ASCII tokens.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm for this download.
embeddings_dict = {}
with open("glove.6B.200d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [None]:
# One more than the number of words in the tokenizer's word index.
# NOTE(review): the +1 presumably reserves index 0 for padding -- confirm.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Row i holds the pre-trained vector of the word with tokenizer index i; words
# missing from embeddings_dict keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 200))
for token, row in tokenizer.word_index.items():
    pretrained = embeddings_dict.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [None]:
# One output unit per genre known to the label encoder, so every encoded label
# has a matching unit (the width was previously hard-coded to 20).
num_classes = len(le.classes_)

model = Sequential()
# Embedding layer initialised from the pre-trained matrix and fine-tuned during training.
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=200,
                           weights = [embedding_matrix],
                           input_length=max_length,
                           trainable=True))
#model.add(layers.Conv1D(filters=256, kernel_size=3, activation='relu'))
#model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(layers.Flatten())
# softmax gives a probability distribution over the classes, which is what
# SparseCategoricalCrossentropy expects when it is not fed logits.
model.add(layers.Dense(num_classes, activation='softmax'))

In [None]:
# compile network
# Adam optimizer, sparse categorical cross-entropy loss, accuracy as the reported metric.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Train model with early stopping and save best model
# One epoch per iteration, then evaluate on the validation set. Training stops
# after the third epoch that fails to beat the best validation accuracy so far.
past_accuracy = 0
session = 0
# Snapshot of the best weights seen so far. `model_save = model` alone only
# aliases the live model, which keeps training, so the "best" model would
# always end up being the last one. Starting from the initial weights also
# guarantees model_save is defined even if no epoch ever improves.
best_weights = model.get_weights()
while True:
    model.fit(X_train, Y_train, batch_size=128, epochs=1, verbose=2)
    loss, accuracy = model.evaluate(X_val, Y_val)
    if past_accuracy < accuracy:
        past_accuracy = accuracy
        best_weights = model.get_weights()
    else:
        session += 1
        if session == 3:
            break

# Roll the model back to its best validation checkpoint before exposing it.
model.set_weights(best_weights)
model_save = model

In [None]:
# Build the path with os.path.join instead of a hand-written backslash: the
# original literal depended on the invalid escape sequence "\m" and only
# worked with the Windows path separator.
model_save.save(os.path.join(os.getcwd(), 'movie_genre_model'))