In [17]:
import numpy as np
import tensorflow as tf
import keras as k
from keras import Sequential
from keras.layers import Embedding
from tensorflow.contrib.tensorboard.plugins import projector
%matplotlib inline
import matplotlib.pyplot as plt
import os
import math

import warnings
warnings.simplefilter('ignore')

In [2]:
log_dir = '/home/steich/NF_Prize_Data/log'

In [3]:
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

Basic idea to be explored here: the main distinguishing feature of a movie is the people involved in its' making.  If we have crew/cast embeddings we can use those (as a sum-of, or similar to making a document or paragraph vector) to make meaningful movie embeddings.

Cast/Crew embeddings can be made using IMDB data. A Bag of Words model allows us to create embeddings by predicting, for each movie, other cast and crew that worked on that movie. 

Sources:
* https://github.com/tensorflow/tensorflow/blob/r1.6/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
* https://www.tensorflow.org/tutorials/word2vec
* https://cs.stanford.edu/~quocle/paragraph_vector.pdf
* https://arxiv.org/pdf/1301.3781.pdf
* https://arxiv.org/pdf/1507.07998.pdf
* https://www.imdb.com/interfaces/


In [5]:
#First, get the data out of the .tsv files

#using the name.basics.tsv file we can create a 
#cast/crew id -> name dictionary
id_to_name_dict = {}
scalar_to_id_dict = {}
name_count = 1
with open('/home/steich/NF_Prize_Data/data/imdb/name.basics.tsv') as iFile:
    for line in iFile:
        fields = line.split('\t')
        id_to_name_dict[fields[0]] = [fields[1], name_count]
        scalar_to_id_dict[name_count] = fields[0]
        name_count += 1 #scalar id for network

#we can also create a movie_id -> name dictionary
id_to_movie_dict = {}
movie_count = 0
with open('data/imdb/title.basics.tsv') as iFile:
    for line in iFile:
        fields = line.split('\t')
        id_to_movie_dict[fields[0]] = [fields[2], movie_count]
        movie_count += 1

In [6]:
#now we need to build a dictionary of people involved in each movie

#title.principals.tsv has lead/billed actors and directors
#title.crew.tsv has other notable figures who worked on each movie
# - it looks like there is some overlap
#we want a dict like this: movie_dict["movie_id"]=["person_id1", ...]
movie_dict = {}

#with open('data/imdb/title.crew.tsv') as crewFile:
#    for line in crewFile:
#        fields = line.split('\t')
#        directors = []
#        writers = []
#        if '\\' + 'N' not in fields[1]:
#            directors = fields[1].split(',')
#        if '\\' + 'N' not in fields[2]:
#            writers = fields[2].split(',')
#        movie_dict[fields[0]] = directors + writers
        
with open('/home/steich/NF_Prize_Data/data/imdb/title.principals.tsv') as prinFile:
    for line in prinFile:        
        fields = line.split('\t')
        if not movie_dict.get(fields[0]):
            movie_dict[fields[0]] = []
        if fields[2] not in movie_dict[fields[0]]:
            movie_dict[fields[0]].append(fields[2])
            if not id_to_name_dict.get(fields[2]): 
                #looks like there's some unknown/unnamed people
                id_to_name_dict[fields[2]] = ('UNK', name_count)
                name_count += 1


In [10]:
#since we want bag of words training, our input is a name scalar, 
#and our output is a list of the name scalars associted with that name
#on the production the batch was taken from

x_train = []
y_train = []

for movie in movie_dict:
    for person in movie_dict[movie]:
        y = id_to_name_dict[person][1]
        x = [id_to_name_dict[p][1] for p in movie_dict[movie] if p != person]
        if len(x) < 9:
            x += [0 for i in range(9 - len(x))]
        x_train.append(x)
        y_train.append(y)


In [8]:
epochs = 100
batch_size = 32
embedding_size = 128

In [11]:
i = 0
x_train_batches = []
y_train_batches = []
while i + batch_size < len(x_train):
    x_train_batches.append(x_train[i:i+batch_size])
    y_train_batches.append(y_train[i:i+batch_size])
    i+=batch_size

In [12]:
x_train_batches = np.array(x_train_batches)
y_train_batches = np.array(y_train_batches)

In [13]:
x_train_batches.shape

(853718, 32, 9)

In [14]:
name_count

8470526

In [16]:
def nf_onehot_transformation_function(c, class_num):
    
    one_hot = np.zeros(class_num)
    one_hot[c - 1] = 1
    return one_hot

In [17]:
def generate_batch():
    curr_batch = np.random.choice(x_train_batches.shape[0])
    return (x_train_batches[curr_batch], y_train_batches[curr_batch])

In [None]:
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

opt = k.optimizers.SGD()

model = Sequential()
model.add(Dense(embedding_size, activation='relu', input_shape=(name_count * 9, ), name="Embedding_Layer"))
model.add(Dense())

In [28]:
graph = tf.Graph()
with graph.as_default():
    
    #input data here
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size, 9])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size])
        #valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        
    with tf.device('/gpu:0'):
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                tf.random_uniform([name_count, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
            
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal(
                    [name_count, embedding_size],
                    stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([name_count]))
            
    
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_biases,
                inputs=embed,
                labels=train_labels,
                num_sampled=num_sampled,
                num_classes=name_count))
        
    tf.summary.scalar('loss', loss)
    
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    
    #valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    #similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    
    merged = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    
    saver = tf.train.Saver()

num_steps = 100001

with tf.Session(graph=graph) as session:
    
    writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
    
    init.run()
    print('Initialized')
    
    average_loss = 0
    
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch()
        
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        run_metadata = tf.RunMetadata()
        
        _, summary, loss_val = session.run(
            [optimizer, merged, loss],
            feed_dict=feed_dict,
            run_metadata=run_metatdata)
        average_loss += loss_val
        
        writer.add_summary(summary, step)
        if step == (num_steps - 1):
            writer.add_run_metatdata(run_metadata, 'step%d' % step)
            
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
                
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
            
            
            
    final_embeddings = normalized_embeddings.eval()
    saver.save(session, os.path.join(log_dir, 'metadata.tsv'))
    
writer.close()


ValueError: Shape must be rank 2 but is rank 1 for 'loss/nce_loss/LogUniformCandidateSampler' (op: 'LogUniformCandidateSampler') with input shapes: [32].