# Predicting the imdb rating from the movie data


This script is written by Atluri Laxmi Narayana

This script reads the file "movie_metadata.csv" to predict the imdb score 

This code generates linear model in tensorflow


In [71]:
import pandas as pd
import tensorflow as tf
import numpy as np
import functools
from sklearn.utils import shuffle

In [72]:
df=pd.read_csv("C:\\Users\\Pavan\\desktop\\rang\\final\\movie_metadata.csv")

Target is imdb_score

Removing the data where imdb_score is na


In [73]:
df = df[np.isfinite(df['imdb_score'])]
df = shuffle(df)

We are interested in getting a count of genres and the plotkey words for the movie and how this count impacts the imdb rating
Further, we are interested in learning the effect of title length and the first alphabet/numerical of the title and how it impacts our target  

In [74]:
df["genres_count"]=df.genres.str.split('|', 0).apply(lambda x: len(x))
df["plotkeywords_count"]=df.plot_keywords.str.split('|', 0).apply(lambda x: len(str(x)))
df["genres_array"]=df.genres.str.split('|', 0)
df["plotkeyword_1"],df["plotkeyword_others"]=df.plot_keywords.str.split('|', 1).str
df["movie_title_word_length"]=df.movie_title.str.split(' ', 0).apply(lambda x: len(x))
df["movie_first_alpha_numeric"]=pd.DataFrame(df.movie_title.str.split(' ',1).tolist(),columns = ['first','other'])["first"].str[0]


colums ignored are
movie_imdb_link
plot_keywords
movie_title
genres


In [75]:
df=df.drop("genres",1)
df=df.drop("movie_imdb_link",1)
df=df.drop("movie_title",1)
df=df.drop("plot_keywords",1)
df=df.drop("plotkeyword_others",1)
df.aspect_ratio=df.aspect_ratio.astype(str)

Converting the target rating into bins
This is done because the ratings in the data vary by .1 but this does not necessarily have a huge impact with the audiance

For example, a movie with 9.1 rating is as good as a movie with 9.2 rating.So, Ideally we will have to convert imdb rating into fewer bins

Since linear model accepts target of type boolean (n_classes= 2). we have converted imdb score into a column which indicates if its greater than 5

Any movie with rating greater than 5 is indicated with 1
Less than 5 is indicated with 0


In [76]:
df.imdb_score=pd.cut(df['imdb_score'],[0,5,10],labels=False)


Split the data into train/test subsets. 60 train 40 test

In [77]:
x_train = df.sample(frac=.6, random_state=1232)
x_test = df.drop(x_train.index)

x_train.to_csv("C:\\Users\\Pavan\\desktop\\rang\\tf2\\x_train.csv", sep=',', encoding='utf-8',header=False,na_rep='',index=False)
x_test.to_csv("C:\\Users\\Pavan\\desktop\\rang\\tf2\\x_test.csv", sep=',', encoding='utf-8',header=False,na_rep='',index=False)


In [78]:
_CSV_COLUMNS =df.columns.values.tolist()

_CSV_COLUMN_DEFAULTS=[]

In [79]:
for column in df.dtypes:
   if(column=="int64"):
        _CSV_COLUMN_DEFAULTS.append([0])
   elif(column=="float64"):
        _CSV_COLUMN_DEFAULTS.append([0.00])
   else:
        _CSV_COLUMN_DEFAULTS.append([''])


In [80]:
#Quality Check. The value of below code must be true 
len(_CSV_COLUMNS)==len(_CSV_COLUMN_DEFAULTS)

True

Declaing the features of the model 

In [81]:
genres_count = tf.feature_column.numeric_column ('genres_count')
plotkeywords_count = tf.feature_column.numeric_column ('plotkeywords_count')
color=tf.feature_column.categorical_column_with_vocabulary_list('color', ['Color' ' Black and White' ''])
language=tf.feature_column.categorical_column_with_vocabulary_list('language',['English' 'Cantonese' 'Romanian' 'Spanish' 'Japanese' 'Mandarin' 'Norwegian' 'Portuguese' 'Indonesian' 'French' 'Dutch' 'Polish' 'Danish' 'Hindi' '' 'German' 'Hebrew' 'Dari' 'Hungarian' 'None' 'Arabic' 'Swedish' 'Korean' 'Zulu' 'Chinese' 'Persian' 'Italian' 'Aboriginal' 'Vietnamese' 'Telugu' 'Thai' 'Filipino' 'Mongolian' 'Aramaic' 'Kazakh' 'Maya' 'Czech' 'Dzongkha' 'Russian' 'Icelandic' 'Bosnian' 'Greek'] )
country=tf.feature_column.categorical_column_with_vocabulary_list('country', ['USA' 'Germany' 'Hong Kong' 'UK' 'Romania' 'France' 'Mexico' 'Japan' 'China' 'Philippines' 'Norway' 'Brazil' 'Australia' 'New Zealand' 'Indonesia' 'Netherlands' 'Poland' 'Canada' 'Spain' 'Denmark' 'India' 'Argentina' 'Russia' 'Ireland' 'Czech Republic' 'Israel' 'Hungary' 'South Korea' 'South Africa' 'Sweden' 'Official site' 'Italy' 'Taiwan' 'Cameroon' 'Thailand' 'Georgia' 'Iran' 'New Line' 'Aruba' 'Afghanistan' 'Belgium' 'Iceland' 'Greece' 'Chile' 'Peru' 'West Germany' 'Colombia' 'Finland'])
content_rating=tf.feature_column.categorical_column_with_vocabulary_list('content_rating',['PG-13' 'R' 'PG' 'G' 'Not Rated' 'Approved' 'Unrated' '' 'TV-MA' 'Passed' 'X' 'M' 'GP' 'NC-17'] )
aspect_ratio = tf.feature_column.categorical_column_with_vocabulary_list('aspect_ratio',['2.35','1.85','2.39','1.66','','1.77','1.33','2.','2.76','1.78','1.37','2.2','1.75','2.55','2.4','2.24','1.5','16.','1.44','1.18'])

director_name = tf.feature_column.categorical_column_with_hash_bucket('director_name', hash_bucket_size=5000)
actor_2_name = tf.feature_column.categorical_column_with_hash_bucket('actor_2_name', hash_bucket_size=5000)
actor_1_name = tf.feature_column.categorical_column_with_hash_bucket('actor_1_name', hash_bucket_size=5000)
actor_3_name = tf.feature_column.categorical_column_with_hash_bucket('actor_3_name', hash_bucket_size=5000)
genres_array = tf.feature_column.categorical_column_with_hash_bucket('genres_array', hash_bucket_size=len(set(functools.reduce(list.__add__, df.genres_array))))
plotkeyword_1 = tf.feature_column.categorical_column_with_hash_bucket('plotkeyword_1', hash_bucket_size=len(set(df.plotkeyword_1)))
movie_first_alpha_numeric = tf.feature_column.categorical_column_with_hash_bucket('movie_first_alpha_numeric', hash_bucket_size=len(set(df.movie_first_alpha_numeric)))
num_critic_for_reviews = tf.feature_column.numeric_column ('num_critic_for_reviews')
duration = tf.feature_column.numeric_column ('duration')
director_facebook_likes = tf.feature_column.numeric_column ('director_facebook_likes')
actor_3_facebook_likes = tf.feature_column.numeric_column ('actor_3_facebook_likes')
actor_1_facebook_likes = tf.feature_column.numeric_column ('actor_1_facebook_likes')
gross = tf.feature_column.numeric_column ('gross')
num_voted_users = tf.feature_column.numeric_column ('num_voted_users')
cast_total_facebook_likes = tf.feature_column.numeric_column ('cast_total_facebook_likes')
facenumber_in_poster = tf.feature_column.numeric_column ('facenumber_in_poster')
num_user_for_reviews = tf.feature_column.numeric_column ('num_user_for_reviews')
budget = tf.feature_column.numeric_column ('budget')
title_year = tf.feature_column.numeric_column ('title_year')
actor_2_facebook_likes = tf.feature_column.numeric_column ('actor_2_facebook_likes')
movie_facebook_likes = tf.feature_column.numeric_column ('movie_facebook_likes')
movie_title_word_length = tf.feature_column.numeric_column ('movie_title_word_length')


#Creating tensor flow buckets based on the numeric columns above

In [82]:
num_critic_for_reviews_buckets = tf.feature_column.bucketized_column(num_critic_for_reviews,boundaries=[35.0, 60.0, 81.0, 105.0, 130.0, 159.0, 193.0, 241.0, 323.0, 813.0])
duration_buckets = tf.feature_column.bucketized_column(duration,boundaries=[88.0, 93.0, 97.0, 101.0, 105.0, 110.0, 116.0, 123.0, 135.0, 334.0])
director_facebook_likes_buckets = tf.feature_column.bucketized_column(director_facebook_likes,boundaries=[4.0, 15.0, 32.0, 57.0, 98.0, 165.0, 301.0, 571.0, 23000.0])
actor_3_facebook_likes_buckets = tf.feature_column.bucketized_column(actor_3_facebook_likes,boundaries=[48.0, 122.0, 218.0, 311.0, 416.0, 511.0, 612.0, 745.0, 924.5, 23000.0])
actor_1_facebook_likes_buckets = tf.feature_column.bucketized_column(actor_1_facebook_likes,boundaries=[353.0, 600.8, 794.2, 939.0, 1000.0, 3000.0, 11000.0, 14000.0, 20000.0, 640000.0])
gross_buckets = tf.feature_column.bucketized_column(gross,boundaries=[382601.8, 3023265.0, 8131359.8, 15802119.0, 25517500.0, 36565306.4, 51798114.8, 75601728.0, 125025163.2, 760505847.0])
num_voted_users_buckets = tf.feature_column.bucketized_column(num_voted_users,boundaries=[4553.4, 11232.6, 20003.2, 31266.6, 46396.0, 66462.8, 93759.4, 145332.6, 247440.2, 1689764.0])
cast_total_facebook_likes_buckets = tf.feature_column.bucketized_column(cast_total_facebook_likes,boundaries=[725.8, 1384.2, 2049.6, 2731.0, 3697.0, 5708.8, 13332.2, 18452.8, 28099.6, 656730.0])
facenumber_in_poster_buckets = tf.feature_column.bucketized_column(facenumber_in_poster,boundaries=[1.0, 2.0, 4.0, 43.0])
num_user_for_reviews_buckets = tf.feature_column.bucketized_column(num_user_for_reviews,boundaries=[40.0, 77.0, 111.0, 147.0, 190.0, 247.0, 320.9, 442.6, 687.0, 5060.0])
budget_buckets = tf.feature_column.bucketized_column(budget,boundaries=[2600000.0, 7000000.0, 12000000.0, 17000000.0, 24000000.0, 30000000.0, 40000000.0, 60000000.0, 93000000.0, 12215500000.0])
title_year_buckets = tf.feature_column.bucketized_column(title_year,boundaries=[1993.0, 1998.0, 2000.0, 2003.0, 2005.0, 2007.0, 2009.0, 2011.0, 2013.0, 2016.0])
actor_2_facebook_likes_buckets = tf.feature_column.bucketized_column(actor_2_facebook_likes,boundaries=[114.2, 269.0, 417.6, 542.8, 650.0, 793.2, 899.0, 1000.0, 5000.0, 137000.0])
movie_facebook_likes_buckets = tf.feature_column.bucketized_column(movie_facebook_likes,boundaries=[182.0, 613.8, 1000.0, 14000.0, 27200.0, 349000.0])
genres_count_buckets = tf.feature_column.bucketized_column(genres_count,boundaries=[2.0, 3.0, 4.0, 8.0])
plotkeywords_count_buckets = tf.feature_column.bucketized_column(plotkeywords_count,boundaries=[51.0, 55.0, 59.0, 62.0, 66.0, 70.0, 74.6, 80.0, 90.0, 165.0])
movie_title_word_length_buckets = tf.feature_column.bucketized_column(movie_title_word_length,boundaries=[2.0, 3.0, 4.0, 5.0, 13.0])


Defining the Input function

In [83]:
def input_fn(data_file, num_epochs, shuffle, batch_size):
  """Generate an input function for the Estimator."""
  assert tf.gfile.Exists(data_file), (
      '%s not found. Please make sure you have either run data_download.py or '
      'set both arguments --train_data and --test_data.' % data_file)
  def parse_csv(data_file):
    print('Parsing', data_file)
    columns = tf.decode_csv(data_file, record_defaults=_CSV_COLUMN_DEFAULTS)
    features = dict(zip(_CSV_COLUMNS, columns))
    labels = features.pop('imdb_score')
    return features,tf.equal(labels, 1)
  # Extract lines from input files using the Dataset API.
  
  dataset = tf.data.TextLineDataset(data_file)
  dataset = dataset.map(parse_csv, num_parallel_calls=5)
  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)
  iterator = dataset.make_one_shot_iterator()
  features, labels = iterator.get_next()
  return features, labels


In [84]:
# Defining the base columns
base_columns = [genres_count_buckets,plotkeywords_count_buckets,plotkeyword_1,color,language,country,content_rating,director_name,actor_2_name,actor_1_name,actor_3_name,num_critic_for_reviews_buckets,duration_buckets,director_facebook_likes_buckets,actor_3_facebook_likes_buckets,actor_2_facebook_likes_buckets,actor_1_facebook_likes_buckets,num_voted_users_buckets,facenumber_in_poster_buckets,budget_buckets,title_year_buckets,gross_buckets,aspect_ratio]
#note user reviews and movie facebook likes and cast facebook likes are not considered for analysis

In [85]:
#defining the linear model
model_dir = "tmp\\modelL2"

model = tf.estimator.LinearClassifier(model_dir=model_dir, feature_columns=base_columns,optimizer=tf.train.FtrlOptimizer(learning_rate=0.1,l1_regularization_strength=1.0,l2_regularization_strength=1.0))


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'tmp\\modelL2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002951B6F0D68>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [86]:
#training the model 
model.train(input_fn=lambda: input_fn("C:\\Users\\Pavan\\desktop\\rang\\tf2\\x_train.csv", 40, True,10))


Parsing Tensor("arg0:0", shape=(), dtype=string)
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from tmp\modelL2\model.ckpt-12104
INFO:tensorflow:Saving checkpoints for 12105 into tmp\modelL2\model.ckpt.
INFO:tensorflow:loss = 0.488129, step = 12105
INFO:tensorflow:global_step/sec: 50.0397
INFO:tensorflow:loss = 0.625223, step = 12205 (1.999 sec)
INFO:tensorflow:global_step/sec: 244.325
INFO:tensorflow:loss = 1.70006, step = 12305 (0.408 sec)
INFO:tensorflow:global_step/sec: 263.665
INFO:tensorflow:loss = 1.58838, step = 12405 (0.379 sec)
INFO:tensorflow:global_step/sec: 277.581
INFO:tensorflow:loss = 0.771451, step = 12505 (0.360 sec)
INFO:tensorflow:global_step/sec: 323.394
INFO:tensorflow:loss = 0.706963, step = 12605 (0.310 sec)
INFO:tensorflow:global_step/sec: 310.339
INFO:tensorflow:loss = 0.176705, step = 12705 (0.321 sec)
INFO:tensorflow:global_step/sec: 311.305
INFO:tensorflow:loss = 0.272893, step = 12805 (0.320 sec)
INFO:tensorflow:global_st

INFO:tensorflow:loss = 0.280614, step = 20205 (0.321 sec)
INFO:tensorflow:global_step/sec: 289.649
INFO:tensorflow:loss = 0.466659, step = 20305 (0.345 sec)
INFO:tensorflow:global_step/sec: 299.189
INFO:tensorflow:loss = 0.811704, step = 20405 (0.335 sec)
INFO:tensorflow:global_step/sec: 290.492
INFO:tensorflow:loss = 0.857785, step = 20505 (0.343 sec)
INFO:tensorflow:global_step/sec: 311.304
INFO:tensorflow:loss = 0.508547, step = 20605 (0.322 sec)
INFO:tensorflow:global_step/sec: 288.813
INFO:tensorflow:loss = 0.143761, step = 20705 (0.345 sec)
INFO:tensorflow:global_step/sec: 305.593
INFO:tensorflow:loss = 0.721685, step = 20805 (0.328 sec)
INFO:tensorflow:global_step/sec: 301.9
INFO:tensorflow:loss = 0.459039, step = 20905 (0.330 sec)
INFO:tensorflow:global_step/sec: 296.525
INFO:tensorflow:loss = 0.36966, step = 21005 (0.337 sec)
INFO:tensorflow:global_step/sec: 319.262
INFO:tensorflow:loss = 0.396955, step = 21105 (0.313 sec)
INFO:tensorflow:global_step/sec: 311.304
INFO:tensorfl

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x2951b6f0c50>

In [87]:
#Evaluating the model
results = model.evaluate(input_fn=lambda: input_fn("C:\\Users\\Pavan\\desktop\\rang\\tf2\\x_test.csv", 1, False, 10))


Parsing Tensor("arg0:0", shape=(), dtype=string)
INFO:tensorflow:Starting evaluation at 2018-02-06-16:37:53
INFO:tensorflow:Restoring parameters from tmp\modelL2\model.ckpt-24208
INFO:tensorflow:Finished evaluation at 2018-02-06-16:38:04
INFO:tensorflow:Saving dict for global step 24208: accuracy = 0.950917, accuracy_baseline = 0.895885, auc = 0.909848, auc_precision_recall = 0.984473, average_loss = 0.16652, global_step = 24208, label/mean = 0.895885, loss = 1.66273, prediction/mean = 0.900645


In [88]:
for key in sorted(results):
  print('%s: %s' % (key, results[key]))


accuracy: 0.950917
accuracy_baseline: 0.895885
auc: 0.909848
auc_precision_recall: 0.984473
average_loss: 0.16652
global_step: 24208
label/mean: 0.895885
loss: 1.66273
prediction/mean: 0.900645


In [89]:
#To check predicted results 
#predict_results = model.predict(input_fn=lambda: input_fn("C:\\Users\\Pavan\\desktop\\rang\\tf2\\x_test.csv", 1, True,1))