In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np
from ast import literal_eval

#Libraries for text preprocessing
import string 
import nltk 
from nltk.stem import WordNetLemmatizer

In [47]:
#Helper functions
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def empty_cells_in_column(df):
  for cols in df:
    x = df[cols].isnull().value_counts()
    print(x)

def remove_punctuation(text):
  punctuationfree="".join([i for i in text if i not in string.punctuation])
  return punctuationfree

def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text

def vect(x):
  l = len(x)
  v = np.zeros((l,100))
  for i in range(l):
    try:
      v[i] = embeddings_index[x[i]]
    except:
      v[i] = np.zeros((1,100))
  v = 1/l * np.sum(v, axis=0)
  return v

def replace_Word(x):
  for i in range(len(x)):
    if x[i]=="sciencefiction":
      x[i]="science"
    if x[i]=="tvmovie":
      x[i]="television"
  return x
  
def split_name(x):
  l=[]
  for i in x:
    k = i.split(" ")
    for j in k:
      l.append(j)
  return l

In [3]:
#Data file paths
METADATA_PATH = '/content/drive/MyDrive/Colab Notebooks/ee541/Project/Data/movies_metadata.csv'
CREDITS_PATH = '/content/drive/MyDrive/Colab Notebooks/ee541/Project/Data/credits.csv'

In [4]:
metadata_df = pd.read_csv(METADATA_PATH)
credits_df = pd.read_csv(CREDITS_PATH)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
#Removing irrelevant data features
metadata_df = metadata_df.drop(['homepage','imdb_id', 'tagline', 'video', 'title', 'spoken_languages','production_countries','poster_path','production_companies'], axis=1)

In [6]:
#Keeping only true false values
metadata_df = metadata_df.loc[metadata_df.adult.isin(['True','False']), :]

In [7]:
#Merging the credits csv on the metadata df
metadata_df['id'] = metadata_df['id'].astype('int')
credits_df['id'] = credits_df['id'].astype('int')
metadata_df = metadata_df.merge(credits_df, on= 'id')

In [8]:
#Converting the cast to readable code
metadata_df['crew'] = metadata_df['crew'].apply(literal_eval)
metadata_df['cast'] = metadata_df['cast'].apply(literal_eval)
metadata_df['genres'] = metadata_df['genres'].apply(literal_eval)

In [9]:
#Extracting the director,cast, genres from the credit.csv 
metadata_df['director'] = metadata_df['crew'].apply(get_director)
metadata_df['cast'] = metadata_df['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
metadata_df['cast'] = metadata_df['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
metadata_df['genres'] = metadata_df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [10]:
#Filling empty budget cells with avg value
metadata_df['budget'] = metadata_df['budget'].astype('int')
mean_budget = metadata_df['budget'].mean()
metadata_df['budget'] = metadata_df['budget'].replace( 0, mean_budget)

In [11]:
#Filling empty revenue cells with avg value
metadata_df['revenue'] = metadata_df['revenue'].replace( np.NaN, 0)
metadata_df['revenue'] = metadata_df['revenue'].astype('int')
mean_revenue = metadata_df['revenue'].mean()
metadata_df['revenue'] = metadata_df['revenue'].replace( 0, mean_revenue)

In [12]:
#Adding 0 to all empty cells
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].replace(np.NaN, "{'id':0}")
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].apply(literal_eval)
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].apply(lambda x: x['id'] if isinstance(x, dict) else [])
#Setting the value to random numbers
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].astype('int')

In [13]:
#Filtering to only Released movies
metadata_df = metadata_df[metadata_df.status == 'Released']

In [14]:
#Filtering to only english movies
metadata_df = metadata_df[metadata_df.original_language == 'en']

In [15]:
#Dropping the rows with null values
metadata_df = metadata_df.dropna(subset=['overview','release_date', 'runtime','director']) # dropped 1 example from runtime, 1 from overview, 11 from language, 87 from date

In [16]:
#Converting release date to year
metadata_df['year']= pd.to_datetime(metadata_df['release_date'], format = '%Y-%m-%d').dt.year
metadata_df['year'] = metadata_df['year'].astype('int')

In [17]:
#Removing crew and status column after extracting the parameters needed
metadata_df = metadata_df.drop(['crew','status','original_language','release_date'], axis = 1)

In [18]:
#Creating a weighted rating feature
C = metadata_df['vote_average'].mean()
m = 10
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)
metadata_df['wr'] = metadata_df.apply(weighted_rating, axis=1)

In [19]:
metadata_df = metadata_df.drop(['vote_average','vote_count'], axis = 1)

In [20]:
metadata_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_title,overview,popularity,revenue,runtime,cast,director,year,wr
0,False,10194,30000000.0,"[Animation, Comedy, Family]",862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,373554000.0,81.0,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,1995,7.696017
1,False,0,65000000.0,"[Adventure, Fantasy, Family]",8844,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,262797200.0,104.0,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,1995,6.894383
2,False,119050,4223284.0,"[Romance, Comedy]",15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,11197860.0,101.0,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,1995,6.405793
3,False,0,16000000.0,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,81452160.0,127.0,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,1995,5.97252
4,False,96871,4223284.0,[Comedy],11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,76578910.0,106.0,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,1995,5.691207


Text Preprocessing

In [21]:
#Converting true/false to 1/0
metadata_df['adult'] = metadata_df['adult'].apply(lambda x : 0 if x=="False" else 1)

In [22]:
#Converting all text to lowercase
metadata_df['cast'] = metadata_df['cast'].apply(lambda x: [str.lower(i) for i in x])
metadata_df['director'] = metadata_df['director'].apply(lambda x: [str.lower(x)])
metadata_df['genres'] = metadata_df['genres'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
metadata_df['overview']= metadata_df['overview'].apply(lambda x: x.lower())

In [23]:
#Removing punctuations and creating tokens for overview column
metadata_df['overview']= metadata_df['overview'].apply(lambda x:remove_punctuation(x))
metadata_df['overview']= metadata_df['overview'].apply(lambda x: list(x.split(" ")))

In [24]:
#Downloading stopwords and lemmetizers
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [25]:
#Removing stopwords
stopwords = nltk.corpus.stopwords.words('english')
metadata_df['overview']= metadata_df['overview'].apply(lambda x:remove_stopwords(x))

In [26]:
#Applying Lemmatization
metadata_df['overview']=metadata_df['overview'].apply(lambda x:lemmatizer(x))

In [27]:
metadata_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_title,overview,popularity,revenue,runtime,cast,director,year,wr
0,0,10194,30000000.0,"[animation, comedy, family]",862,Toy Story,"[led, woody, andys, toy, live, happily, room, ...",21.946943,373554000.0,81.0,"[tom hanks, tim allen, don rickles]",[john lasseter],1995,7.696017
1,0,0,65000000.0,"[adventure, fantasy, family]",8844,Jumanji,"[sibling, judy, peter, discover, enchanted, bo...",17.015539,262797200.0,104.0,"[robin williams, jonathan hyde, kirsten dunst]",[joe johnston],1995,6.894383
2,0,119050,4223284.0,"[romance, comedy]",15602,Grumpier Old Men,"[family, wedding, reignites, ancient, feud, ne...",11.7129,11197860.0,101.0,"[walter matthau, jack lemmon, ann-margret]",[howard deutch],1995,6.405793
3,0,0,16000000.0,"[comedy, drama, romance]",31357,Waiting to Exhale,"[cheated, mistreated, stepped, woman, holding,...",3.859495,81452160.0,127.0,"[whitney houston, angela bassett, loretta devine]",[forest whitaker],1995,5.97252
4,0,96871,4223284.0,[comedy],11862,Father of the Bride Part II,"[george, bank, recovered, daughter, wedding, r...",8.387519,76578910.0,106.0,"[steve martin, diane keaton, martin short]",[charles shyer],1995,5.691207


In [28]:
metadata_df['genres'] = metadata_df['genres'].apply(lambda x: np.NaN if len(x)==0 else x)
metadata_df['overview'] = metadata_df['overview'].apply(lambda x: np.NaN if len(x)==0 else x)
metadata_df['cast'] = metadata_df['cast'].apply(lambda x: np.NaN if len(x)==0 else x)
metadata_df['director'] = metadata_df['director'].apply(lambda x: np.NaN if len(x)==0 else x)

In [29]:
metadata_df.shape

(31218, 14)

In [30]:
metadata_df = metadata_df.dropna(subset = ['genres','overview','cast','director'])

In [31]:
metadata_df.shape

(28725, 14)

In [34]:
metadata_df = metadata_df.drop_duplicates(['id'])

In [35]:
metadata_df.shape

(28670, 14)

In [54]:
df = metadata_df.copy()

In [55]:
df = df[(df.year > 2006)]

In [56]:
df = df[(df.runtime > 75)]

In [57]:
df['popularity']=df['popularity'].astype(float)
df = df[(df.popularity > 1)]

In [58]:
df.shape

(6716, 14)

In [59]:
df.to_csv("cleaned_data.csv")

In [60]:
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_title,overview,popularity,revenue,runtime,cast,director,year,wr
11194,0,0,40000000.0,"[adventure, drama]",77221,Black Gold,"[arabian, peninsula, 1930s, two, warring, lead...",6.652197,5446000.0,130.0,"[mark strong, antonio banderas, freida pinto]",[jean-jacques annaud],2011,5.858516
11517,0,0,21000000.0,"[crime, drama]",1646,Freedom Writers,"[young, teacher, inspires, class, atrisk, stud...",13.585233,41170784.0,123.0,"[hilary swank, scott glenn, imelda staunton]",[richard lagravenese],2007,7.545479
11542,0,0,20000000.0,"[action, comedy, crime]",14396,Code Name: The Cleaner,"[cedric, entertainer, play, jake, seemingly, r...",11.107148,10337477.0,84.0,"[cedric the entertainer, lucy liu, nicollette ...",[les mayfield],2007,4.795351
11543,0,160727,13000000.0,"[drama, music]",1931,Stomp the Yard,"[death, younger, brother, troubled, 19yearold,...",5.607508,75511123.0,114.0,"[columbus short, meagan good, ne-yo]",[sylvain white],2007,6.043909
11564,0,0,4223284.0,[adventure],4283,Primeval,"[news, team, sent, burundi, capture, bring, ho...",2.454709,10597734.0,93.0,"[dominic purcell, brooke langton, orlando jones]",[michael katleman],2007,4.852562


In [None]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

Converting text to Vectors using GLoVe

In [43]:
path_to_glove_file = '/content/drive/MyDrive/Colab Notebooks/ee541/Project/Data/glove.6B.100d.txt'
embeddings_index = {}

with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [61]:
df['overview'] = df['overview'].apply(lambda x: vect(x))

In [62]:
df['genres'] = df['genres'].apply(lambda x: replace_Word(x))
df['genres'] = df['genres'].apply(lambda x: vect(x))

In [63]:
df['director']=df['director'].apply(lambda x:split_name(x))

In [64]:
df['director'] = df['director'].apply(lambda x: vect(x))

In [65]:
df['cast'] = df['cast'].apply(lambda x:split_name(x))
df['cast']= df['cast'].apply(lambda x: vect(x))

In [66]:
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_title,overview,popularity,revenue,runtime,cast,director,year,wr
11194,0,0,40000000.0,"[0.05125001072883606, 0.1583399996161461, 0.35...",77221,Black Gold,"[-0.010660814345241167, 0.1596413238439709, 0....",6.652197,5446000.0,130.0,"[0.2850535021473964, -0.08414500206708908, -0....","[0.3196450024843216, -0.10356049798429012, -0....",2011,5.858516
11517,0,0,21000000.0,"[0.38124001026153564, -0.04122999310493469, 0....",1646,Freedom Writers,"[0.3287661432155541, 0.4203878555979047, -0.11...",13.585233,41170784.0,123.0,"[-0.0871676579117775, 0.015092335641384125, 0....","[-0.16906149685382843, -0.024535000324249268, ...",2007,7.545479
11542,0,0,20000000.0,"[0.25511667629082996, -0.1865606630841891, 0.1...",14396,Code Name: The Cleaner,"[0.1028693187981844, -0.17349857176569375, 0.3...",11.107148,10337477.0,84.0,"[0.027088857655014308, -0.2822154281394822, 0....","[0.4210199937224388, -0.4264250099658966, -0.1...",2007,4.795351
11543,0,160727,13000000.0,"[0.271525003015995, 0.27035000920295715, 0.263...",1931,Stomp the Yard,"[0.1287028950587329, 0.17493366599082946, 0.11...",5.607508,75511123.0,114.0,"[-0.006006798520684243, -0.18687740452587606, ...","[0.2157049924135208, 0.003040000796318054, 0.2...",2007,6.043909
11564,0,0,4223284.0,"[-0.31349998712539673, 0.14850999414920807, 0....",4283,Primeval,"[-0.22376884010413453, 0.0019558901484641763, ...",2.454709,10597734.0,93.0,"[0.03160366974771023, -0.1400711651270588, -0....","[0.148485004901886, 0.06669499725103378, 0.095...",2007,4.852562


Normalising the data

In [67]:
normalized_df = df.copy()

In [68]:
m = max(normalized_df['belongs_to_collection'])
normalized_df['belongs_to_collection'] = normalized_df['belongs_to_collection'].apply(lambda x: x/m)

In [69]:
m = max(normalized_df['budget'])
normalized_df['budget'] = normalized_df['budget'].apply(lambda x: x/m)

In [70]:
normalized_df['popularity']=normalized_df['popularity'].astype(float)
m = max(normalized_df['popularity'])
normalized_df['popularity'] = normalized_df['popularity'].apply(lambda x: x/m)

In [71]:
m = max(normalized_df['revenue'])
normalized_df['revenue'] = normalized_df['revenue'].apply(lambda x: x/m)

In [72]:
m = max(normalized_df['runtime'])
normalized_df['runtime'] = normalized_df['runtime'].apply(lambda x: x/m)

In [73]:
m = max(normalized_df['year'])
normalized_df['year'] = normalized_df['year'].apply(lambda x: x/m)

In [74]:
m = max(normalized_df['wr'])
normalized_df['wr'] = normalized_df['wr'].apply(lambda x: x/m)

In [75]:
normalized_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,original_title,overview,popularity,revenue,runtime,cast,director,year,wr
11194,0,0.0,0.105263,"[0.05125001072883606, 0.1583399996161461, 0.35...",77221,Black Gold,"[-0.010660814345241167, 0.1596413238439709, 0....",0.01215,0.001953,0.144444,"[0.2850535021473964, -0.08414500206708908, -0....","[0.3196450024843216, -0.10356049798429012, -0....",0.997025,0.706037
11517,0,0.0,0.055263,"[0.38124001026153564, -0.04122999310493469, 0....",1646,Freedom Writers,"[0.3287661432155541, 0.4203878555979047, -0.11...",0.024814,0.014767,0.136667,"[-0.0871676579117775, 0.015092335641384125, 0....","[-0.16906149685382843, -0.024535000324249268, ...",0.995042,0.90934
11542,0,0.0,0.052632,"[0.25511667629082996, -0.1865606630841891, 0.1...",14396,Code Name: The Cleaner,"[0.1028693187981844, -0.17349857176569375, 0.3...",0.020287,0.003708,0.093333,"[0.027088857655014308, -0.2822154281394822, 0....","[0.4210199937224388, -0.4264250099658966, -0.1...",0.995042,0.57791
11543,0,0.334736,0.034211,"[0.271525003015995, 0.27035000920295715, 0.263...",1931,Stomp the Yard,"[0.1287028950587329, 0.17493366599082946, 0.11...",0.010242,0.027085,0.126667,"[-0.006006798520684243, -0.18687740452587606, ...","[0.2157049924135208, 0.003040000796318054, 0.2...",0.995042,0.728379
11564,0,0.0,0.011114,"[-0.31349998712539673, 0.14850999414920807, 0....",4283,Primeval,"[-0.22376884010413453, 0.0019558901484641763, ...",0.004484,0.003801,0.103333,"[0.03160366974771023, -0.1400711651270588, -0....","[0.148485004901886, 0.06669499725103378, 0.095...",0.995042,0.584804


In [76]:
normalized_df.shape

(6716, 14)

In [77]:
id = normalized_df['id']
title = normalized_df['original_title']
id = id.to_numpy()
title = title.to_numpy()
normalized_df = normalized_df.drop(['id','original_title'], axis=1)

In [78]:
X = normalized_df.to_numpy()

In [79]:
X.shape, X[0].shape

((6716, 12), (12,))

In [80]:
K = np.zeros((6716,409))
for i in range(6716):
  K[i][0] = X[i][0]
  K[i][1] = X[i][1]
  K[i][2] = X[i][2]

  n = X[i][3].flatten()
  for j in range(100):
    K[i][3+j] = n[j]
  
  n = X[i][4].flatten()
  for j in range(100):
    K[i][103+j] = n[j]

  K[i][203] = X[i][5]
  K[i][204] = X[i][6]
  K[i][205] = X[i][7]

  n = X[i][8].flatten()
  for j in range(100):
    K[i][206+j] = n[j]
  
  n = X[i][9].flatten()
  for j in range(100):
    K[i][306+j] = n[j]
  
  K[i][407] = X[i][10]
  K[i][408] = X[i][11]

In [82]:
import h5py
with h5py.File("movies_data.hd5", 'w') as hf: 
  hf.create_dataset('metadata', data=K)
  hf.create_dataset('id', data=id)
  hf.create_dataset('title', data=title)