In [24]:
#Import necessary libraries
import pandas as pd
import numpy as np
from ast import literal_eval

#Libraries for text preprocessing
import string 
import nltk 
from nltk.stem import WordNetLemmatizer

In [41]:
#Helper functions
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def empty_cells_in_column(df):
  for cols in df:
    x = df[cols].isnull().value_counts()
    print(x)

def remove_punctuation(text):
  punctuationfree="".join([i for i in text if i not in string.punctuation])
  return punctuationfree

def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text

In [3]:
#Data file paths
METADATA_PATH = '/content/drive/MyDrive/Colab Notebooks/ee541/Project/Data/movies_metadata.csv'
CREDITS_PATH = '/content/drive/MyDrive/Colab Notebooks/ee541/Project/Data/credits.csv'

In [4]:
metadata_df = pd.read_csv(METADATA_PATH)
credits_df = pd.read_csv(CREDITS_PATH)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
#Removing irrelevant data features
metadata_df = metadata_df.drop(['homepage','imdb_id', 'original_title', 'tagline', 'video', 'title', 'spoken_languages','production_countries','poster_path','production_companies'], axis=1)

In [6]:
#Keeping only true false values
metadata_df = metadata_df.loc[metadata_df.adult.isin(['True','False']), :]

In [7]:
##Merging the credits csv on the metadata df
metadata_df['id'] = metadata_df['id'].astype('int')
credits_df['id'] = credits_df['id'].astype('int')
metadata_df = metadata_df.merge(credits_df, on= 'id')

In [8]:
#Converting the cast to readable code
metadata_df['crew'] = metadata_df['crew'].apply(literal_eval)
metadata_df['cast'] = metadata_df['cast'].apply(literal_eval)
metadata_df['genres'] = metadata_df['genres'].apply(literal_eval)

In [9]:
#Extracting the director,cast, genres from the credit.csv 
metadata_df['director'] = metadata_df['crew'].apply(get_director)
metadata_df['cast'] = metadata_df['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
metadata_df['cast'] = metadata_df['cast'].apply(lambda x: x[:5] if len(x) >=5 else x)
metadata_df['genres'] = metadata_df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [10]:
#Filling empty budget cells with avg value
metadata_df['budget'] = metadata_df['budget'].astype('int')
mean_budget = metadata_df['budget'].mean()
metadata_df['budget'] = metadata_df['budget'].replace( 0, mean_budget)

In [11]:
#Filling empty revenue cells with avg value
metadata_df['revenue'] = metadata_df['revenue'].replace( np.NaN, 0)
metadata_df['revenue'] = metadata_df['revenue'].astype('int')
mean_revenue = metadata_df['revenue'].mean()
metadata_df['revenue'] = metadata_df['revenue'].replace( 0, mean_revenue)

In [12]:
#Adding 0 to all empty cells
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].replace(np.NaN, "{'id':0}")
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].apply(literal_eval)
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].apply(lambda x: x['id'] if isinstance(x, dict) else [])

In [13]:
#Setting the value to random numbers
collection_set = set(metadata_df['belongs_to_collection'].unique().tolist())
metadata_df['belongs_to_collection'].astype('int')
iter_list = iter(list(set(range(0, 480160))-collection_set))
metadata_df['belongs_to_collection'] = metadata_df['belongs_to_collection'].apply(lambda x: next(iter_list) if x == 0 else x)

In [14]:
#Filtering to only Released movies
metadata_df = metadata_df[metadata_df.status == 'Released']

In [15]:
#Filtering to only english movies
metadata_df = metadata_df[metadata_df.original_language == 'en']

In [16]:
#Dropping the rows with null values
metadata_df = metadata_df.dropna(subset=['overview','release_date', 'runtime','director']) # dropped 1 example from runtime, 1 from overview, 11 from language, 87 from date

In [17]:
#Converting release date to year
metadata_df['Year']= pd.to_datetime(metadata_df['release_date'], format = '%Y-%m-%d').dt.year
metadata_df['Year'] = metadata_df['Year'].astype('int')

In [18]:
#Removing crew and status column after extracting the parameters needed
metadata_df = metadata_df.drop(['crew','status','original_language','release_date'], axis = 1)

In [42]:
#Creating a weighted rating feature
C = metadata_df['vote_average'].mean()
m = 10
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)
metadata_df['wr'] = metadata_df.apply(weighted_rating, axis=1)

In [44]:
metadata_df = metadata_df.drop(['vote_average','vote_count'], axis = 1)

Text Preprocessing

In [19]:
#Converting true/false to 1/0
metadata_df['adult'] = metadata_df['adult'].apply(lambda x : 0 if x=="False" else 1)

In [20]:
#Converting all text to lowercase
metadata_df['cast'] = metadata_df['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
metadata_df['director'] = metadata_df['director'].apply(lambda x: [str.lower(x.replace(" ", ""))])
metadata_df['genres'] = metadata_df['genres'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
metadata_df['overview']= metadata_df['overview'].apply(lambda x: x.lower())

In [22]:
#Removing punctuations and creating tokens for overview column
metadata_df['overview']= metadata_df['overview'].apply(lambda x:remove_punctuation(x))
metadata_df['overview']= metadata_df['overview'].apply(lambda x: list(x.split(" ")))

In [25]:
#Downloading stopwords and lemmetizers
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [30]:
#Removing stopwords
stopwords = nltk.corpus.stopwords.words('english')
metadata_df['overview']= metadata_df['overview'].apply(lambda x:remove_stopwords(x))

In [31]:
#Applying Lemmatization
metadata_df['overview']=metadata_df['overview'].apply(lambda x:lemmatizer(x))

In [46]:
metadata_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,id,overview,popularity,revenue,runtime,cast,director,Year,wr
0,0,10194,30000000.0,"[animation, comedy, family]",862,"[led, woody, andys, toy, live, happily, room, ...",21.946943,373554000.0,81.0,"[tomhanks, timallen, donrickles, jimvarney, wa...",[johnlasseter],1995,7.696017
1,0,1,65000000.0,"[adventure, fantasy, family]",8844,"[sibling, judy, peter, discover, enchanted, bo...",17.015539,262797200.0,104.0,"[robinwilliams, jonathanhyde, kirstendunst, br...",[joejohnston],1995,6.894383
2,0,119050,4223284.0,"[romance, comedy]",15602,"[family, wedding, reignites, ancient, feud, ne...",11.7129,11197860.0,101.0,"[waltermatthau, jacklemmon, ann-margret, sophi...",[howarddeutch],1995,6.405793
3,0,2,16000000.0,"[comedy, drama, romance]",31357,"[cheated, mistreated, stepped, woman, holding,...",3.859495,81452160.0,127.0,"[whitneyhouston, angelabassett, lorettadevine,...",[forestwhitaker],1995,5.97252
4,0,96871,4223284.0,[comedy],11862,"[george, bank, recovered, daughter, wedding, r...",8.387519,76578910.0,106.0,"[stevemartin, dianekeaton, martinshort, kimber...",[charlesshyer],1995,5.691207
