In [151]:
import pandas as pd
import functools as ft
import numpy as np
import matplotlib.pylab as plt
import string
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [152]:
# Directory holding the raw movie CSV files. The trailing slash matters:
# the loaders build each path as f'{FILEPATH}<name>.csv'.
FILEPATH = "../data/movie-set-data/"

In [153]:
# Read every CSV of the data set from FILEPATH. The file stems are listed in
# the same order as the target names on the left-hand side.
(credits_df, keywords_df, links_df, links_small_df,
 movie_meta_df, ratings_df, ratings_small_df) = (
    pd.read_csv(f'{FILEPATH}{stem}.csv')
    for stem in ('credits', 'keywords', 'links', 'links_small',
                 'movies_metadata', 'ratings', 'ratings_small')
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [154]:
# Rows without a vote_count are treated as corrupted and removed.
filtered_rows = movie_meta_df.index[movie_meta_df['vote_count'].isna()]
movie_meta_df = movie_meta_df.drop(index=filtered_rows)

# Cast the id column to int once those rows have been dropped.
movie_meta_df['id'] = movie_meta_df['id'].astype('int')

In [155]:
# Remove every row flagged as adult (the flag is compared against the
# string 'True'), then discard the adult column, which is no longer needed.
filtered_rows = movie_meta_df.index[movie_meta_df['adult'].eq('True')]
movie_meta_df = movie_meta_df.drop(index=filtered_rows).drop(columns='adult')

In [156]:
# Keep only movies whose status is exactly 'Released', then discard the
# status column, which is no longer needed.
filtered_rows = movie_meta_df.index[movie_meta_df['status'].ne('Released')]
movie_meta_df = movie_meta_df.drop(index=filtered_rows).drop(columns='status')

In [157]:
# Inspect the distinct values stored in the video column before filtering on it.
pd.unique(movie_meta_df['video'])

array([False, True], dtype=object)

In [158]:
# Remove every row whose video flag is True, then discard the video column,
# which is no longer needed.
filtered_rows = movie_meta_df.index[movie_meta_df['video'].eq(True)]
movie_meta_df = movie_meta_df.drop(index=filtered_rows).drop(columns='video')

# Cleaning JSON columns

In [159]:
# Join keywords, credits and movie metadata on their shared 'id' column
# (pandas' default inner join) to build the frame used for text mining.
text_df = keywords_df.merge(credits_df, on='id').merge(movie_meta_df, on='id')

# Discard the columns that are not needed and expose 'id' as 'movieId'.
unused_columns = ['belongs_to_collection', 'homepage', 'imdb_id',
                  'poster_path', 'original_title']
text_df = text_df.drop(columns=unused_columns).rename(columns={'id': 'movieId'})

In [160]:
# Keep only the columns used downstream. .copy() makes small_text_df an
# independent frame, so later in-place edits never touch text_df.
selected_columns = [
    'movieId', 'keywords', 'cast', 'crew', 'genres',
    'popularity', 'production_companies',
    'production_countries', 'revenue', 'runtime',
    'vote_average', 'vote_count',
]
small_text_df = text_df[selected_columns].copy()

In [161]:
# dropping duplicates and missing values prior to vectorizing
# keep='first' retains one copy of each duplicated row. The previous
# keep=False discarded *every* copy, so any movie that appeared more than
# once after the merges vanished from the data set entirely.
small_text_df = small_text_df.drop_duplicates(keep='first').dropna()

In [162]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [163]:
# Function to remove punctuation
def remove_punctuation(text):
    """Return str(text) with every character of string.punctuation removed.

    The argument is converted with str() first, so non-string values
    (numbers, None, ...) are accepted and processed as their string form.
    """
    # A translation table that maps each punctuation character to "deleted".
    strip_table = str.maketrans('', '', string.punctuation)
    return str(text).translate(strip_table)

# Strip punctuation from each text column, overwriting the column with the
# cleaned strings.
for text_col in ['keywords', 'cast', 'crew', 'genres',
                 'production_companies', 'production_countries']:
    small_text_df[text_col] = small_text_df[text_col].apply(remove_punctuation)

Tokenizing Words

In [164]:
# function to tokenize the text
def tokenize(text):
    """Split text into a list of word tokens.

    A token is a maximal run of word characters (regex ``\\w+``); everything
    else acts as a separator. Empty strings are never returned, so an empty
    or separator-only input yields ``[]`` rather than ``['']``.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list of str
        The tokens in their original order.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.findall(r'\w+', text)

# Tokenize each text column. No new column is created: every column is
# overwritten with its list of tokens.
for text_col in ['keywords', 'cast', 'crew', 'genres',
                 'production_companies', 'production_countries']:
    small_text_df[text_col] = small_text_df[text_col].apply(tokenize)

Removing stop words using spaCy's stop-word list

In [165]:
# loading the small English spaCy model to get its default stop-word list
en = spacy.load('en_core_web_sm')
# Work on a private copy: en.Defaults.stop_words is a set shared through
# spaCy's language defaults, and adding to it in place would change it for
# every other user of those defaults in this kernel.
stop_words = set(en.Defaults.stop_words)
# adding id to stop word list due to high repetition
stop_words.add('id')
print(stop_words)

{'whatever', 'their', 'everywhere', 'afterwards', 'without', 'would', 'becoming', 'name', 'such', 'they', 'except', 'empty', 'should', 'third', 'few', 'anyhow', 'less', 'together', 'this', 'front', 'thereby', 'she', 'former', 'meanwhile', 'further', 'once', '’ll', 'get', 'when', 'moreover', 'give', 'but', 'done', 'neither', 'ourselves', 'thru', 'can', 'its', 'nothing', "'re", 'does', 'could', '‘re', 'various', '‘s', 'yet', 'above', 'herself', 'six', 'his', 'myself', 'wherever', '‘ve', 'quite', 'seemed', 'mine', 'noone', 'others', 'full', 'themselves', 'thus', 'part', 'nevertheless', 'therein', 'everyone', 'was', 'most', 'who', 'him', 'again', 'itself', 'go', 'whereas', 'somehow', 'nobody', 'ours', 'whose', '’m', 'also', 'during', 'be', 'just', 'about', 'some', 'elsewhere', 'thence', 'it', 'twelve', 'three', 'other', 'serious', 'along', 'behind', 'these', 'sometime', 'several', 'hereupon', 'which', 'show', 'twenty', 'hundred', 'keep', 'otherwise', 'really', 'something', 'to', 'yourself'

In [166]:
# function to remove stopwords
def remove_stopwords(tokenized_list):
    """Return the tokens of tokenized_list that are not in stop_words.

    Order is preserved and the comparison is an exact (case-sensitive)
    membership test against the notebook-level stop_words collection.
    """
    return [token for token in tokenized_list if token not in stop_words]

# Remove stop words from each tokenized column. No new column is created:
# every column is overwritten with its filtered token list.
for text_col in ['keywords', 'cast', 'crew', 'genres',
                 'production_companies', 'production_countries']:
    small_text_df[text_col] = small_text_df[text_col].apply(remove_stopwords)

# Use each movie's average rating as its rating?

In [167]:
# Preview the first five rows of the raw ratings table.
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [168]:
# Discard userId and timestamp; they are not needed for the per-movie average.
ratings_df = ratings_df.drop(columns=['userId', 'timestamp'])

In [169]:
# Mean of the remaining columns per movieId, rounded to two decimals.
avg_ratings = ratings_df.groupby('movieId').mean().round(2)

In [170]:
# Attach the per-movie average rating to the text features and name the
# merged 'rating' column 'avg_rating'.
# NOTE(review): small_text_df's movieId was renamed from the metadata 'id'
# column, while avg_ratings' movieId comes from ratings.csv. Confirm both
# use the same id space; links.csv is loaded but never used and may hold
# the mapping between them.
final_df = (
    small_text_df
    .merge(avg_ratings, on='movieId')
    .rename(columns={'rating': 'avg_rating'})
)

In [171]:
# Persist the modeling table without the index. The token-list columns are
# written as their string representation.
final_df.to_csv(path_or_buf='../data/iac_dataset/modeling_data.csv', index=False)