In [1]:
import numpy as np
import pandas as pd
import os
import ast

In [7]:
# transform datasets to dfs
def get_datasets_as_dfs(directory: str) -> dict:
    """Load every CSV file found in ``directory`` into a pandas DataFrame.

    Only regular files whose extension is ``.csv`` (case-insensitive) are read.
    Sub-directories (for example ``.ipynb_checkpoints``) and files with any
    other extension are skipped, because handing them to ``pd.read_csv`` would
    raise. Entries are visited in sorted order so the key order of the result
    is the same on every run.

    Args:
        directory: Path of the folder that holds the dataset files.

    Returns:
        ``{dataset_name: dataframe}`` where ``dataset_name`` is the file name
        with its extension removed.

    Raises:
        FileNotFoundError: If ``directory`` does not exist (raised by ``os.listdir``).
    """
    dataframes = {}

    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)       # directory/filename
        df_name, extension = os.path.splitext(filename)    # ('name', '.csv')

        # Skip folders and anything that is not a CSV file.
        if not os.path.isfile(filepath) or extension.lower() != '.csv':
            continue

        dataframes[df_name] = pd.read_csv(filepath)

    return dataframes

# Load the datasets stored in the local 'data' folder, keyed by file name without extension.
dataframes = get_datasets_as_dfs(directory='data')

In [30]:
# Merge the credits and movies tables on a column they have in common.
credits_raw = dataframes['tmdb_5000_credits']
movies_raw = dataframes['tmdb_5000_movies']

# Columns present in both frames are the candidate merge keys.
common_columns = set(credits_raw.columns).intersection(movies_raw.columns)
if not common_columns:
    raise ValueError("'tmdb_5000_credits' and 'tmdb_5000_movies' share no column to merge on")

# A set has no stable order, so list(common_columns)[0] could pick a different
# key between runs when several columns are shared. Choosing the first shared
# column in the credits frame's column order makes the choice reproducible.
merge_key = next(col for col in credits_raw.columns if col in common_columns)

# Inner join (the pandas default): a key value that repeats in either frame is
# paired with every match, so the result may have more rows than either input.
movies = credits_raw.merge(movies_raw, on=merge_key)

In [31]:
# Auxiliary functions
def extract_genres_keywords(listOf_Dicts):
    """Return the ``'name'`` value of every entry in a stringified list of dicts.

    Intended for columns such as ``genres`` and ``keywords``, whose cells hold
    text like ``'[{"id": 28, "name": "Action"}]'``.

    Args:
        listOf_Dicts: String containing a literal list of dictionaries, each
            with a ``'name'`` key.

    Returns:
        list: The ``'name'`` values in their original order (``[]`` for ``'[]'``).

    Raises:
        ValueError, SyntaxError: If the string is not a valid Python literal.
        KeyError: If an entry has no ``'name'`` key.
    """
    # ast.literal_eval accepts literals only, so unlike eval() it cannot run
    # code that happens to be embedded in the dataset text.
    entries = ast.literal_eval(listOf_Dicts)
    return [entry['name'] for entry in entries]

def extract_cast(listOf_Dicts, num_actors=3):
    """Return the names of the first ``num_actors`` entries of a stringified cast list.

    Args:
        listOf_Dicts: String containing a literal list of dictionaries, each
            with a ``'name'`` key, e.g. ``'[{"cast_id": 1, "name": "A"}]'``.
        num_actors: Maximum number of names to return (default 3).

    Returns:
        list: Up to ``num_actors`` names, in the order they appear in the
        input. Shorter when the movie lists fewer cast members, and ``[]``
        for an empty cast list.

    Raises:
        ValueError, SyntaxError: If the string is not a valid Python literal.
        KeyError: If a selected entry has no ``'name'`` key.
    """
    # Parse once (the text was previously re-evaluated on every iteration) and
    # with literal_eval, which cannot execute code embedded in the data.
    cast_entries = ast.literal_eval(listOf_Dicts)

    # Slicing never raises IndexError, so short cast lists are handled too.
    return [member['name'] for member in cast_entries[:num_actors]]

In [68]:
# Cleaning plan for the merged `movies` frame:
#   1. keep only the necessary columns:
#      'movie_id', 'title', 'cast', 'crew', 'genres', 'overview', 'keywords'
#   2. count the missing values per column; if there are few, drop those rows
#   3. drop duplicate rows
#   4. turn the genres column into lists of genre names
#   5. turn the keywords column into lists of keyword names
#   6. reduce the cast column to the leading actor names
#
# Only step 1 is active in this cell; steps 2-6 are kept below as disabled code.
# NOTE(review): the output saved with this cell already shows genres/keywords as
# lists, which this cell as written does not produce from a fresh kernel --
# confirm whether steps 2-6 should be re-enabled.

# Step 1: select the columns; .copy() makes the result an independent frame.
movies = movies.loc[:, ['movie_id', 'title', 'cast', 'crew', 'genres', 'overview', 'keywords']].copy()

# Step 2 (disabled)
# movies.isnull().sum()    # checking missing values = 3 in overview column
# movies.dropna(inplace=True)         # dropping rows with missing values
# Step 3 (disabled)
# movies.drop_duplicates(inplace=True)
# Step 4 (disabled)
# movies['genres'] = movies['genres'].apply(extract_genres_keywords)
# Step 5 (disabled)
# movies['keywords'] = movies['keywords'].apply(extract_genres_keywords)
# Step 6 (disabled)
# movies['cast'] = movies['cast'].apply(extract_cast)

# Preview the first rows (last expression of the cell, so it is displayed).
movies.head()


Unnamed: 0,movie_id,title,cast,crew,genres,overview,keywords
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,"[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,"[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...","[based on novel, mars, medallion, space travel..."


In [28]:
# Inspect the keywords of the row whose index label is 1.
movies.loc[1, 'keywords']

['ocean',
 'drug abuse',
 'exotic island',
 'east india trading company',
 "love of one's life",
 'traitor',
 'shipwreck',
 'strong woman',
 'ship',
 'alliance',
 'calypso',
 'afterlife',
 'fighter',
 'pirate',
 'swashbuckler',
 'aftercreditsstinger']