Movie Recommendation Project by "Hello World"

Dataset Exploration Part

In [1]:
#import necessary packages
import pandas as pd
import numpy as np
import ast

In [2]:
#import dataset
path = r"C:\Users\19493\Downloads\tmdb_5000_credits.csv\tmdb_5000_credits.csv"
df = pd.read_csv(path)

In [3]:
#show data
df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
#see missing values
df.isnull().sum()

movie_id    0
title       0
cast        0
crew        0
dtype: int64

In [5]:
#cut the very long columns into parts
def parse_json_column(column_data):
    parsed_data = []
    for item in column_data:
        try:
            parsed_data.append(ast.literal_eval(item))
        except (SyntaxError, ValueError):
            parsed_data.append([])
    return parsed_data

df['cast_parsed'] = parse_json_column(df['cast'])
df['crew_parsed'] = parse_json_column(df['crew'])


In [6]:
#extract useful information from column "cast"
def extract_top_cast_details(cast_list):
    cast_names = []
    cast_characters = []
    male_count = 0
    female_count = 0
    unknown_count = 0

    """
    The elements that I did not extract are: 
    cast_id, credit_id, id, order.
    I don't think these elements could affect users' movie selection.
    
    The elements that I transfered to columns are:
    top 5 casts, top 5 characters and the number of each gender.
    I think these are useful and may be selected to model's features.
    
    """
    for member in cast_list[:5]:  
        name = member.get('name', '')
        character = member.get('character', '')
        gender = member.get('gender', 0)

        if gender == 1:
            female_count += 1
        elif gender == 2:
            male_count += 1
        else:
            unknown_count += 1

        cast_names.append(name)
        cast_characters.append(character)

    return pd.Series([
        ', '.join(cast_names),   
        ', '.join(cast_characters),  
        male_count,              
        female_count,            
        unknown_count            
    ])


df[['top_5_cast', 'top_5_characters', 'num_males', 'num_females', 'num_unknown']] = df['cast_parsed'].apply(extract_top_cast_details)
df.head()

Unnamed: 0,movie_id,title,cast,crew,cast_parsed,crew_parsed,top_5_cast,top_5_characters,num_males,num_females,num_unknown
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...","Sam Worthington, Zoe Saldana, Sigourney Weaver...","Jake Sully, Neytiri, Dr. Grace Augustine, Col....",2,3,0
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","Johnny Depp, Orlando Bloom, Keira Knightley, S...","Captain Jack Sparrow, Will Turner, Elizabeth S...",4,1,0
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...","Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...","James Bond, Blofeld, Madeleine, M, Lucia",3,2,0
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","Christian Bale, Michael Caine, Gary Oldman, An...","Bruce Wayne / Batman, Alfred Pennyworth, James...",4,1,0
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...","Taylor Kitsch, Lynn Collins, Samantha Morton, ...","John Carter, Dejah Thoris, Sola, Tars Tarkas, ...",3,2,0


In [7]:
#extract useful information from column "crew"
def extract_crew_details(crew_list):
    director = ''
    producer = []
    writer = []
    composer = ''
    editor = ''
    cinematographer = ''

    """
    This column lists too many crews and I extract some roles: 
    Director, Produer,  Music composer, Editor, cinematographer.
    
    """
    for member in crew_list:
        job = member.get('job', '')
        name = member.get('name', '')

        if job == 'Director':
            director = name
        elif job == 'Producer':
            producer.append(name)
        elif job == 'Original Music Composer':
            composer = name
        elif job == 'Editor':
            editor = name
        elif job == 'Director of Photography':
            cinematographer = name

    return pd.Series([
        director,                        
        ', '.join(producer),                            
        composer,                         
        editor,                           
        cinematographer                   
    ])
df[['director', 'producers',  'composer', 'editor', 'cinematographer']] = df['crew_parsed'].apply(extract_crew_details)


In [8]:
#see the data
df.head()

Unnamed: 0,movie_id,title,cast,crew,cast_parsed,crew_parsed,top_5_cast,top_5_characters,num_males,num_females,num_unknown,director,producers,composer,editor,cinematographer
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...","Sam Worthington, Zoe Saldana, Sigourney Weaver...","Jake Sully, Neytiri, Dr. Grace Augustine, Col....",2,3,0,James Cameron,"James Cameron, Jon Landau",James Horner,John Refoua,Chiling Lin
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","Johnny Depp, Orlando Bloom, Keira Knightley, S...","Captain Jack Sparrow, Will Turner, Elizabeth S...",4,1,0,Gore Verbinski,"Jerry Bruckheimer, Eric McLeod, Chad Oman, Pet...",Hans Zimmer,Craig Wood,Dariusz Wolski
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...","Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...","James Bond, Blofeld, Madeleine, M, Lucia",3,2,0,Sam Mendes,"Barbara Broccoli, Michael G. Wilson",Thomas Newman,Lee Smith,Hoyte van Hoytema
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","Christian Bale, Michael Caine, Gary Oldman, An...","Bruce Wayne / Batman, Alfred Pennyworth, James...",4,1,0,Christopher Nolan,"Charles Roven, Christopher Nolan, Emma Thomas",Hans Zimmer,Lee Smith,Wally Pfister
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...","Taylor Kitsch, Lynn Collins, Samantha Morton, ...","John Carter, Dejah Thoris, Sola, Tars Tarkas, ...",3,2,0,Andrew Stanton,"Colin Wilson, Jim Morris, Lindsey Collins",,Eric Zumbrunnen,Daniel Mindel


In [9]:
#see missing values
df.isnull().sum()

movie_id            0
title               0
cast                0
crew                0
cast_parsed         0
crew_parsed         0
top_5_cast          0
top_5_characters    0
num_males           0
num_females         0
num_unknown         0
director            0
producers           0
composer            0
editor              0
cinematographer     0
dtype: int64

In [10]:
#check duplications
hashable_columns = [col for col in df.columns if df[col].apply(type).eq(str).all() or df[col].apply(type).eq(int).all()]
num_duplicates = df.duplicated(subset=hashable_columns).sum()
duplicate_rows = df[df.duplicated(subset=hashable_columns)]
print(duplicate_rows)

Empty DataFrame
Columns: [movie_id, title, cast, crew, cast_parsed, crew_parsed, top_5_cast, top_5_characters, num_males, num_females, num_unknown, director, producers, composer, editor, cinematographer]
Index: []


Data preprocessing is done.