# Handle missing data for Credits (Order 4 - Optional)

In [1]:
import pandas as pd
import ast

df = pd.read_csv('../../datasets/input/credits.csv')
data = df.copy()

print(f"Dataset loaded: {len(data)} rows, {len(data.columns)} columns")
print(f"Columns: {list(data.columns)}")

Dataset loaded: 45476 rows, 3 columns
Columns: ['cast', 'crew', 'id']


# Credits Missing Data Analysis

In [2]:
print("Columns with missing data:\n")

missing_data = []
for column in data.columns:
    missing_count = data[column].isnull().sum()
    if missing_count > 0:
        missing_pct = (missing_count / len(data)) * 100
        missing_data.append({
            'Column': column,
            'Missing Count': missing_count,
            'Missing %': f"{missing_pct:.2f}%"
        })
        print(f"{column:30} | {missing_count:6} missing ({missing_pct:5.2f}%)")

print(f"\nTotal columns: {len(data.columns)}")
print(f"Columns with missing data: {len(missing_data)}")
print(f"Columns without missing data: {len(data.columns) - len(missing_data)}")

Columns with missing data:


Total columns: 3
Columns with missing data: 0
Columns without missing data: 3


# Step 1: Drop rows with missing id

In [3]:
# Drop rows with missing id (cannot merge without this field)
rows_before = len(data)

data = data.dropna(subset=['id'])

rows_dropped = rows_before - len(data)
print(f"Rows dropped due to missing id: {rows_dropped}")
print(f"Remaining rows: {len(data)}")
print(f"Percentage retained: {(len(data)/rows_before)*100:.2f}%")

# Convert id to integer for proper merging
data['id'] = data['id'].astype(int)

Rows dropped due to missing id: 0
Remaining rows: 45476
Percentage retained: 100.00%


# Step 2: Parse JSON columns (cast and crew)

In [4]:
def parse_cast(cast_str, top_n=5):
    """
    Parse cast JSON string and extract top N actor names.
    Returns list of actor names ordered by billing order.
    """
    try:
        if pd.isna(cast_str):
            return []
        cast_list = ast.literal_eval(cast_str)
        # Sort by 'order' field and take top N
        sorted_cast = sorted(cast_list, key=lambda x: x.get('order', 999))
        return [actor['name'] for actor in sorted_cast[:top_n] if 'name' in actor]
    except:
        return []

def parse_crew_directors(crew_str):
    """
    Parse crew JSON string and extract director names.
    Returns list of directors.
    """
    try:
        if pd.isna(crew_str):
            return []
        crew_list = ast.literal_eval(crew_str)
        # Extract only directors
        directors = [member['name'] for member in crew_list 
                    if member.get('job') == 'Director' and 'name' in member]
        return directors
    except:
        return []

# Apply parsing functions
print("Parsing cast (extracting top 5 actors)...")
data['cast_list'] = data['cast'].apply(parse_cast)

print("Parsing crew (extracting directors)...")
data['director_list'] = data['crew'].apply(parse_crew_directors)

print("\n✓ JSON columns parsed successfully")
print("\nExample results:")
print(data[['id', 'cast_list', 'director_list']].head(3))

Parsing cast (extracting top 5 actors)...
Parsing crew (extracting directors)...

✓ JSON columns parsed successfully

Example results:
      id                                          cast_list    director_list
0    862  [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...  [John Lasseter]
1   8844  [Robin Williams, Jonathan Hyde, Kirsten Dunst,...   [Joe Johnston]
2  15602  [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...  [Howard Deutch]


# Step 3: Verify id exists in cleaned_movies_metadata

In [5]:
# Load cleaned movies metadata to verify ids
movies_metadata = pd.read_csv('../../datasets/output/cleaned_datasets/cleaned_movies_metadata.csv')
valid_movie_ids = set(movies_metadata['id'].unique())

print(f"Valid movie IDs in metadata: {len(valid_movie_ids)}")

# Keep only credits for movies that exist in metadata
rows_before_validation = len(data)
data = data[data['id'].isin(valid_movie_ids)]

rows_dropped = rows_before_validation - len(data)
print(f"Rows dropped (id not in movies_metadata): {rows_dropped}")
print(f"Remaining rows: {len(data)}")
print(f"Coverage: {(len(data)/len(valid_movie_ids))*100:.2f}% of movies have credits data")

Valid movie IDs in metadata: 45430
Rows dropped (id not in movies_metadata): 3
Remaining rows: 45473
Coverage: 100.09% of movies have credits data


# Step 4: Save cleaned credits dataset

In [6]:
# Select relevant columns
columns_to_keep = ['id', 'cast_list', 'director_list']
cleaned_data = data[columns_to_keep].copy()

# Save to CSV
output_path = '../../datasets/output/cleaned_datasets/cleaned_credits.csv'
cleaned_data.to_csv(output_path, index=False)

print(f"✓ Cleaned credits dataset saved to: {output_path}")
print(f"Final dataset: {len(cleaned_data)} rows, {len(cleaned_data.columns)} columns")
print(f"\nColumn summary:")
print(f"  - id: Movie identifier (tmdbId)")
print(f"  - cast_list: Top 5 actors")
print(f"  - director_list: Director(s)")
print(f"\nSample data:")
print(cleaned_data.head())

✓ Cleaned credits dataset saved to: ../../datasets/output/cleaned_datasets/cleaned_credits.csv
Final dataset: 45473 rows, 3 columns

Column summary:
  - id: Movie identifier (tmdbId)
  - cast_list: Top 5 actors
  - director_list: Director(s)

Sample data:
      id                                          cast_list      director_list
0    862  [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...    [John Lasseter]
1   8844  [Robin Williams, Jonathan Hyde, Kirsten Dunst,...     [Joe Johnston]
2  15602  [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...    [Howard Deutch]
3  31357  [Whitney Houston, Angela Bassett, Loretta Devi...  [Forest Whitaker]
4  11862  [Steve Martin, Diane Keaton, Martin Short, Kim...    [Charles Shyer]
