In [1]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-wide pandas display limits: show up to 120 columns and allow
# 140 characters per line before the repr wraps.
for option_name, option_value in (
    ("display.max_columns", 120),
    ("display.width", 140),
):
    pd.set_option(option_name, option_value)

In [None]:
# Location of the credits CSV, relative to the notebook's working directory.
CSV_PATH = Path("../data/credits.csv")

# low_memory=False asks pandas to parse the file in one pass instead of in
# internal chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(filepath_or_buffer=CSV_PATH, low_memory=False)

# Cell result: a (shape, first five rows) tuple.
(df.shape, df.head(5))

((45476, 3),
                                                 cast                                               crew     id
 0  [{'cast_id': 14, 'character': 'Woody (voice)',...  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...    862
 1  [{'cast_id': 1, 'character': 'Alan Parrish', '...  [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...   8844
 2  [{'cast_id': 2, 'character': 'Max Goldman', 'c...  [{'credit_id': '52fe466a9251416c75077a89', 'de...  15602
 3  [{'cast_id': 1, 'character': "Savannah 'Vannah...  [{'credit_id': '52fe44779251416c91011acb', 'de...  31357
 4  [{'cast_id': 1, 'character': 'George Banks', '...  [{'credit_id': '52fe44959251416c75039ed7', 'de...  11862)

In [3]:
print("Shape:", df.shape)

print("\nDtypes:")
dtype_table = df.dtypes.to_frame("dtype")
display(dtype_table)

# --- Regular missing values ---
# Per-column NaN count plus the same figure as a percentage of all rows,
# shown with the most incomplete columns first.
print("\nMissing values:")
na = df.isna().sum().rename("missing")
na_pct = (na / len(df) * 100).rename("missing_%").round(2)
missing_table = pd.concat([na, na_pct], axis=1)
display(missing_table.sort_values("missing_%", ascending=False))

# --- Zero counts in numeric columns ---
# An exact 0 in a numeric column is counted separately from NaN.
print("\nZero counts in numeric columns:")
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    print("No numeric columns.")
else:
    z = pd.Series(
        {name: int(df[name].eq(0).sum()) for name in num_cols},
        name="zero_count",
    )
    display(z.sort_values(ascending=False).to_frame())

# --- Additional: check for empty JSON-like structures (cast/crew) ---
def safe_parse(x):
    """Parse a Python-literal string (e.g. "[{'id': 1}]") into a Python object.

    Parameters
    ----------
    x : Any
        Raw cell value. Only non-blank ``str`` values are parsed.

    Returns
    -------
    object
        The result of ``ast.literal_eval(x)`` on success. An empty list when
        ``x`` is not a string (None, NaN, pd.NA, an already-parsed list, ...),
        is empty/whitespace-only, or is not a valid Python literal.
    """
    # Type-check first. Missing markers (None/NaN/pd.NA) are never str, so they
    # are rejected here too. Calling pd.isna() on a list/array would return an
    # array whose truth value is ambiguous and raise ValueError.
    if not isinstance(x, str) or not x.strip():
        return []
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions ast.literal_eval is documented to raise on
        # malformed input; anything else is a real bug and should surface.
        return []

# Columns that hold stringified lists of dicts; only those present are checked.
json_cols = [c for c in ['cast', 'crew'] if c in df.columns]
if json_cols:
    print("\nEmpty JSON structure checks:")
    results = []
    for col in json_cols:
        # Side effect: the parsed objects are stored on df as '<col>_parsed'.
        df[f'{col}_parsed'] = df[col].apply(safe_parse)
        null_count = df[col].isna().sum()
        blank_count = (df[col].astype(str).str.strip() == '').sum()
        empty_after_parse = (df[f'{col}_parsed'].apply(len) == 0).sum()
        total = len(df)
        results.append({
            'column': col,
            'null_or_blank': int(null_count + blank_count),
            'empty_after_parse': int(empty_after_parse),
            # safe_parse returns [] for null and blank cells, so those rows are
            # already included in empty_after_parse. Adding null_count and
            # blank_count on top would count them twice.
            'total_missing_%': round(empty_after_parse / total * 100, 2)
        })
    display(pd.DataFrame(results))


Shape: (45476, 3)

Dtypes:


Unnamed: 0,dtype
cast,object
crew,object
id,int64



Missing values:


Unnamed: 0,missing,missing_%
cast,0,0.0
crew,0,0.0
id,0,0.0



Zero counts in numeric columns:


Unnamed: 0,zero_count
id,0



Empty JSON structure checks:


Unnamed: 0,column,null_or_blank,empty_after_parse,total_missing_%
0,cast,0,2418,5.32
1,crew,0,771,1.7


In [5]:
import json
import numpy as np
import pandas as pd

def _json_default(obj):
    """Fallback encoder for values json.dumps cannot serialise natively."""
    if isinstance(obj, (set, frozenset)):
        # Sets are unordered; sort by repr so equal sets encode identically.
        return sorted(obj, key=repr)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    return repr(obj)


def _freeze_nested(value):
    """Recursively turn nested lists (from ndarray.tolist()) into nested tuples."""
    if isinstance(value, list):
        return tuple(_freeze_nested(item) for item in value)
    return value


def make_hashable(x):
    """Turn lists/dicts/sets/ndarrays into hashable forms for nunique/duplicated.

    Parameters
    ----------
    x : Any
        A single cell value.

    Returns
    -------
    object
        * list / dict -> JSON string with sorted keys (stable across key order)
        * set         -> frozenset with the same elements
        * np.ndarray  -> tuple for 1-D arrays, nested tuples for N-D arrays,
                         the bare scalar for 0-d arrays
        * anything else is returned unchanged
    """
    if isinstance(x, set):
        # json.dumps cannot encode a set; frozenset is hashable and compares
        # equal regardless of element order.
        return frozenset(x)
    if isinstance(x, (list, dict)):
        # default= keeps nested sets / numpy values from raising TypeError.
        return json.dumps(x, sort_keys=True, default=_json_default)
    if isinstance(x, np.ndarray):
        # tuple(x.tolist()) alone leaves inner lists (unhashable) for N-D arrays
        # and fails on 0-d arrays, where tolist() returns a scalar.
        return _freeze_nested(x.tolist())
    return x

# Build a hashable view of the frame once, on a copy so df itself is untouched.
# Object columns may hold lists/dicts, which nunique()/duplicated() cannot hash.
# Every check below reuses this view instead of re-mapping each column.
df_for_dups = df.copy()
for col in df_for_dups.select_dtypes(include=['object']).columns:
    df_for_dups[col] = df_for_dups[col].map(make_hashable)

print("Unique values per column (robust to list/dict values):")
# dropna=False counts NaN as its own distinct value.
unique_counts = {
    name: df_for_dups[name].nunique(dropna=False) for name in df_for_dups.columns
}

uc = pd.Series(unique_counts, name="n_unique").sort_values(ascending=False).to_frame()
display(uc)

# --- Check for fully duplicated rows ---
dup_rows = int(df_for_dups.duplicated().sum())
print(f"\nDuplicate rows (entire row identical): {dup_rows}")

# --- Check duplicates for specific important columns (id, cast, crew) ---
# duplicated() flags every repeat after the first occurrence.
if 'id' in df.columns:
    dup_ids = int(df_for_dups['id'].duplicated().sum())
    print(f"Duplicate IDs: {dup_ids}")

if 'cast' in df.columns:
    dup_cast = int(df_for_dups['cast'].duplicated().sum())
    print(f"Duplicate cast entries: {dup_cast}")

if 'crew' in df.columns:
    dup_crew = int(df_for_dups['crew'].duplicated().sum())
    print(f"Duplicate crew entries: {dup_crew}")

# --- Optional: check duplicates on any chosen key ---
KEY = None  # e.g., 'id'
if KEY and KEY in df.columns:
    # Use the hashable view: raw df[KEY].duplicated() raises TypeError when
    # KEY is a list-valued column.
    print(f"Duplicate {KEY}: {int(df_for_dups[KEY].duplicated().sum())}")

Unique values per column (robust to list/dict values):


Unnamed: 0,n_unique
id,45432
crew_parsed,44669
crew,44669
cast,43019
cast_parsed,43019



Duplicate rows (entire row identical): 37
Duplicate IDs: 44
Duplicate cast entries: 2457
Duplicate crew entries: 807
