# Handle missing data for Keywords (Order 4 - Optional)

In [1]:
import pandas as pd
import ast

df = pd.read_csv('../../datasets/input/keywords.csv')
data = df.copy()

print(f"Dataset loaded: {len(data)} rows, {len(data.columns)} columns")
print(f"Columns: {list(data.columns)}")

Dataset loaded: 46419 rows, 2 columns
Columns: ['id', 'keywords']


# Keywords Missing Data Analysis

In [2]:
print("Columns with missing data:\n")

missing_data = []
for column in data.columns:
    missing_count = data[column].isnull().sum()
    if missing_count > 0:
        missing_pct = (missing_count / len(data)) * 100
        missing_data.append({
            'Column': column,
            'Missing Count': missing_count,
            'Missing %': f"{missing_pct:.2f}%"
        })
        print(f"{column:30} | {missing_count:6} missing ({missing_pct:5.2f}%)")

print(f"\nTotal columns: {len(data.columns)}")
print(f"Columns with missing data: {len(missing_data)}")
print(f"Columns without missing data: {len(data.columns) - len(missing_data)}")

Columns with missing data:


Total columns: 2
Columns with missing data: 0
Columns without missing data: 2


# Step 1: Drop rows with missing id

In [3]:
# Drop rows with missing id (cannot merge without this field)
rows_before = len(data)

data = data.dropna(subset=['id'])

rows_dropped = rows_before - len(data)
print(f"Rows dropped due to missing id: {rows_dropped}")
print(f"Remaining rows: {len(data)}")
print(f"Percentage retained: {(len(data)/rows_before)*100:.2f}%")

# Convert id to integer for proper merging
data['id'] = data['id'].astype(int)

Rows dropped due to missing id: 0
Remaining rows: 46419
Percentage retained: 100.00%


# Step 2: Parse JSON column (keywords)

In [4]:
def parse_keywords(keywords_str):
    """
    Parse keywords JSON string and extract keyword names.
    Returns list of keyword names.
    """
    try:
        if pd.isna(keywords_str):
            return []
        keywords_list = ast.literal_eval(keywords_str)
        return [keyword['name'] for keyword in keywords_list if 'name' in keyword]
    except:
        return []

# Apply parsing function
print("Parsing keywords...")
data['keywords_list'] = data['keywords'].apply(parse_keywords)

print("\n✓ JSON column parsed successfully")
print("\nExample results:")
print(data[['id', 'keywords_list']].head(5))

# Show statistics
keyword_counts = data['keywords_list'].apply(len)
print(f"\nKeyword statistics:")
print(f"  - Average keywords per movie: {keyword_counts.mean():.2f}")
print(f"  - Median keywords per movie: {keyword_counts.median():.0f}")
print(f"  - Movies with no keywords: {(keyword_counts == 0).sum()}")
print(f"  - Movies with keywords: {(keyword_counts > 0).sum()}")

Parsing keywords...

✓ JSON column parsed successfully

Example results:
      id                                      keywords_list
0    862  [jealousy, toy, boy, friendship, friends, riva...
1   8844  [board game, disappearance, based on children'...
2  15602  [fishing, best friend, duringcreditsstinger, o...
3  31357  [based on novel, interracial relationship, sin...
4  11862  [baby, midlife crisis, confidence, aging, daug...

Keyword statistics:
  - Average keywords per movie: 3.42
  - Median keywords per movie: 2
  - Movies with no keywords: 14795
  - Movies with keywords: 31624


# Step 3: Verify id exists in cleaned_movies_metadata

In [5]:
# Load cleaned movies metadata to verify ids
movies_metadata = pd.read_csv('../../datasets/output/cleaned_datasets/cleaned_movies_metadata.csv')
valid_movie_ids = set(movies_metadata['id'].unique())

print(f"Valid movie IDs in metadata: {len(valid_movie_ids)}")

# Keep only keywords for movies that exist in metadata
rows_before_validation = len(data)
data = data[data['id'].isin(valid_movie_ids)]

rows_dropped = rows_before_validation - len(data)
print(f"Rows dropped (id not in movies_metadata): {rows_dropped}")
print(f"Remaining rows: {len(data)}")
print(f"Coverage: {(len(data)/len(valid_movie_ids))*100:.2f}% of movies have keywords data")

Valid movie IDs in metadata: 45430
Rows dropped (id not in movies_metadata): 4
Remaining rows: 46415
Coverage: 102.17% of movies have keywords data


# Step 4: Save cleaned keywords dataset

In [6]:
# Select relevant columns
columns_to_keep = ['id', 'keywords_list']
cleaned_data = data[columns_to_keep].copy()

# Save to CSV
output_path = '../../datasets/output/cleaned_datasets/cleaned_keywords.csv'
cleaned_data.to_csv(output_path, index=False)

print(f"✓ Cleaned keywords dataset saved to: {output_path}")
print(f"Final dataset: {len(cleaned_data)} rows, {len(cleaned_data.columns)} columns")
print(f"\nColumn summary:")
print(f"  - id: Movie identifier (tmdbId)")
print(f"  - keywords_list: List of movie keywords/tags")
print(f"\nSample data:")
print(cleaned_data.head())

# Show some interesting keyword examples
print(f"\nExamples of movies with many keywords:")
data_with_counts = cleaned_data.copy()
data_with_counts['keyword_count'] = data_with_counts['keywords_list'].apply(len)
top_keywords = data_with_counts.nlargest(3, 'keyword_count')
for idx, row in top_keywords.iterrows():
    print(f"  Movie ID {row['id']}: {row['keyword_count']} keywords")

✓ Cleaned keywords dataset saved to: ../../datasets/output/cleaned_datasets/cleaned_keywords.csv
Final dataset: 46415 rows, 2 columns

Column summary:
  - id: Movie identifier (tmdbId)
  - keywords_list: List of movie keywords/tags

Sample data:
      id                                      keywords_list
0    862  [jealousy, toy, boy, friendship, friends, riva...
1   8844  [board game, disappearance, based on children'...
2  15602  [fishing, best friend, duringcreditsstinger, o...
3  31357  [based on novel, interracial relationship, sin...
4  11862  [baby, midlife crisis, confidence, aging, daug...

Examples of movies with many keywords:
  Movie ID 23160: 149 keywords
  Movie ID 117483: 113 keywords
  Movie ID 26390: 97 keywords
