In [None]:
import pandas as pd
import numpy as np

# Data-quality report for the movies CSV: loads the file and prints counts of
# missing, duplicated, and out-of-range values, followed by the language and
# release-year distributions. Adds a 'release_year' column to df as a side
# effect.
df = pd.read_csv('../movies.csv')

print(f"Total records: {len(df)}")
print("\n--- Missing Value Analysis ---")
# Per-column count of null cells.
print(df.isnull().sum())

missing_overview = df[df['overview'].isnull()]
print(f"\nMovies with missing overview: {len(missing_overview)}")

print("\n--- Outlier Analysis for vote_count ---")
print(df['vote_count'].describe())
# Rows with fewer than 100 votes; null vote counts compare False and are
# therefore not included here.
low_vote_count = df[df['vote_count'] < 100]
print(f"Movies with < 100 votes: {len(low_vote_count)}")

print("\n--- Duplicates Analysis ---")
# duplicated() marks every repeat of an id after its first occurrence, so
# this is the number of surplus rows, not the number of distinct repeated ids.
print(f"Number of duplicate IDs: {df.duplicated(subset=['id']).sum()}")

print("\n--- Invalid Data Checks ---")
# A runtime is treated as invalid when it is missing, zero, or negative.
invalid_runtime = df[(df['runtime'] <= 0) | (df['runtime'].isnull())]
print(f"Movies with invalid runtime: {len(invalid_runtime)}")

invalid_release_date = df[df['release_date'].isnull()]
print(f"Movies with missing release date: {len(invalid_release_date)}")

# Parse release_date once and reuse the result for both the future-date check
# and the yearly distribution. errors='coerce' turns unparseable values into
# NaT instead of raising.
release_dates = pd.to_datetime(df['release_date'], errors='coerce')

# NaT compares False, so missing/unparseable dates are not counted as future.
future_dates = df[release_dates > pd.Timestamp.today()]
print(f"Movies with future release dates: {len(future_dates)}")

invalid_popularity = df[df['popularity'] < 0]
print(f"Movies with negative popularity: {len(invalid_popularity)}")

print("\n--- Rating Analysis ---")
print(df['vote_average'].describe())
# Ratings are checked against the closed range [0, 10].
invalid_rating = df[(df['vote_average'] < 0) | (df['vote_average'] > 10)]
print(f"Movies with invalid rating values: {len(invalid_rating)}")

print("\n--- Genre Analysis ---")
missing_genre = df[df['genres'].isnull()]
print(f"Movies with missing genres: {len(missing_genre)}")

print("\n--- Language Distribution ---")
print(df['original_language'].value_counts())

print("\n--- Yearly Release Distribution ---")
# Year is NaN where the date was missing or unparseable; value_counts()
# drops those rows by default.
df['release_year'] = release_dates.dt.year
print(df['release_year'].value_counts().sort_index())
