In [2]:
# Colab-specific step: ask the user to pick a file from their machine. In the
# recorded run the selection was saved as movie_metadata.csv, which is the
# file name the dataset-loading cell reads.
# NOTE(review): google.colab is presumably only importable inside Colab —
# confirm before running this notebook in another environment.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; nothing visible here reads it.
uploaded = files.upload()

Saving movie_metadata.csv to movie_metadata.csv


In [3]:
"""
WEEK 2: KNOW YOUR DATA
Loading data, checking structure, and generating first impression report
"""

import warnings

import numpy as np
import pandas as pd

# Suppress every warning raised from this point on (installed after the
# library imports, so import-time warnings are unaffected).
# NOTE(review): a blanket 'ignore' also hides deprecation notices from
# pandas/numpy — consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')


# 1. Load Dataset
# Read the CSV from the current working directory into `df`, the frame that
# the remaining cells inspect.
df = pd.read_csv(filepath_or_buffer='movie_metadata.csv')
print("Data loaded successfully\n")



Data loaded successfully



In [4]:
# 2. Basic Overview
# Report the frame's dimensions and show its first rows.

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())


# 3. Data Types
# List each column's dtype, then split the columns into two groups that are
# reused by the final report.

print("\nColumn Data Types:")
print(df.dtypes)

# Numeric columns are everything numpy treats as a number; "categorical" is
# defined as the complement. Selecting the complement (rather than only
# 'object' dtype) keeps text columns in the categorical list even when pandas
# stores them with a dedicated string dtype instead of object, and guarantees
# len(numeric_cols) + len(categorical_cols) == df.shape[1]. For a frame with
# only numeric and object columns the result is identical to include=['object'].
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"\nNumeric columns ({len(numeric_cols)}):")
print(numeric_cols)

print(f"\nCategorical columns ({len(categorical_cols)}):")
print(categorical_cols)



Dataset Shape: (5043, 28)

First 5 rows:
   color      director_name  num_critic_for_reviews  duration  \
0  Color      James Cameron                   723.0     178.0   
1  Color     Gore Verbinski                   302.0     169.0   
2  Color         Sam Mendes                   602.0     148.0   
3  Color  Christopher Nolan                   813.0     164.0   
4    NaN        Doug Walker                     NaN       NaN   

   director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                      0.0                   855.0  Joel David Moore   
1                    563.0                  1000.0     Orlando Bloom   
2                      0.0                   161.0      Rory Kinnear   
3                  22000.0                 23000.0    Christian Bale   
4                    131.0                     NaN        Rob Walker   

   actor_1_facebook_likes        gross                           genres  ...  \
0                  1000.0  760505847.0  Action|Adventur

In [5]:
# 4. Missing Values
# One row per column of `df`: the number of null entries and that number as a
# percentage of all rows (rounded to 2 decimals). Columns without any nulls
# are dropped and the rest are ordered from most to fewest missing entries.

missing_data = (
    df.isnull().sum()
    .to_frame(name='Missing_Count')
    .assign(
        Missing_Percentage=lambda counts: (
            counts['Missing_Count'] / len(df) * 100
        ).round(2)
    )
    .loc[lambda counts: counts['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

print("\nMissing Values Summary:")
print(missing_data)

# Grand total of null cells across the whole frame.
print("\nTotal Missing Values:", df.isnull().sum().sum())



Missing Values Summary:
                         Missing_Count  Missing_Percentage
gross                              884               17.53
budget                             492                9.76
aspect_ratio                       329                6.52
content_rating                     303                6.01
plot_keywords                      153                3.03
title_year                         108                2.14
director_name                      104                2.06
director_facebook_likes            104                2.06
num_critic_for_reviews              50                0.99
actor_3_name                        23                0.46
actor_3_facebook_likes              23                0.46
num_user_for_reviews                21                0.42
color                               19                0.38
duration                            15                0.30
language                            14                0.28
actor_2_name                   

In [6]:
# 5. Statistical Summary
# Print the heading and the describe() table (called with its defaults) in a
# single call; sep="\n" puts the table on the line after the heading.

print("\nStatistical Summary (Numeric Columns):", df.describe(), sep="\n")




Statistical Summary (Numeric Columns):
       num_critic_for_reviews     duration  director_facebook_likes  \
count             4993.000000  5028.000000              4939.000000   
mean               140.194272   107.201074               686.509212   
std                121.601675    25.197441              2813.328607   
min                  1.000000     7.000000                 0.000000   
25%                 50.000000    93.000000                 7.000000   
50%                110.000000   103.000000                49.000000   
75%                195.000000   118.000000               194.500000   
max                813.000000   511.000000             23000.000000   

       actor_3_facebook_likes  actor_1_facebook_likes         gross  \
count             5020.000000             5036.000000  4.159000e+03   
mean               645.009761             6560.047061  4.846841e+07   
std               1665.041728            15020.759120  6.845299e+07   
min                  0.000000       

In [7]:
# 6. Duplicate Rows
# Count the rows that DataFrame.duplicated() flags as repeats.

duplicates = df.duplicated().sum()
print("\nDuplicate Rows:", duplicates)


# 7. First Impression Report
# Headline figures collected from the earlier steps, printed one per line as
# "<label> <value>".

print("\nFIRST IMPRESSION REPORT")
for label, value in (
    ("Total Records:", df.shape[0]),
    ("Total Features:", df.shape[1]),
    ("Numeric Features:", len(numeric_cols)),
    ("Categorical Features:", len(categorical_cols)),
    ("Columns with Missing Values:", len(missing_data)),
):
    print(label, value)
print("Ready for cleaning and further analysis.")




Duplicate Rows: 45

FIRST IMPRESSION REPORT
Total Records: 5043
Total Features: 28
Numeric Features: 16
Categorical Features: 12
Columns with Missing Values: 21
Ready for cleaning and further analysis.
