In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from data_cleaning import CleanHandler

In [3]:
# Load the raw TMDB movie dataset (v11 CSV) from the raw-data folder.
df = pd.read_csv(filepath_or_buffer="../data/raw/TMDB_movie_dataset_v11.csv")

In [4]:
# Inspect the available column names before choosing which features to keep.
df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [5]:
# Columns retained for the rest of the notebook; order is preserved as listed.
features = [
    'vote_average',
    'vote_count',
    'revenue',
    'runtime',
    'budget',
    'popularity',
    'genres',
]

In [6]:
# Keep only the columns named in `features`. With save=True the helper also
# writes the result to disk (the logged run shows
# ../data/interm/df_features_selected.csv).
df = CleanHandler.select_features(
    df,
    features,
    save=True,
    name='df_features_selected',
)

2025-08-11 17:18:34,103 - INFO - Selecting 7 features: ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity', 'genres']
2025-08-11 17:18:34,127 - INFO - Saving DataFrame to: ../data/interm/df_features_selected.csv


# Missing values

In [7]:
# Count missing entries per column (isna is the pandas alias of isnull).
df.isna().sum()

vote_average         0
vote_count           0
revenue              0
runtime              0
budget               0
popularity           0
genres          532576
dtype: int64

The `genres` column is the only one with missing values (532,576 rows). We replace them with the placeholder string `'none'` so that the entire column can be processed later.

In [8]:
# Fill missing entries with the constant placeholder 'none'.
# NOTE(review): presumably the fill applies to every column with nulls; per
# the null counts in this notebook only `genres` has any -- confirm against
# CleanHandler. save=True also writes the result to disk (the logged run shows
# ../data/interm/df_not_NA.csv).
df = CleanHandler.clean_missing_values(
    df,
    method='constant',
    fill_value='none',
    save=True,
    name='df_not_NA',
)

2025-08-11 17:18:36,093 - INFO - Starting missing value cleaning with method: 'constant'
2025-08-11 17:18:36,174 - INFO - Missing value cleaning process completed.
2025-08-11 17:18:36,175 - INFO - Saving DataFrame to: ../data/interm/df_not_NA.csv


In [9]:
# Drop rows with invalid numeric values (wording from the helper's log line).
# The exact validity criteria live inside CleanHandler and are not visible
# here -- TODO confirm. save=True also writes the result to disk (the logged
# run shows ../data/interm/invalid_rows_remove.csv).
df = CleanHandler.remove_invalid_rows(
    df,
    save=True,
    name='invalid_rows_remove',
)

2025-08-11 17:18:37,968 - INFO - Removing rows with invalid numeric values.
2025-08-11 17:18:37,974 - INFO - Saving DataFrame to: ../data/interm/invalid_rows_remove.csv
