In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from select_features import FeatureSelectionHandler, SelectFeatures
from handle_missing_values import MissingValueHandler, FillMissingValues

In [3]:
# Load the raw TMDB movie dataset (path is relative to the notebook's directory).
raw_csv_path = "../data/raw/TMDB_movie_dataset_v11.csv"
df = pd.read_csv(raw_csv_path)

In [4]:
# Inspect the available column names before choosing which features to keep.
df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [5]:
# Subset of the raw dataset's columns to keep (passed to the feature selector).
features = [
    'vote_average',
    'vote_count',
    'revenue',
    'runtime',
    'budget',
    'popularity',
    'genres',
]

In [6]:
# Build the selection handler around a SelectFeatures strategy instance.
selection_strategy = SelectFeatures()
feature_selector = FeatureSelectionHandler(selection_strategy)

In [7]:
# Reduce the frame to the columns listed in `features`.
# With save=True the handler also writes the result to disk under the given
# name (see the log output below for the destination path).
# NOTE: this rebinds `df`, so the full raw frame is no longer available afterwards.
df = feature_selector.execute_selection(
    df,
    features,
    save=True,
    name="df_features_selected",
)

2025-08-11 16:33:00,389 - INFO - Executing selection with strategy.
2025-08-11 16:33:00,390 - INFO - Selecting 7 features: ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity', 'genres']
2025-08-11 16:33:00,419 - INFO - Saving selected features to: ../data/interm/df_features_selected.csv


# Missing values

In [8]:
# Count the missing values in each column.
df.isna().sum()

vote_average         0
vote_count           0
revenue              0
runtime              0
budget               0
popularity           0
genres          532576
dtype: int64

The `genres` feature has many null values (532,576 rows). We replace them with the placeholder string 'none' so that the entire column can be processed later.

In [9]:
# Configure a fill strategy that substitutes the constant string 'none' for
# missing entries, then wrap it in the missing-value handler.
constant_fill = FillMissingValues(method='constant', fill_value='none')
missing_handler = MissingValueHandler(constant_fill)

In [10]:
# Apply the configured fill strategy to the frame.
# With save=True the handler also writes the result to disk under the given
# name (see the log output below for the destination path).
df = missing_handler.handle_missing_values(
    df,
    save=True,
    name="df_not_NA",
)

2025-08-11 16:33:54,222 - INFO - Executing missing value handling strategy.
2025-08-11 16:33:54,223 - INFO - Filling missing values using method: constant
2025-08-11 16:33:54,303 - INFO - Missing values filled.
2025-08-11 16:33:54,304 - INFO - Saving DataFrame without missing values to: ../data/interm/df_not_NA.csv
