In [1]:
import pandas as pd
import numpy as np

from sklearn.discriminant_analysis import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Random seed constant.
# NOTE(review): `seed` is not passed to any call in the cells shown here —
# confirm whether a later step is supposed to consume it.
seed = 0

In [2]:
# Load the cleaned movie data (pipe-separated CSV) into a DataFrame
df = pd.read_csv(
    'input/movie_data_tmbd_cleaned.csv',
    sep='|',
)

In [3]:
# Summarise the schema, then preview the first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" to the output).
df.info()
# Bare last expression so the notebook renders the preview as a rich table
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 8784 non-null   int64  
 1   budget                8784 non-null   int64  
 2   genres                8780 non-null   object 
 3   original_language     8784 non-null   object 
 4   overview              8763 non-null   object 
 5   popularity            8784 non-null   float64
 6   production_companies  8618 non-null   object 
 7   production_countries  8737 non-null   object 
 8   revenue               8784 non-null   int64  
 9   runtime               8784 non-null   float64
 10  spoken_languages      8768 non-null   object 
 11  status                8784 non-null   object 
 12  tagline               6912 non-null   object 
 13  title                 8784 non-null   object 
 14  video                 8784 non-null   int64  
 15  vote_average         

In [4]:
# Inspect the two flag columns; both will be dropped as they are not useful
for flag_column in ('video', 'adult'):
    print(df[flag_column].value_counts())

video
0    8778
1       6
Name: count, dtype: int64
adult
0    8784
Name: count, dtype: int64


In [5]:
def encode_column(column, min_apperances, data=None):
    """Keep only the frequently occurring items of a comma-separated list column.

    Each cell of ``column`` is treated as a comma-separated list. Items are
    stripped of surrounding whitespace and counted across all rows; only items
    occurring strictly more than ``min_apperances`` times are kept. Kept items
    have their internal whitespace removed (``"Tom Hanks"`` -> ``"TomHanks"``)
    and are re-joined with ``","``. Missing cells become ``""``. The column is
    overwritten in place.

    Parameters
    ----------
    column : str
        Name of the column to rewrite.
    min_apperances : int
        Exclusive lower bound on the number of occurrences an item needs in
        order to be kept.
    data : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
    """
    frame = df if data is None else data

    def split_items(cell):
        # Strip every item so "A, B" and "B,A" are counted as the same "B";
        # empty items (from missing cells or stray commas) are discarded.
        pieces = (piece.strip() for piece in cell.split(','))
        return [piece for piece in pieces if piece]

    items_per_row = frame[column].fillna('').map(split_items)

    # Count once; rows with no items explode to NaN, which value_counts() skips
    counts = items_per_row.explode().value_counts()
    frequent = set(counts[counts > min_apperances].index)

    # Collapse internal whitespace so each kept item is a single token
    frame[column] = items_per_row.map(
        lambda items: ','.join(''.join(item.split()) for item in items if item in frequent)
    )

In [6]:
# Filter rare items out of these list columns (threshold: more than 10 occurrences).
# production_countries and genres are not filtered; they only get NaN -> '' below.
for list_column in ('cast', 'production_companies', 'directors'):
    encode_column(list_column, 10)

# Remaining text/list columns: replace missing values with an empty string
for text_column in ('spoken_languages', 'tagline', 'production_countries', 'genres', 'overview'):
    df[text_column] = df[text_column].replace(np.nan, '')

In [None]:
# Column groups. `text_features` and `list_based_features` drive the
# ColumnTransformer below, so their order is the order of the encoded columns.
numerical_features = ['budget', 'revenue', 'popularity', 'runtime', 'vote_count', 'release_year', 'release_month']
categorical_features = ['original_language', 'status']
# Kept for reference only: these columns are not routed through the preprocessor.
boolean_features = ['adult', 'video']
text_features = ['overview', 'title', 'tagline']
list_based_features = ['genres', 'production_companies', 'production_countries', 'cast', 'directors']

# 1. Numerical transformer: impute missing values with the median, then standardize.
# NOTE(review): StandardScaler is imported from sklearn.discriminant_analysis at the
# top of the notebook; its public home is sklearn.preprocessing.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 2. Categorical transformer: impute with the most frequent value, then one-hot
# encode (categories unseen at fit time are encoded as all zeros).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 3. Boolean transformer: impute with the most frequent value.
# Not used by `preprocessor` below.
boolean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# 4. Text transformer: TF-IDF over the 100 most frequent non-stop-word terms
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=100))
])

# 5. List-based transformer: binary (multi-hot) encoding of the 100 most common
# items. `binary=True` makes every non-zero entry 1 instead of a raw count.
# NOTE(review): the default token pattern keeps alphanumeric runs of 2+ characters,
# so items containing spaces or punctuation are split into several tokens —
# confirm this is acceptable for the columns listed in `list_based_features`.
list_transformer = Pipeline(steps=[
    ('countvec', CountVectorizer(max_features=100, binary=True))
])

# Combine all transformers into a ColumnTransformer.
# Text/list columns are selected by a scalar name so each vectorizer receives a
# 1-D column of strings. Columns not listed here are dropped (default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        *[(column, text_transformer, column) for column in text_features],
        *[(column, list_transformer, column) for column in list_based_features],
    ]
)


In [8]:
# Feature frame: remove the label column 'category', 'vote_average' (presumably
# the score the label is derived from — confirm) and the unused columns in ONE
# drop. Two successive `X = df.drop(...)` statements each start from `df`, so
# the second one would silently put 'vote_average' and 'category' back into X.
X = df.drop(columns=['vote_average', 'category', 'video', 'adult', 'release_day'])
# Target labels
Y = df['category']

In [9]:
# Fit every transformer of the preprocessor on X and encode X in the same pass
new_X = preprocessor.fit_transform(X)
# Bare last expression: displays the shape of the encoded matrix
new_X.shape

(8784, 734)

In [10]:
# Import the submodule explicitly: a plain `import scipy` is not guaranteed to
# make `scipy.sparse` available as an attribute.
import scipy.sparse

print(type(new_X))
# Convert to dense if it's sparse. issparse() covers every scipy sparse
# container, not only the CSR/CSC matrix classes.
if scipy.sparse.issparse(new_X):
    new_X = new_X.toarray()

<class 'scipy.sparse._csr.csr_matrix'>


In [11]:
# Wrap the encoded matrix in a DataFrame whose columns carry the
# feature names generated by the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()
new_df = pd.DataFrame(data=new_X, columns=feature_names)

In [12]:
# Give new_df the same row labels as df so the concat below pairs rows by index
new_df.index = df.index

# Append the 'category' column to the encoded features
target_column = df['category']
merged_df = pd.concat(objs=[new_df, target_column], axis='columns')

print(merged_df.head())

   num__budget  num__revenue  num__popularity  num__runtime  num__vote_count  \
0     4.757463      1.005269         0.425819      0.448734         1.716450   
1     2.894642      1.159981         0.366180     -0.230573         1.636140   
2     3.826052      2.100421         0.503760      0.739865         1.420090   
3     2.628524      0.670763         0.183190     -0.230573         1.375134   
4     1.510832      0.743000         0.159881     -0.085007         0.961366   

   num__release_year  num__release_month  cat__original_language_ar  \
0           0.483259           -0.241059                        0.0   
1           0.605583           -0.533445                        0.0   
2           0.789068           -0.241059                        0.0   
3           0.727906            0.343713                        0.0   
4          -0.006037           -1.410603                        0.0   

   cat__original_language_bn  cat__original_language_bs  ...  \
0                        0.0

In [13]:
# Save the encoded features plus the 'category' column to
# input/movie_data_encoded.csv (pipe-separated, without the index)
merged_df.to_csv(
    'input/movie_data_encoded.csv',
    sep='|',
    index=False,
)