In [1]:
import os
import sys
from datetime import datetime
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import pickle

##### Read the most recently processed cleaned movie data

In [36]:
# Get data directories: the project root is taken to be the parent of the
# current working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guarded so that re-running this cell does not keep appending duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Define the path to the processed data folder. The empty last component keeps
# the trailing separator, so code that concatenates a file name directly onto
# this path keeps working.
processed_data_path = os.path.join(parent_dir, 'data', 'processed', '')

# List all files in the processed data folder
files = os.listdir(processed_data_path)

# Regular expression to match the date pattern in the filenames
date_pattern = re.compile(r'movies_cleaned_data_(\d{4}-\d{2}-\d{2})\.csv')

# Extract dates from filenames
dates = []
for file in files:
    # fullmatch (not search): only accept names that are exactly the file name
    # rebuilt below, so e.g. 'backup_movies_cleaned_data_2024-01-01.csv' or
    # '...csv.bak' cannot select a date whose canonical file does not exist.
    match = date_pattern.fullmatch(file)
    if not match:
        continue
    try:
        # Convert date string to datetime object
        dates.append(datetime.strptime(match.group(1), '%Y-%m-%d'))
    except ValueError:
        # Right shape but not a real calendar date (e.g. 2024-13-40): ignore
        continue

# Fail loudly here instead of hitting a NameError on `last_processed_date`
# further down when nothing matched.
if not dates:
    raise FileNotFoundError(
        "No processed files named 'movies_cleaned_data_YYYY-MM-DD.csv' "
        f"found in {processed_data_path}"
    )

# Determine the most recent date
last_processed_date = max(dates).strftime('%Y-%m-%d')
print(f"The last processed date is: {last_processed_date}")

movies_cleaned_data_path = os.path.join(
    processed_data_path, f'movies_cleaned_data_{last_processed_date}.csv'
)

# NOTE(review): on_bad_lines='skip' silently drops malformed rows; kept as-is
# to preserve the loaded data, but consider 'warn' to make the losses visible.
df = pd.read_csv(movies_cleaned_data_path, engine='python', on_bad_lines='skip')

The last processed date is: 2024-07-11


In [11]:
# Remove the 'Unnamed: 0' column when present
# (presumably a row index written out by an earlier to_csv call - TODO confirm)
unnamed_col = 'Unnamed: 0'
if unnamed_col in df.columns:
    df = df.drop(columns=[unnamed_col])

In [12]:
# Preview the first rows of the loaded DataFrame (head() defaults to 5 rows)
df.head()

Unnamed: 0,id,title,overview,genres,popularity,vote_average,release_year,keywords,title_wrangled
0,862,Toy Story,"['led', 'toy', 'live', 'happili', 'room', 'bir...","['anim', 'comedi', 'famili']",21.946943,7.7,1995.0,"['jealousi', 'toy', 'boy', 'friendship', 'frie...","['toy', 'stori']"
1,8844,Jumanji,"['sibl', 'discov', 'enchant', 'board', 'game',...","['adventur', 'fantasi', 'famili']",17.015539,6.9,1995.0,"['board gam', 'disappear', 'based on children ...",['jumanji']
2,15602,Grumpier Old Men,"['famili', 'wed', 'reignit', 'ancient', 'feud'...","['romanc', 'comedi']",11.7129,6.5,1995.0,"['fish', 'best friend', 'duringcreditssting', ...","['grumpier', 'old', 'men']"
3,31357,Waiting to Exhale,"['cheat', 'mistreat', 'step', 'women', 'hold',...","['comedi', 'drama', 'romanc']",3.859495,6.1,1995.0,"['based on novel', 'interracial relationship',...","['wait', 'exhal']"
4,11862,Father of the Bride Part II,"['bank', 'recov', 'daughter', 'wed', 'receiv',...",['comedi'],8.387519,5.7,1995.0,"['babi', 'midlife crisi', 'confid', 'age', 'da...","['father', 'part', 'ii']"


## Normalize Numeric Data

In [23]:
num_cols = ['popularity', 'release_year', 'vote_average']

# Replace missing values in every numeric column with that column's own mean
column_means = {name: df[name].mean() for name in num_cols}
df[num_cols] = df[num_cols].fillna(value=column_means)

# Show the number of missing values left per numeric column
df[num_cols].isna().sum()

popularity      0
release_year    0
vote_average    0
dtype: int64

In [25]:
# Normalize numeric columns to the [0, 1] range.
# MinMaxScaler scales each column independently, so one fit over all numeric
# columns yields the same values as fitting column by column. Unlike the
# per-column loop, which left `scaler` fitted on the last column only, this
# leaves `scaler` fitted on every column in `num_cols`, so it can be reused to
# transform new rows.
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Explicit copy so later edits to `df` cannot affect the feature frame
normalized_num_features_df = df[num_cols].copy()

In [27]:
# Summary statistics of the scaled numeric columns; after min-max scaling each
# column's min should be 0 and its max should be 1
normalized_num_features_df.describe()

Unnamed: 0,popularity,release_year,vote_average
count,45493.0,45493.0,45493.0
mean,0.005377,0.807587,0.563237
std,0.010976,0.165315,0.190484
min,0.0,0.0,0.0
25%,0.000732,0.712329,0.5
50%,0.002094,0.869863,0.6
75%,0.006817,0.938356,0.68
max,1.0,1.0,1.0


## Text Representation

In [28]:
# Text columns that are merged into a single document per row before TF-IDF
text_cols = [
    'overview',
    'genres',
    'keywords',
    'title_wrangled',
]

def get_tfidf_matrix(df: pd.DataFrame, text_cols: list, ngram_range: tuple = (1, 2), min_df=2):
    """Build a TF-IDF matrix from several text columns of ``df``.

    For every row, the non-missing values of ``text_cols`` are converted to
    strings and joined with single spaces; the resulting documents are then
    vectorized with a freshly fitted ``TfidfVectorizer``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the text columns.
    text_cols : list
        Names of the columns to combine. Must not be empty.
    ngram_range : tuple, default (1, 2)
        Passed through to ``TfidfVectorizer(ngram_range=...)``.
    min_df : int or float, default 2
        Passed through to ``TfidfVectorizer(min_df=...)``.

    Returns
    -------
    The result of ``TfidfVectorizer.fit_transform`` on the combined text,
    with one row per row of ``df``.

    Raises
    ------
    ValueError
        If ``text_cols`` is empty.
    """
    # Fail early with a clear message instead of an obscure downstream error
    if not text_cols:
        raise ValueError("text_cols must contain at least one column name")

    # Initialize TF-IDF Vectorizer
    tf = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df)

    # Combine text from specified columns into a single string for each row;
    # missing values are dropped rather than rendered as the string 'nan'
    combined_text = df[text_cols].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )

    # Apply TF-IDF Vectorizer
    return tf.fit_transform(combined_text)

# Build the TF-IDF representation of the combined text columns
tfidf_matrix = get_tfidf_matrix(df=df, text_cols=text_cols)

## Combined Features

In [29]:
# Stack the TF-IDF features and the scaled numeric features column-wise
feature_blocks = (tfidf_matrix, normalized_num_features_df)
combined_features = hstack(feature_blocks)

# Sanity check: row counts must agree and the column counts must add up
print("Shape of tfidf_matrix:", tfidf_matrix.shape)
print("Shape of normalized_num_features:", normalized_num_features_df.shape)
print("Shape of combined_features:", combined_features.shape)

Shape of tfidf_matrix: (45493, 227985)
Shape of normalized_num_features: (45493, 3)
Shape of combined_features: (45493, 227988)


In [34]:
# Save the combined features matrix.
# os.path.join avoids the doubled separator the f-string produced when
# processed_data_path already ends with '/'; the file written is the same.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
combined_features_path = os.path.join(processed_data_path, 'combined_features.pkl')
with open(combined_features_path, 'wb') as f:
    pickle.dump(combined_features, f)