In [106]:
from pathlib import Path

import pandas as pd
from numpy import nan, ndarray
from pandas import DataFrame
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    MultiLabelBinarizer,
    OneHotEncoder,
)
from sklearn.metrics.pairwise import cosine_similarity

In [107]:
df = pd.read_parquet("data/01_raw/movies_dataset_2025-05-07.parquet")

In [108]:
use_features: list[str] = [
    "title",
    "original_language",
    "popularity",
    "vote_average",
    "vote_count",
    "is_popular",
    "runtime",
    "budget",
    "revenue",
    "genres",
    "spoken_languages",
]

df = df[use_features]

In [109]:
df.drop_duplicates(
    subset=[
        "original_language",
        "popularity",
        "vote_average",
        "vote_count",
        "is_popular",
        "runtime",
        "budget",
        "revenue",
    ],
    inplace=True,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(


In [110]:
cat_cols = ["original_language"]
df[cat_cols] = df[cat_cols].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cat_cols] = df[cat_cols].astype("category")


In [111]:
df["is_popular"] = df["is_popular"].astype("int8")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["is_popular"] = df["is_popular"].astype("int8")


In [112]:
# ruff: noqa: RUF001
spoken_languages_mappings = {
    "Français": "European (Romance)",
    "Español": "European (Romance)",
    "English": "European (Germanic)",
    "Deutsch": "European (Germanic)",
    "हिन्दी": "South Asian",
    "广州话 / 廣州話": "East Asian",
    "日本語": "East Asian",
    "Italiano": "European (Romance)",
    "Pусский": "European (Slavic)",
    "Nederlands": "European (Germanic)",
    "isiZulu": "African",
    "ภาษาไทย": "Southeast Asian",
    "普通话": "East Asian",
    "Bahasa indonesia": "Southeast Asian",
    "": "Unknown/Other",
    "தமிழ்": "South Asian",
    "suomi": "European (Other)",
    "한국어/조선말": "East Asian",
    "български език": "European (Slavic)",
    "Català": "European (Romance)",
    "Türkçe": "Middle Eastern/Central Asian",
    "Português": "European (Romance)",
    "Norsk": "European (Germanic)",
    "Dansk": "European (Germanic)",
    "svenska": "European (Germanic)",
    "Lietuvių": "European (Other)",
    "Polski": "European (Slavic)",
    "తెలుగు": "South Asian",
    "עִבְרִית": "Middle Eastern/Central Asian",
    "Український": "European (Slavic)",
    "Latin": "European (Other)",  # Could also be considered Historical
    "?????": "Unknown/Other",
    "No Language": "Unknown/Other",
    "اردو": "South Asian",
    "العربية": "Middle Eastern/Central Asian",
    "Română": "European (Romance)",
    "Íslenska": "European (Germanic)",
    "Magyar": "European (Other)",
    "فارسی": "Middle Eastern/Central Asian",
    "Bahasa melayu": "Southeast Asian",
    "Galego": "European (Romance)",
    "ქართული": "European (Other)",  # Kartvelian is a unique family, grouped here for simplicity
    "euskera": "European (Other)",  # Language Isolate, grouped here
    "Èdè Yorùbá": "African",
    "Wolof": "African",
    "Gaeilge": "European (Other)",  # Celtic, grouped here
    "Hrvatski": "European (Slavic)",
    "ελληνικά": "European (Other)",  # Hellenic, grouped here
    "Slovenčina": "European (Slavic)",
    "πੰਜਾਬੀ": "South Asian",
    "Český": "European (Slavic)",
    "Tiếng Việt": "Southeast Asian",
    "Fulfulde": "African",
    "қазақ": "Middle Eastern/Central Asian",
    "Esperanto": "Unknown/Other",  # Constructed language
    "Èʋegbe": "African",
    "বাংলা": "South Asian",
    "پښتو": "Middle Eastern/Central Asian",
    "shqip": "European (Other)",  # Albanian, grouped here
    "Srpski": "European (Slavic)",
    "Afrikaans": "European (Germanic)",
    "Kiswahili": "African",
    "Eesti": "European (Other)",  # Uralic, grouped here
    "Slovenščina": "European (Slavic)",
    "Bamanankan": "African",
    "Azərbaycan": "Middle Eastern/Central Asian",
    "Bosanski": "European (Slavic)",
    "සිංහල": "South Asian",
    "Latviešu": "European (Other)",  # Baltic, grouped here
    "Malti": "Middle Eastern/Central Asian",
    nan: "Unknown/Other",
}

In [113]:
original_language_mappings = {
    "fr": "European (Romance)",
    "es": "European (Romance)",
    "en": "European (Germanic)",
    "te": "South Asian",
    "de": "European (Germanic)",
    "hi": "South Asian",
    "ja": "East Asian",
    "nl": "European (Germanic)",
    "th": "Southeast Asian",
    "id": "Southeast Asian",
    "ht": "European (Romance)",
    "it": "European (Romance)",
    "ta": "South Asian",
    "ml": "South Asian",
    "fi": "European (Other)",
    "ko": "East Asian",
    "bg": "European (Slavic)",
    "ca": "European (Romance)",
    "pt": "European (Romance)",
    "tr": "Middle Eastern/Central Asian",
    "no": "European (Germanic)",
    "tl": "Southeast Asian",
    "da": "European (Germanic)",
    "zu": "African",
    "sv": "European (Germanic)",
    "pl": "European (Slavic)",
    "uk": "European (Slavic)",
    "zh": "East Asian",
    "ru": "European (Slavic)",
    "kn": "South Asian",
    "xx": "Unknown/Other",
    "cn": "East Asian",
    "ar": "Middle Eastern/Central Asian",
    "hu": "European (Other)",
    "fa": "Middle Eastern/Central Asian",
    "mn": "East Asian",
    "yo": "African",
    "ro": "European (Romance)",
    "sk": "European (Slavic)",
    "jv": "Southeast Asian",
    "cs": "European (Slavic)",
    "ur": "South Asian",
    "pa": "South Asian",
    "is": "European (Germanic)",
    "hr": "European (Slavic)",
    "vi": "Southeast Asian",
    "lv": "European (Other)",
    "km": "Southeast Asian",
    "ms": "Southeast Asian",
    "kk": "Middle Eastern/Central Asian",
    "ka": "European (Other)",
    "ga": "European (Other)",
    "xh": "African",
    "ig": "African",
    "el": "European (Other)",
    "bn": "South Asian",
    "tt": "Middle Eastern/Central Asian",
    "gl": "European (Romance)",
    "mk": "European (Slavic)",
    "bo": "East Asian",
    "dz": "South Asian",
    "he": "Middle Eastern/Central Asian",
    "sr": "European (Slavic)",
    "ff": "African",
    "gu": "South Asian",
    "ab": "European (Other)",
    "et": "European (Other)",
    "kl": "European (Other)",
    "lt": "European (Other)",
    "se": "European (Other)",
    "eu": "European (Other)",
    "bs": "European (Slavic)",
    "lb": "European (Germanic)",
    "mi": "Southeast Asian",
    "hy": "Middle Eastern/Central Asian",
    "su": "Southeast Asian",
    "mt": "Middle Eastern/Central Asian",
    "sl": "European (Slavic)",
}

In [114]:
def get_features_names(_, feature_names) -> ndarray:
    return feature_names


def map_lang(X: DataFrame, col: str, mappings: dict[str, str]) -> DataFrame:
    """
    Map language codes to broader categories.
    """
    X = X.copy()
    if X[col].dtype.name == "object":  # if it's a list
        X[col] = X[col].apply(lambda x: [mappings.get(item, "Unknown/Other") for item in x])
        return X
    X[col] = X[col].map(mappings)
    return X

In [115]:
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    """A custom transformer to apply MultiLabelBinarizer within a scikit-learn pipeline.

    This transformer is designed to be used with `ColumnTransformer` on a single
    column of a pandas DataFrame that contains lists of labels (multi-label data).
    It wraps the functionality of `sklearn.preprocessing.MultiLabelBinarizer` and
    provides a `get_feature_names_out` method compatible with scikit-learn pipelines.
    """

    def __init__(self):
        """Initializes the MultiLabelBinarizerTransformer."""
        # Initialize MultiLabelBinarizer here, but don't fit it yet
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        """Fits the MultiLabelBinarizer on the input data.

        Args:
            X: A pandas DataFrame slice with one column containing lists of labels.
            y: Ignored.

        Returns:
            self: Returns the instance itself.
        """
        # X will be a DataFrame slice with one column (e.g., df[['genres']])
        # Fit MultiLabelBinarizer on the values of that column
        self.mlb.fit(X.iloc[:, 0])
        return self

    def transform(self, X):
        """Transforms the input data using the fitted MultiLabelBinarizer.

        Args:
            X: A pandas DataFrame slice with one column containing lists of labels.

        Returns:
            numpy.ndarray: A sparse matrix representing the binarized labels.
        """
        # Transform the values of the column
        return self.mlb.transform(X.iloc[:, 0])

    def get_feature_names_out(self, input_features=None):
        """Gets the output feature names after binarization.

        Args:
            input_features: Ignored.

        Returns:
            list: A list of strings representing the output feature names (the labels).
        """
        # Return the classes learned by the fitted MultiLabelBinarizer
        return self.mlb.classes_.tolist()

In [116]:
num_cols: list[str] = [
    "popularity",
    "vote_average",
    "vote_count",
    "runtime",
    "budget",
    "revenue",
    "is_popular",
]
cat_cols: list[str] = ["original_language"]
multi_label_cat_cols: list[str] = ["genres", "spoken_languages"]

In [117]:

num_pipe = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
cat_pipe = Pipeline(
    steps=[
        (
            "language mapper",
            FunctionTransformer(
                map_lang,
                kw_args={
                    "col": "original_language",
                    "mappings": original_language_mappings,
                },
                feature_names_out=get_features_names,
            ),
        ),
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one-hot", OneHotEncoder(drop="first")),
    ]
)

multi_label_genres_pipe = Pipeline(steps=[("binarizer", MultiLabelBinarizerTransformer())])

multi_label_spoken_languages_pipe = Pipeline(
    steps=[
        (
            "language mapper",
            FunctionTransformer(
                map_lang,
                kw_args={
                    "col": "spoken_languages",
                    "mappings": spoken_languages_mappings,
                },
                feature_names_out=get_features_names,
            ),
        ),
        ("binarizer", MultiLabelBinarizerTransformer()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
        ("genres", multi_label_genres_pipe, ["genres"]),
        ("spoken_languages", multi_label_spoken_languages_pipe, ["spoken_languages"]),
    ],
)

In [118]:
preprocessor

In [119]:
preprocessed = preprocessor.fit_transform(df)
feature_names = preprocessor.get_feature_names_out()
preprocessed = pd.DataFrame(preprocessed, columns=feature_names)

In [120]:
def get_recommendations(
    title: str,
    cosine_sim: ndarray,
    df:DataFrame,
    sim_movies: int = 10
) -> DataFrame:
  try:
    idx = df.index[df["title"] == title].tolist()[0]
  except IndexError:
    print(f"No movie found with title: {title}")
    return DataFrame()
  sim_scores: list = list(enumerate(cosine_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:sim_movies + 1]
  movies_idx: list = [i[0] for i in sim_scores]
  return df.iloc[movies_idx]


In [121]:
class CosineSimilarityRecommender(BaseEstimator, TransformerMixin):
    """
    A scikit-learn compatible transformer/estimator for cosine similarity based recommendation.
    """
    def __init__(self, sim_movies: int = 10):
        """
        Initializes the CosineSimilarityRecommender.
        """
        self.sim_movies = sim_movies
        self.cosine_sim_matrix = None
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        """
        Calculates the cosine similarity matrix from the input data.

        Args:
            X: The input data (should be the preprocessed numerical data).
            y: Ignored.

        Returns:
            self: Returns the instance itself.
        """
        if not isinstance(X, (ndarray, pd.DataFrame)):
             raise TypeError("Input X must be a numpy array or pandas DataFrame.")

        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()

        self.cosine_sim_matrix = cosine_similarity(X, X)

        return self

    def transform(self, X):
        """
        This method is not typically used for making recommendations in a pipeline
        context, but is required for TransformerMixin.
        You might return the similarity matrix or raise an error depending on use case.
        Here, we'll return the similarity matrix.
        """
        if self.cosine_sim_matrix is None:
             raise RuntimeError("The recommender has not been fitted yet. Call fit() first.")
        return self.cosine_sim_matrix

    def predict(self, X, movie_title: str) -> pd.DataFrame:
        """
        Provides movie recommendations based on cosine similarity for a given movie title.

        Args:
            X: The input data (the original DataFrame including 'title').
            movie_title: The title of the movie for which to get recommendations.
            sim_movies: The number of similar movies to recommend.

        Returns:
            DataFrame: A DataFrame containing the recommended movies.
        """
        if self.cosine_sim_matrix is None:
             raise RuntimeError("The recommender has not been fitted yet. Call fit() first.")
        try:
            # Get the index of the movie that matches the title from the original DataFrame
            idx = X.index[X['title'] == movie_title].tolist()[0]
        except IndexError:
            print(f"No movie found with title: {movie_title}")
            return pd.DataFrame()
        sim_scores = list(enumerate(self.cosine_sim_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:self.sim_movies + 1]
        movie_indices: list = [i[0] for i in sim_scores]
        return X.iloc[movie_indices]

In [122]:
model_pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("recommender", CosineSimilarityRecommender()),
    ]
)

In [123]:
model_pipe.fit(df)
model_pipe.predict(df, "Star")

TypeError: Pipeline.predict() takes 2 positional arguments but 3 were given

In [None]:
cosine_sim = cosine_similarity(preprocessed, preprocessed)

In [None]:
get_recommendations("Star", cosine_sim, df, 2)