In [65]:
import pandas as pd

def get_dummies(df: pd.DataFrame, id_col: str, col_to_split: str, split_char: str=",") -> pd.DataFrame:
    """One-hot encode a delimiter-separated string column, keeping an id column.

    Each cell of ``df[col_to_split]`` is split on ``split_char``; every
    distinct, whitespace-stripped, non-empty token becomes one 0/1 integer
    column in the result.

    Args:
        df: Input frame. It is not modified.
        id_col: Name of the column copied unchanged into the result.
        col_to_split: Name of the string column holding the delimited tokens.
        split_char: Delimiter separating tokens inside a cell.

    Returns:
        A new DataFrame with the same index and row order as ``df``, holding
        ``id_col`` followed by one integer indicator column per token (column
        order as produced by ``pd.get_dummies``). Rows whose cell is missing
        or contains only empty tokens get 0 in every indicator column.

    Raises:
        ValueError: If ``id_col`` or ``col_to_split`` is not a column of ``df``.
    """
    if col_to_split not in df.columns or id_col not in df.columns:
        raise ValueError(f"Input DataFrame must contain '{col_to_split}' and '{id_col}' columns.")

    # Work on row positions rather than index labels, so rows that share an
    # index label are never merged by the groupby/join below.
    tokens = (
        df[col_to_split]
        .reset_index(drop=True)
        .str.split(split_char)
        .explode()
        .str.strip()
    )
    # Drop missing values and empty tokens (e.g. from "a,,b" or an empty cell).
    tokens = tokens[tokens.notna() & (tokens != "")]

    indicators = (
        pd.get_dummies(tokens, prefix="", prefix_sep="", dtype=int)
        .groupby(level=0)
        # max, not sum: a token repeated within one cell must still yield 1.
        .max()
        # Rows that produced no tokens are absent after the groupby; add them
        # back as all-zero rows without touching the id column.
        .reindex(pd.RangeIndex(len(df)), fill_value=0)
    )

    result_df = df[[id_col]].reset_index(drop=True).join(indicators)
    # Restore the caller's original index labels.
    result_df.index = df.index
    return result_df

In [66]:
# Load the ratings table; later cells read its "movieId" column.
ratings_df = pd.read_csv("../data/ml-100k/ratings.csv")

In [67]:
# Load the movies table; the next cell uses its "movieId" and "genres" columns.
movies_d = pd.read_csv("../data/ml-100k/movies.csv")

In [68]:
# Expand the "|"-separated "genres" column into one indicator column per genre,
# keeping "movieId" as the identifier.
items_categorized_df = get_dummies(
    df=movies_d,
    id_col="movieId",
    col_to_split="genres",
    split_char="|",
)

In [69]:
# Inspect the resulting columns: the id column followed by the indicator columns.
items_categorized_df.columns

Index(['movieId', '(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [70]:
# Persist the encoded item table; index=False keeps the row index out of the CSV.
items_categorized_df.to_csv("../data/ml-100k/items.csv", index=False)

In [71]:
# movieIds that appear in the ratings but not in the encoded item table.
not_data_items = set(ratings_df['movieId'].unique()) - set(items_categorized_df['movieId'].unique())

In [72]:
# Read back the items.csv file that an earlier cell wrote with to_csv.
items_df = pd.read_csv("../data/ml-100k/items.csv")

In [73]:
# movieIds referenced by ratings that have no row in the reloaded item table.
not_data_items = set(ratings_df["movieId"].unique()).difference(
    items_df["movieId"].unique()
)
print(f"Warning: {len(not_data_items)} items in ratings are missing from items_df metadata.")


