
### This notebook preprocesses the Rotten Tomatoes dataset.

### For the Rotten Tomatoes dataset, it was not necessary to print the original dataset: after preprocessing, the number of excluded values is minimal.


In [47]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [48]:
def get_dummies(df: pd.DataFrame, id_col: str, col_to_split: str) -> pd.DataFrame:
    """Expand a comma-separated column into one integer column per distinct value.

    Each cell of ``col_to_split`` is split on ``','``; the pieces are stripped
    of surrounding whitespace and empty pieces are discarded. The pieces are
    then one-hot encoded and summed per original row.

    Args:
        df: Input frame; must contain both ``id_col`` and ``col_to_split``.
        id_col: Name of the identifier column carried over to the result.
        col_to_split: Name of the column holding comma-separated values.

    Returns:
        A frame indexed like ``df`` containing ``id_col`` followed by one
        ``int`` column per distinct value. Each cell holds how many times the
        value occurs in that row (0 when absent or when the source cell is
        missing).

    Raises:
        ValueError: If ``id_col`` or ``col_to_split`` is not a column of ``df``.
    """
    if col_to_split not in df.columns or id_col not in df.columns:
        raise ValueError(f"Input DataFrame must contain '{col_to_split}' and '{id_col}' columns.")

    # One row per individual value, still labelled with the original row index.
    tokens = df[col_to_split].str.split(',').explode().str.strip()
    tokens = tokens[tokens != '']

    dummies = pd.get_dummies(tokens, prefix='', prefix_sep='')
    # Collapse the exploded rows back to one row per original index label.
    counts = dummies.groupby(dummies.index).sum()

    result_df = df[[id_col]].join(counts)

    # Fill only the indicator columns: a frame-wide fillna(0) would also
    # overwrite missing identifiers in `id_col` with 0.
    for col in counts.columns:
        result_df[col] = result_df[col].fillna(0).astype(int)

    return result_df

In [49]:
# Load the raw critic reviews.
df = pd.read_csv(
    filepath_or_buffer='../data/rotten-tomatoes/rotten_tomatoes_critic_reviews.csv',
)

In [50]:
# Discard reviews lacking a critic, a score or a text, then exact duplicates.
df = (
    df.dropna(subset=['critic_name', 'review_score', 'review_content'])
      .drop_duplicates()
)
df.shape

(673508, 8)

In [51]:
# Keep only scores written as a fraction (they contain a "/"); the rating
# conversion in the next cell can only parse that form.
# `.copy()` makes the result an independent frame, so adding the `rating`
# column afterwards does not raise SettingWithCopyWarning.
df = df[df['review_score'].str.contains('/', regex=False, na=False)].copy()

In [52]:
def _score_to_rating(score):
    """Convert a ``"numerator/denominator"`` score string to a 5-point scale."""
    pieces = score.split('/')
    numerator = float(pieces[0])
    denominator = float(pieces[1])
    # The small offset keeps a zero denominator from raising ZeroDivisionError.
    return (numerator / (denominator + .00001)) * 5.


df['rating'] = df['review_score'].apply(_score_to_rating)

In [53]:
# Drop rows whose converted rating exceeds 5.1.
df = df.loc[df['rating'].le(5.1)]

In [54]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
df = df.reset_index(drop=True)

In [55]:
# Keep only reviews written by critics who have at least 10 reviews.
df_filtered = df.loc[
    df['critic_name'].map(df['critic_name'].value_counts()).ge(10)
].copy()

In [56]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
df_filtered = df_filtered.reset_index(drop=True)

In [57]:
# Raise every rating below 1 up to 1.0.
df_filtered['rating'] = df_filtered['rating'].clip(lower=1.0)

In [58]:
# Round ratings to one decimal and expose the review body as `text`.
df_filtered['rating'] = df_filtered['rating'].round(decimals=1)
df_filtered = df_filtered.rename(columns={'review_content': 'text'})

In [59]:
# Number of reviews per critic after filtering.
# NOTE(review): `grouped` is not referenced by any later cell visible here;
# presumably kept for interactive inspection — confirm before removing.
grouped = df_filtered.groupby('critic_name').size()

In [60]:
# Two independent encoders: `lu` for critics (users), `li` for movie links (items).
lu, li = LabelEncoder(), LabelEncoder()

In [61]:
# Fit each encoder on the filtered reviews and append the integer ids.
df_filtered = df_filtered.assign(
    userId=lu.fit_transform(df_filtered['critic_name']),
    itemId=li.fit_transform(df_filtered['rotten_tomatoes_link']),
)

In [62]:
# Keep only the columns needed for the interaction table.
df_filtered = df_filtered.loc[:, ['userId', 'itemId', 'rating', 'review_date']]

In [63]:
# Load the movie metadata; `df` is reused and no longer holds the reviews.
df = pd.read_csv(
    filepath_or_buffer='../data/rotten-tomatoes/rotten_tomatoes_movies.csv',
)

In [64]:
# One indicator column per genre, keyed by the movie link.
df_categorized = get_dummies(
    df,
    id_col="rotten_tomatoes_link",
    col_to_split="genres",
)

In [65]:
# Encode the movie links with the encoder that was already fitted on the
# review links, so the same link gets the same itemId in both frames.
# Re-fitting `li` here would renumber the links over a different set of
# values, and the merge on itemId would attach the wrong movies' genres.
# Movies whose link the encoder has never seen cannot be transformed; they
# have no counterpart among the encoded reviews, so they are dropped.
df_categorized = df_categorized[
    df_categorized['rotten_tomatoes_link'].isin(li.classes_)
].copy()
df_categorized["itemId"] = li.transform(df_categorized['rotten_tomatoes_link'])
df_categorized.drop(columns=["rotten_tomatoes_link"], inplace=True)

In [66]:
# Attach the genre indicators to every review (reviews without a matching
# movie are kept) and write the combined table to disk.
merged_df = pd.merge(df_filtered, df_categorized, how='left', on='itemId')

merged_df.to_csv("../data/rotten-tomatoes/rotten_tomatoes.csv", index=False)

In [67]:
# Display the columns of the genre table: the genre indicators plus `itemId`.
df_categorized.columns

Index(['Action & Adventure', 'Animation', 'Anime & Manga',
       'Art House & International', 'Classics', 'Comedy', 'Cult Movies',
       'Documentary', 'Drama', 'Faith & Spirituality', 'Gay & Lesbian',
       'Horror', 'Kids & Family', 'Musical & Performing Arts',
       'Mystery & Suspense', 'Romance', 'Science Fiction & Fantasy',
       'Special Interest', 'Sports & Fitness', 'Television', 'Western',
       'itemId'],
      dtype='object')