In [61]:
import pandas as pd
import os
from sklearn.preprocessing import MultiLabelBinarizer

In [62]:
# Build one DataFrame from every CSV file in the raw folder.
#
# Files are visited in sorted order so the resulting row order is
# reproducible (os.listdir returns entries in arbitrary order), and the
# per-file frames are concatenated once instead of growing `df` inside the
# loop, which re-copies all accumulated rows on every iteration.
#
# NOTE(review): if the same movie appears in more than one file it ends up as
# several rows here — confirm whether de-duplicating on "movie_id" is wanted.

raw_frames = [
    pd.read_csv(os.path.join("./raw", file))
    for file in sorted(os.listdir("./raw"))
    if file.endswith(".csv")
]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when the folder contains no CSV files.
df = pd.concat(raw_frames) if raw_frames else pd.DataFrame()

In [63]:
# Columns that are not carried forward into the processed dataset.
col_to_drop = [
    "movie_id",
    "certificate",
    "director_id",
    "star",
    "star_id",
    "gross(in $)",
    "description",
]

# Remove all of them with a single drop call.
df = df.drop(columns=col_to_drop)

In [64]:
# Mapping from the raw column names to the names used in the processed file.
col_to_rename = {
    "movie_name": "Original Title",
    "year": "Year",
    "runtime": "Runtime (mins)",
    "rating": "IMDb Rating",
    "votes": "Num Votes",
}

# Apply the whole mapping in one rename call.
df = df.rename(columns=col_to_rename)

In [65]:
# Discard every row that has at least one missing value, then renumber the
# remaining rows 0..n-1 (the old index is thrown away, not kept as a column).

df = df.dropna().reset_index(drop=True)

In [66]:
# One-hot encode the ", "-separated "genre" column: each distinct genre
# becomes its own 0/1 column and the original "genre" column is removed.

genre_lists = df["genre"].str.split(", ")
# .str.split leaves a non-list (NaN) where the value was missing; treat those
# rows as having no genres so the binarizer receives a list for every row.
genre_lists = genre_lists.apply(lambda x: x if isinstance(x, list) else [])

mlb = MultiLabelBinarizer()
genres_one_hot = mlb.fit_transform(genre_lists)

# Give the one-hot frame df's own index. pd.concat(axis=1) aligns on index
# labels, so a default 0..n-1 index here would only line up with df when df
# happens to carry exactly that index.
genres_df = pd.DataFrame(genres_one_hot, columns=mlb.classes_, index=df.index)

df = pd.concat([df.drop(columns=["genre"]), genres_df], axis=1)

In [67]:
# Replace "director" with a numeric column, "directors_encoded": a blend of
# the director's own mean rating and the overall mean rating, where the
# director's mean is weighted by their row count and the overall mean by
# `alpha`.

alpha = 10  # weight given to the overall mean in the blend

global_mean = df["IMDb Rating"].mean()

# Per-director mean rating and number of rows, from one grouping.
ratings_by_director = df.groupby("director")["IMDb Rating"]
director_means = ratings_by_director.mean()
director_counts = ratings_by_director.size()

weighted_sum = director_means * director_counts + global_mean * alpha
smoothed_means = weighted_sum / (director_counts + alpha)

# Look up each row's director in the blended means, then drop the text column.
df = df.assign(directors_encoded=df["director"].map(smoothed_means)).drop(
    columns=["director"]
)

In [68]:
# Remove the literal "min" unit from the runtime text and trim the whitespace
# left behind. regex=False states the literal match explicitly instead of
# relying on the default of Series.str.replace, which differs between pandas
# versions.
df["Runtime (mins)"] = (
    df["Runtime (mins)"].str.replace("min", "", regex=False).str.strip()
)

In [69]:
# Columns that must be numeric in the processed file.
col_to_num = ["Year", "Runtime (mins)"]

# errors="coerce" turns any value that cannot be parsed into NaN.
for column in col_to_num:
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Coercion can introduce NaN after the notebook's earlier dropna step, so
# remove the rows that failed to parse and renumber the index; otherwise
# those missing values would be written to the output file.
df = df.dropna(subset=col_to_num).reset_index(drop=True)

In [71]:
# Write the processed dataset without the index column. The output folder is
# created first (no error if it already exists) because to_csv does not
# create missing directories.
os.makedirs("../processed", exist_ok=True)
df.to_csv("../processed/imdb_movies.csv", index=False)