In [7]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
config = {
    "rating_address_template": "dataset/raw/binge/rating_x.csv",
    "number_of_rating_addresses": 4,
    "movie_titles_address": "dataset/raw/binge/movie_titles.csv",
}

In [3]:
class BingeDataset:
    def __init__(self, rating_address_template: str, number_of_rating_addresses: int, movie_titles_address: str):
        self.rating_address_template = rating_address_template
        self.NUMBER_OF_RATING_ADDRESSES = number_of_rating_addresses
        self.movie_titles_address = movie_titles_address

    def get_df(self) -> pd.DataFrame:
        ratings_df = self.get_cleaned_ratings()
        ratings_with_median = self.replace_ratings_with_median_ratings(ratings_df)
        ratings_with_count = self.get_rating_counts(ratings_with_median)
        rating_final_df = self.merge_median_and_count_ratings(ratings_with_median, ratings_with_count)

        rating_final_df["score"] = (
            rating_final_df["median_rating"] * rating_final_df["count_of_review"]
        )

        movie_titles_final = self.get_movie_titles_df()

        df = rating_final_df.merge(
            movie_titles_final,
            how="inner",
            left_on="movie_id",
            right_on="movie_id"
        )

        df = self.change_types(df)

        return df

    def change_types(self, df: pd.DataFrame):
        return (
            df
            .astype({"year_of_publication": "int32"})
            .astype({col: "int32" for col in df.select_dtypes("int64").columns})
            .astype({col: "float32" for col in df.select_dtypes("float64").columns})
        )

    def merge_median_and_count_ratings(
        self,
        ratings_with_median: pd.DataFrame,
        ratings_with_count: pd.DataFrame
    ) -> pd.DataFrame:
        columns_mapping = {
            "median_rating_x": "median_rating",
            "median_rating_y": "count_of_review",
        }

        return (
            ratings_with_median
            .merge(ratings_with_count, how="inner", left_on="movie_id", right_on="movie_id")
            .rename(columns=columns_mapping)
        )

    def get_movie_titles_df(self) -> pd.DataFrame:

        movie_titles_columns = ("movie_id", "year_of_publication", "movie_name")

        movie_titles = pd.read_csv(
            self.movie_titles_address,
            encoding="ISO-8859-1",
            usecols=[0, 1, 2],
            names=movie_titles_columns,
        )

        return movie_titles[~(movie_titles["year_of_publication"].isna())]

    def get_rating_counts(self, ratings_with_median: pd.DataFrame) -> pd.DataFrame:
        return (
            ratings_with_median.groupby(by="movie_id")["median_rating"]
            .count()
            .reset_index()
        )

    def replace_ratings_with_median_ratings(
        self, ratings_df: pd.DataFrame
    ) -> pd.DataFrame:
        return (
            ratings_df.groupby(by="movie_id")[["rating"]]
            .median()
            .reset_index()
            .merge(ratings_df, how="inner", left_on="movie_id", right_on="movie_id")
            .drop(columns="rating_y")
            .rename(columns={"rating_x": "median_rating"})
        )

    def get_cleaned_ratings(self) -> pd.DataFrame:
        rating_columns = ["movie_id", "user_id", "datetime", "rating"]

        rating_dfs = [
            pd.read_csv(address, names=rating_columns)
            for address in (
                self.rating_address_template.replace("x", str(i))
                for i in range(1, self.NUMBER_OF_RATING_ADDRESSES)
            )
        ]

        return pd.concat(rating_dfs).drop(columns="datetime")

In [4]:
binge_dataset = BingeDataset(**config)

In [5]:
df = binge_dataset.get_df()

In [6]:
df

Unnamed: 0,movie_id,median_rating,user_id,count_of_review,score,year_of_publication,movie_name
0,1,4.0,1488844,547,2188.0,2003.0,Dinosaur Planet
1,1,4.0,822109,547,2188.0,2003.0,Dinosaur Planet
2,1,4.0,885013,547,2188.0,2003.0,Dinosaur Planet
3,1,4.0,30878,547,2188.0,2003.0,Dinosaur Planet
4,1,4.0,823519,547,2188.0,2003.0,Dinosaur Planet
...,...,...,...,...,...,...,...
73632407,13367,3.0,2339129,101,303.0,2002.0,The Mummy: Quest for the Lost Scrolls
73632408,13367,3.0,59005,101,303.0,2002.0,The Mummy: Quest for the Lost Scrolls
73632409,13367,3.0,1789683,101,303.0,2002.0,The Mummy: Quest for the Lost Scrolls
73632410,13367,3.0,1878798,101,303.0,2002.0,The Mummy: Quest for the Lost Scrolls


In [44]:
def compare_movies(df: pd.DataFrame, movie_one_id: int, movie_two_id: int):
    current_year = datetime.now().year
    movie_one_name = df.loc[df["movie_id"] == movie_one_id, "movie_name"].iloc[0]
    movie_one_score = df.loc[df["movie_id"] == movie_one_id, "score"].iloc[0]
    movie_one_age = current_year - df.loc[df["movie_id"] == movie_one_id, "year_of_publication"].iloc[0]

    movie_two_name = df.loc[df["movie_id"] == movie_two_id, "movie_name"].iloc[0]
    movie_two_score = df.loc[df["movie_id"] == movie_two_id, "score"].iloc[0]
    movie_two_age = current_year - df.loc[df["movie_id"] == movie_two_id, "year_of_publication"].iloc[0]

    line_break = "-" * 100

    print(
        (
            f"{movie_one_name} is a{movie_one_age: .0f} old movie with the score of{movie_one_score: .0f}.\n" +
            line_break + "\n"
            f"{movie_two_name} is a{movie_two_age: .0f} old movie with the score of{movie_two_score: .0f}.\n"
        )
    )

In [45]:
compare_movies(df, 11, 38)

Full Frame: Documentary Shorts is a 26 old movie with the score of 594.
----------------------------------------------------------------------------------------------------
Daydream Obsession is a 22 old movie with the score of 1604.

