In [1]:
import pandas as pd
import numpy as np

In [2]:
rating_address_template = "dataset/raw/binge/rating_x.csv"
NUMBER_OF_RATING_ADDRESSES = 4

In [3]:
def get_cleaned_ratings() -> pd.DataFrame:
    rating_columns = ["movie_id", "user_id", "datetime", "rating"]

    rating_dfs = [
        pd.read_csv(address, names=rating_columns)
        for address in (
            rating_address_template.replace("x", str(i))
            for i in range(1, NUMBER_OF_RATING_ADDRESSES)
        )
    ]

    return pd.concat(rating_dfs).drop(columns="datetime")

In [4]:
ratings_df = get_cleaned_ratings()

In [5]:
ratings_df

Unnamed: 0,movie_id,user_id,rating
0,1,1488844,3
1,1,822109,5
2,1,885013,4
3,1,30878,4
4,1,823519,3
...,...,...,...
22601624,13367,2339129,4
22601625,13367,59005,4
22601626,13367,1789683,5
22601627,13367,1878798,1


In [6]:
def replace_ratings_with_median_ratings(ratings_df: pd.DataFrame) -> pd.DataFrame:
    return (
        ratings_df.groupby(by="movie_id")[["rating"]]
        .median()
        .reset_index()
        .merge(ratings_df, how="inner", left_on="movie_id", right_on="movie_id")
        .drop(columns="rating_y")
        .rename(columns={"rating_x": "median_rating"})
    )

In [7]:
ratings_with_median = replace_ratings_with_median_ratings(ratings_df)

In [8]:
def get_rating_counts(ratings_with_median: pd.DataFrame) -> pd.DataFrame:
    return (
        ratings_with_median.groupby(by="movie_id")["median_rating"]
        .count()
        .reset_index()
    )

In [9]:
ratings_with_count = get_rating_counts(ratings_with_median)

In [10]:
columns_mapping = {
    "median_rating_x": "median_rating",
    "median_rating_y": "count_of_review",
}

rating_final_df = ratings_with_median.merge(
    ratings_with_count, how="inner", left_on="movie_id", right_on="movie_id"
).rename(columns=columns_mapping)

In [11]:
rating_final_df

Unnamed: 0,movie_id,median_rating,user_id,count_of_review
0,1,4.0,1488844,547
1,1,4.0,822109,547
2,1,4.0,885013,547
3,1,4.0,30878,547
4,1,4.0,823519,547
...,...,...,...,...
73632979,13367,3.0,2339129,101
73632980,13367,3.0,59005,101
73632981,13367,3.0,1789683,101
73632982,13367,3.0,1878798,101


In [12]:
rating_final_df["score"] = (
    rating_final_df["median_rating"] * rating_final_df["count_of_review"]
)

In [13]:
rating_final_df

Unnamed: 0,movie_id,median_rating,user_id,count_of_review,score
0,1,4.0,1488844,547,2188.0
1,1,4.0,822109,547,2188.0
2,1,4.0,885013,547,2188.0
3,1,4.0,30878,547,2188.0
4,1,4.0,823519,547,2188.0
...,...,...,...,...,...
73632979,13367,3.0,2339129,101,303.0
73632980,13367,3.0,59005,101,303.0
73632981,13367,3.0,1789683,101,303.0
73632982,13367,3.0,1878798,101,303.0


In [14]:
movie_titles_address = "dataset/raw/binge/movie_titles.csv"
movie_titles_columns = ("movie_id", "year_of_publication", "movie_name")

movie_titles = pd.read_csv(
    movie_titles_address,
    encoding="ISO-8859-1",
    usecols=[0, 1, 2],
    names=movie_titles_columns,
)

In [15]:
movie_titles_final = movie_titles[~(movie_titles["year_of_publication"].isna())]

In [16]:
df = rating_final_df.merge(
    movie_titles_final, how="inner", left_on="movie_id", right_on="movie_id"
)

In [17]:
df = df.astype({col: "int32" for col in df.select_dtypes("int64").columns})
df = df.astype({col: "float32" for col in df.select_dtypes("float64").columns})

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73632412 entries, 0 to 73632411
Data columns (total 7 columns):
 #   Column               Dtype  
---  ------               -----  
 0   movie_id             int32  
 1   median_rating        float32
 2   user_id              int32  
 3   count_of_review      int32  
 4   score                float32
 5   year_of_publication  float32
 6   movie_name           object 
dtypes: float32(3), int32(3), object(1)
memory usage: 2.2+ GB


In [19]:
def get_best_scored_movies(df, n_movies=5) -> np.array:
    return df.sort_values(by="score", ascending=False)["movie_name"].unique()[:n_movies]

In [20]:
get_best_scored_movies(df)

array(['Forrest Gump',
       'Pirates of the Caribbean: The Curse of the Black Pearl',
       'Pretty Woman', 'Lord of the Rings: The Two Towers',
       'Lord of the Rings: The Fellowship of the Ring'], dtype=object)