In [1]:
import pandas as pd

In [2]:
# Read the ratings table from CSV (relative path, resolved against the working directory).
df_rating = pd.read_csv("res/ratings.csv")

In [3]:
# Preview the first rows of the loaded ratings as a quick sanity check.
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Keep every column except "timestamp"; errors="ignore" keeps this a no-op when the column is absent.
df_newRatings = df_rating.drop(columns=["timestamp"], errors="ignore")

In [14]:
# Summary statistics for the rows that rate movieId 1.
df_newRatings.loc[df_newRatings["movieId"].eq(1)].describe()

Unnamed: 0,userId,movieId,rating
count,215.0,215.0,215.0
mean,306.530233,1.0,3.92093
std,180.419754,0.0,0.834859
min,1.0,1.0,0.5
25%,155.5,1.0,3.5
50%,290.0,1.0,4.0
75%,468.5,1.0,4.5
max,610.0,1.0,5.0


In [18]:
# Mean rating of movieId 1, selecting rows and the rating column in one .loc call.
df_newRatings.loc[df_newRatings["movieId"] == 1, "rating"].mean()

3.9209302325581397

In [11]:
# Peek at the first values of the movieId column (the Series repr also shows its dtype).
df_newRatings["movieId"].head()

0     1
1     3
2     6
3    47
4    50
Name: movieId, dtype: int64

In [35]:
# Highest average rating; ties are resolved in favour of the largest movieId.
def get_highest_rating_movie(ratings=None):
    """Return the movie with the highest mean rating, and that mean.

    The mean rating of every movie is computed in a single groupby pass
    instead of re-filtering the whole table once per rating row.

    Parameters
    ----------
    ratings : pandas.DataFrame, optional
        Table with at least ``movieId`` and ``rating`` columns. When omitted,
        the notebook-level ``df_newRatings`` frame is used, so existing
        zero-argument calls keep working.

    Returns
    -------
    tuple
        ``(movie_id, mean_rating)``. When several movies share the highest
        mean, the one with the largest ``movieId`` is returned.

    Raises
    ------
    ValueError
        If there is no movie with at least one non-missing rating.
    """
    if ratings is None:
        ratings = df_newRatings

    # Movies whose ratings are all missing have a NaN mean and are ignored.
    mean_by_movie = ratings.groupby("movieId")["rating"].mean().dropna()
    if mean_by_movie.empty:
        raise ValueError("cannot pick a highest-rated movie: no ratings available")

    best_mean = mean_by_movie.max()
    # Several movies can share the top mean; keep the largest ID among them.
    tied_ids = mean_by_movie.index[(mean_by_movie == best_mean).to_numpy()].tolist()
    best_id = max(tied_ids)
    return best_id, mean_by_movie.loc[best_id]

In [25]:
# Zero-argument call: the function reads the notebook-level df_newRatings frame itself.
mov_id, rate = get_highest_rating_movie()

In [26]:
# NOTE: despite its label, the first line prints the movie ID (mov_id); the rating is on the second line.
print("Max rating: ", mov_id)
print("Rating: ", rate)

Max rating:  187717
Rating:  5.0


In [27]:
# Fast version: returns the first movie (lowest movieId) among those tied for the highest average rating
def get_highest_rating_movie(df_newRatings):
    """Find the movie whose mean rating is the highest.

    Ratings are averaged per ``movieId``; ``idxmax`` then picks the first
    index label holding the maximum mean.

    Parameters
    ----------
    df_newRatings : pandas.DataFrame
        Table with ``movieId`` and ``rating`` columns.

    Returns
    -------
    tuple
        ``(movie_id, mean_rating)`` for the selected movie.
    """
    mean_by_movie = df_newRatings.groupby("movieId")["rating"].mean()
    best_id = mean_by_movie.idxmax()
    return best_id, mean_by_movie[best_id]

In [28]:
# Call the one-argument, groupby-based definition with the ratings frame passed explicitly.
movie_id, avg_rating = get_highest_rating_movie(df_newRatings)

In [30]:
# Report the selected movie ID with its mean rating formatted to two decimals.
print(f"Highest rated movie (ID {movie_id}): {avg_rating:.2f}")

Highest rated movie (ID 53): 5.00


In [34]:
def get_top_rated_movies(df_newRatings, threshold=5.0):
    """List the IDs of movies whose mean rating reaches ``threshold``.

    Parameters
    ----------
    df_newRatings : pandas.DataFrame
        Table with ``movieId`` and ``rating`` columns.
    threshold : float, default 5.0
        Minimum mean rating (inclusive) a movie needs to be listed.

    Returns
    -------
    list
        The qualifying ``movieId`` values, in the order of the grouped index.
    """
    mean_by_movie = df_newRatings.groupby("movieId")["rating"].mean()
    meets_threshold = mean_by_movie.ge(threshold)
    return mean_by_movie.loc[meets_threshold].index.tolist()

In [32]:
# No threshold is passed, so the function's default of 5.0 applies.
top_rated_ids = get_top_rated_movies(df_newRatings)

In [36]:
# Prints the complete ID list in one line; the output can be very long.
print(f"Movie IDs with an average rating of 5 or higher: {top_rated_ids}")

Movie IDs with an average rating of 5 or higher: [53, 99, 148, 467, 495, 496, 626, 633, 876, 1140, 1151, 1310, 1349, 1631, 1759, 2075, 2196, 2512, 2824, 2969, 2972, 3073, 3086, 3096, 3303, 3473, 3496, 3531, 3567, 3637, 3678, 3687, 3792, 3795, 3851, 3939, 3940, 3941, 3942, 3951, 4116, 4135, 4180, 4402, 4454, 4495, 4788, 4813, 5059, 5088, 5241, 5244, 5328, 5416, 5468, 5490, 5513, 5537, 5607, 5723, 5745, 5746, 5888, 5889, 6021, 6086, 6192, 6201, 6402, 6408, 6442, 6611, 6818, 6835, 6983, 7071, 7096, 7122, 7815, 8238, 8580, 8738, 8804, 8911, 25887, 25906, 25947, 26073, 26078, 26147, 26169, 26350, 26366, 26401, 26587, 26840, 26849, 26928, 27320, 27373, 27523, 27704, 27751, 31522, 33138, 33649, 34312, 40491, 42556, 44851, 44943, 45503, 47736, 50999, 53280, 53355, 53578, 59814, 60737, 64499, 64501, 67618, 69211, 69469, 69860, 70451, 71268, 72142, 72692, 73822, 74226, 76091, 77846, 78836, 79897, 80124, 82744, 83969, 84273, 84512, 85295, 86237, 86668, 86721, 87834, 88448, 90943, 91355, 91386, 92