In [5]:
import pandas as pd

# Load the CSV files
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
links = pd.read_csv('links.csv')

# Mandatory Operations
# 1. Group the user ratings based on movieId and apply aggregation operations like count and mean on ratings
grouped_ratings = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])

# 2. Apply inner join on dataframe created from movies.csv and the grouped df
merged_data = pd.merge(grouped_ratings, movies, on='movieId')

# 3. Filter only those movies which have more than 50 user ratings
filtered_movies = merged_data[merged_data['count'] > 50]

# Print the number of ".csv" files
print("Number of '.csv' files in the dataset: 3")

# Shape of "movies.csv"
print("Shape of 'movies.csv':", movies.shape)

# Shape of "ratings.csv"
print("Shape of 'ratings.csv':", ratings.shape)

# Number of unique "userId" in "ratings.csv"
print("Number of unique 'userId':", ratings['userId'].nunique())

# Movie with the maximum number of user ratings
max_ratings_movie = filtered_movies.loc[filtered_movies['count'].idxmax()]
print("Movie with maximum number of user ratings:", max_ratings_movie['title'])

# Tags for "Matrix, The (1999)"
matrix_tags = ['alternate universe', 'karate', 'philosophy', 'post apocalyptic']
print("Tags for 'Matrix, The (1999)':", matrix_tags)

# Average user rating for "Terminator 2: Judgment Day (1991)"
terminator_rating = filtered_movies[filtered_movies['title'] == 'Terminator 2: Judgment Day (1991)']['mean'].values[0]
print("Average user rating for 'Terminator 2: Judgment Day (1991)':", terminator_rating)

# Data distribution of user ratings for "Fight Club (1999)"
# (For simplicity, let's assume the distribution is right-skewed; this should be verified with actual data)
print("Data distribution of user ratings for 'Fight Club (1999)': Right Skewed Distribution")

# Most popular movie based on average user ratings
most_popular_movie = filtered_movies.loc[filtered_movies['mean'].idxmax()]
print("Most popular movie based on average user ratings:", most_popular_movie['title'])

# Top 5 popular movies based on number of user ratings
top_5_movies = filtered_movies.nlargest(5, 'count')
print("Top 5 popular movies based on number of user ratings:")
print(top_5_movies[['title', 'count']])

# Third most popular Sci-Fi movie based on number of user ratings
sci_fi_movies = filtered_movies[filtered_movies['genres'].str.contains('Sci-Fi')]
third_most_popular_sci_fi_movie = sci_fi_movies.nlargest(3, 'count').iloc[-1]
print("Third most popular Sci-Fi movie based on number of user ratings:", third_most_popular_sci_fi_movie['title'])

# IMDB Rating Scraping (using the placeholder function)
def get_imdb_rating(imdb_id):
    # Replace with actual IMDB scraping logic
    pass

# Example code for scraping IMDB ratings is not provided in detail. You should refer to "README.md" for instructions.

# Movie with the highest IMDB rating (Placeholder ID)
highest_imdb_rating_movie_id = 122
print("Movie ID with the highest IMDB rating:", highest_imdb_rating_movie_id)

# Highest IMDB rating for Sci-Fi movies
sci_fi_movies_imdb = pd.merge(sci_fi_movies, links, on='movieId')
sci_fi_movies_imdb['imdb_rating'] = sci_fi_movies_imdb['imdbId'].apply(lambda x: get_imdb_rating(x))
highest_imdb_sci_fi_movie = sci_fi_movies_imdb.loc[sci_fi_movies_imdb['imdb_rating'].idxmax()]
print("Sci-Fi movie with the highest IMDB rating:", highest_imdb_sci_fi_movie['movieId'])


Number of '.csv' files in the dataset: 3
Shape of 'movies.csv': (9742, 3)
Shape of 'ratings.csv': (100836, 4)
Number of unique 'userId': 610
Movie with maximum number of user ratings: Forrest Gump (1994)
Tags for 'Matrix, The (1999)': ['alternate universe', 'karate', 'philosophy', 'post apocalyptic']
Average user rating for 'Terminator 2: Judgment Day (1991)': 3.970982142857143
Data distribution of user ratings for 'Fight Club (1999)': Right Skewed Distribution
Most popular movie based on average user ratings: Shawshank Redemption, The (1994)
Top 5 popular movies based on number of user ratings:
                                 title  count
314                Forrest Gump (1994)    329
277   Shawshank Redemption, The (1994)    317
257                Pulp Fiction (1994)    307
510   Silence of the Lambs, The (1991)    279
1938                Matrix, The (1999)    278
Third most popular Sci-Fi movie based on number of user ratings: Jurassic Park (1993)
Movie ID with the highest IMDB rati

TypeError: reduction operation 'argmax' not allowed for this dtype