In [10]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import re
import matplotlib.pyplot as plt

# Load the movie catalogue and the user ratings tables from the local CSV files.
movies, ratings = (
    pd.read_csv('movie_data/movies.csv'),
    pd.read_csv('movie_data/ratings.csv'),
)

def get_year(title):
    """Extract the release year from a title such as ``'Dune (2021)'``.

    Parameters
    ----------
    title : str
        Movie title that may contain a four-digit year in parentheses.
        Non-string values (``None``, or the float NaN that stands in for a
        missing value) are accepted and treated as having no year.

    Returns
    -------
    int or None
        The first parenthesised four-digit number found in ``title``, or
        ``None`` when there is no such number or ``title`` is not a string.
    """
    # re.search raises TypeError on non-string input, which would abort a
    # whole Series.apply over the title column; treat it as "no year" instead.
    if not isinstance(title, str):
        return None
    match = re.search(r'\((\d{4})\)', title)
    return int(match.group(1)) if match else None

# Derive a release-year column from each title; the nullable Int64 dtype keeps
# years as whole numbers (no trailing ".0") even when some titles have no year.
movies['year'] = movies['title'].apply(get_year).astype('Int64')

# Restrict both tables to movies released in 2020 or later.
recent_movies = movies[movies['year'].ge(2020)]
recent_movie_ratings = ratings.loc[ratings['movieId'].isin(recent_movies['movieId'])]

# Null check: running .isnull().sum() on recent_movies and on
# recent_movie_ratings reportedly showed no missing values.

# Duplicate check: rows repeating a movieId, and ratings repeating a
# (userId, movieId) pair. Both were reported empty when inspected.
duplicates = recent_movies.loc[recent_movies.duplicated(subset='movieId')]
duplicate_ratings = recent_movie_ratings.loc[
    recent_movie_ratings.duplicated(subset=['userId', 'movieId'])
]

# Drop movies whose genres field is exactly '(no genres listed)', then drop
# the ratings that belonged to those movies.
recent_movies = recent_movies.loc[recent_movies['genres'].ne('(no genres listed)')]
recent_movie_ratings = recent_movie_ratings.loc[
    recent_movie_ratings['movieId'].isin(recent_movies['movieId'])
]

# Number of movies per genre: split the pipe-separated genres string into one
# row per (movie, genre), then count occurrences of each genre.
genres = recent_movies['genres'].str.split('|').explode()
genre_counts = genres.value_counts()


movieId  userId  rating  timestamp 
209311   2476    4.0     1665246428    1
         3514    4.5     1677158561    1
         4340    3.0     1644881521    1
         4392    3.0     1654015956    1
         4552    3.0     1644879183    1
Name: count, dtype: int64

In [14]:
# Count ratings per movie and keep the five movies with the most ratings.
movie_ratings_count = (
    recent_movie_ratings
    .groupby('movieId')['rating']
    .count()
    .sort_values(ascending=False)
    .head(5)
)

# Titles of those five movies, in the same order as the counts above.
ratings_titles = recent_movies.set_index('movieId')['title'].loc[movie_ratings_count.index]

display(movie_ratings_count, ratings_titles)

movieId
254726    5332
225173    4486
217465    4332
270698    3989
263007    3971
Name: rating, dtype: int64

movieId
254726                                 Dune (2021)
225173                                 Soul (2020)
217465                                Tenet (2020)
270698    Everything Everywhere All at Once (2022)
263007              Spider-Man: No Way Home (2021)
Name: title, dtype: object