# In [None]:
import pandas as pd
import numpy as np

# Load the ratings table and take a first look at it: leading rows,
# (rows, columns) shape, and per-column dtypes.
ratings = pd.read_csv('ratings.csv')
print(ratings.head())
print(ratings.shape)
print(ratings.dtypes)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
ratings.info()

# Load the movies table and take a first look at it: leading rows,
# (rows, columns) shape, and per-column dtypes.
movies = pd.read_csv('movies.csv')
print(movies.head())
print(movies.shape)
print(movies.dtypes)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
movies.info()

# Data-quality checks on both tables.

# Missing values per column.
for table in (ratings, movies):
    print(table.isna().sum())

# Number of fully duplicated rows.
for table in (ratings, movies):
    print(table.duplicated().sum())

# Rows whose rating falls outside the 0.5-5.0 range.
rating_too_low = ratings['rating'].lt(0.5)
rating_too_high = ratings['rating'].gt(5.0)
print(ratings[rating_too_low | rating_too_high])

# Rows with a negative timestamp.
negative_timestamp = ratings['timestamp'].lt(0)
print(ratings[negative_timestamp])

# Attach movie metadata to every rating row. pd.merge defaults to an inner
# join, so ratings whose movieId is absent from `movies` are dropped.
data = pd.merge(ratings, movies, on='movieId')
print(data.head())
print(data.shape)
print(data.dtypes)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
data.info()

# Keep only the rating rows whose genre string mentions Comedy.
# regex=False: 'Comedy' is a literal label, not a pattern.
# na=False: a missing genre counts as "no match"; otherwise the mask would
# contain NaN and boolean indexing would raise.
is_comedy = data['genres'].str.contains('Comedy', regex=False, na=False)
comedy = data[is_comedy]
print(comedy.head())

# Extract the first parenthesised four-digit number in the title, e.g.
# "(1995)" -> "1995". The raw string keeps "\(" and "\d" as regex escapes
# instead of invalid Python string escapes (a SyntaxWarning on Python 3.12+).
# Escaped parentheses around a single capture group do in one pass what
# previously took two extract() calls.
data['year'] = data['title'].str.extract(r'\((\d{4})\)', expand=False)
# The extracted year is a string (NaN when the title has no match), so it is
# compared against the string '2015'.
year_2015 = data[data['year'] == '2015']
print(year_2015.head())

# Average rating per movie, then keep the movies whose average exceeds 4.
mean_rating = data['rating'].groupby(data['movieId']).mean()
high_rating = mean_rating.loc[mean_rating.gt(4)]
print(high_rating.head())

# Per-movie rating summary: number of ratings, their mean, and their
# standard deviation. Named aggregation sets the final column names directly.
stats = data.groupby('movieId')['rating'].agg(
    num_ratings='count',
    avg_rating='mean',
    std_rating='std',
)
print(stats.head())

# One 0/1 indicator column per label found in the pipe-separated 'genres'
# field, row-aligned with `data`.
genres = data['genres'].str.get_dummies(sep='|')
# Plain list of column labels: groupby() treats a list as column names, but
# treats an Index object as a row-aligned array of keys and raises
# "Grouper and axis must be same length".
genre_cols = list(genres.columns)
data_genres = pd.concat([data, genres], axis=1)
# Summing the indicators over a movie's rating rows gives, for each genre,
# the number of rating rows of that movie carrying the genre.
genres_count = data_genres.groupby('movieId')[genre_cols].sum()
# Drop movies that ended up with no genre indicator at all.
genres_count = genres_count[genres_count.sum(axis=1) > 0]
# Per-movie stats next to the per-genre row counts (kept under its original
# name and contents for any later code that reads it).
stats_genres = pd.concat([stats, genres_count], axis=1)
# Group on 0/1 flags rather than raw counts, so that movies sharing the same
# set of genres land in the same group regardless of how often they were rated.
genre_flags = (genres_count > 0).astype(int)
stats_with_flags = pd.concat([stats, genre_flags], axis=1, join='inner')
# Mean of the per-movie stats for every observed genre combination.
stats_by_genres = stats_with_flags.groupby(genre_cols).mean()
print(stats_by_genres)

# Interpret the timestamp column as seconds since the epoch.
rated_at = pd.to_datetime(data['timestamp'], unit='s')
data['datetime'] = rated_at
print(data.head())

# Calendar month (1-12) of each rating, then the average rating per month.
data['month'] = data['datetime'].dt.month
ratings_by_month = data['rating'].groupby(data['month']).mean()
print(ratings_by_month)

