## Setup

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

import utils_02 as utils

%load_ext autoreload
%autoreload 2

## 01 Data Loading and Inspection

In [17]:
# Build the MovieLens helper from the local utils_02 module (imported as `utils`).
# The cells below read its users / ratings / movies / data attributes.
ml = utils.MovieLens()

In [6]:
# Preview the first rows of the users table.
ml.users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
# Preview the first rows of the ratings table.
ml.ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
# Preview the first rows of the movies table.
ml.movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# Preview the combined frame exposed as ml.data (rating rows alongside user and movie columns).
ml.data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [70]:
# (rows, columns) of the combined frame.
ml.data.shape

(1000209, 10)

## 02 Data Analysis

### 01 Mean rating per movie, grouped by gender

In [85]:
# Fresh helper instance for this section.
# NOTE(review): `ml` was already created in the loading section; presumably it is rebuilt
# here to pick up autoreloaded edits to utils_02 — confirm whether the reload is still needed.
ml = utils.MovieLens()

In [86]:
# Compute mean movie ratings for each movie grouped by gender (ALL movies).
# Keep full precision in the stored frame and round only when displaying
# (e.g. `frame.round(2)`): sorting on pre-rounded means creates artificial ties
# and can reorder the ranking.
mean_ratings = ml.mean_ratings_by_gender()
mean_ratings.shape

(3706, 2)

In [87]:
# Compute active titles (at least 250 ratings per title)
# NOTE(review): 250 is passed positionally; its exact meaning lives in utils_02.get_active_titles.
active_titles = ml.get_active_titles(250)
# Show how many titles qualified together with the first ten index entries.
active_titles.shape, active_titles[:10]

((1216,),
 MultiIndex([( 1,                   'Toy Story (1995)'),
             ( 2,                     'Jumanji (1995)'),
             ( 3,            'Grumpier Old Men (1995)'),
             ( 5, 'Father of the Bride Part II (1995)'),
             ( 6,                        'Heat (1995)'),
             ( 7,                     'Sabrina (1995)'),
             (10,                   'GoldenEye (1995)'),
             (11,     'American President, The (1995)'),
             (16,                      'Casino (1995)'),
             (17,       'Sense and Sensibility (1995)')],
            names=['movie_id', 'title']))

In [88]:
# Filter the mean ratings to only include active titles
# `active_titles` is a (movie_id, title) MultiIndex, so .loc selects the matching rows.
mean_ratings_active = mean_ratings.loc[active_titles]
mean_ratings_active.shape

(1216, 2)

In [89]:
# The top films among female viewers.
# Sort on the `ml.F` column constant (the same one the rating-disagreement section uses)
# instead of a hard-coded 'F', and round only the displayed copy so the stored
# frame keeps whatever precision it was given.
top_female_films = mean_ratings_active.sort_values(by=ml.F, ascending=False).head()
top_female_films.round(2)

Unnamed: 0_level_0,gender,F,M
movie_id,title,Unnamed: 2_level_1,Unnamed: 3_level_1
745,"Close Shave, A (1995)",4.64,4.47
1148,"Wrong Trousers, The (1993)",4.59,4.48
922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57,4.46
527,Schindler's List (1993),4.56,4.49
720,Wallace & Gromit: The Best of Aardman Animation (1996),4.56,4.39


### 02 Measuring Rating Disagreement

In [96]:
# Rebuild the helper and fetch the per-gender mean ratings with active=True
ml = utils.MovieLens()
mean_ratings = ml.mean_ratings_by_gender(active=True)
# Preview: take the first rows, then round just those for display
mean_ratings.head().round(2)

Unnamed: 0_level_0,gender,F,M
movie_id,title,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),4.19,4.13
2,Jumanji (1995),3.28,3.18
3,Grumpier Old Men (1995),3.07,2.99
5,Father of the Bride Part II (1995),3.21,2.89
6,Heat (1995),3.68,3.91


In [98]:
# Add a column to mean_ratings containing the difference in means
# The column is M minus F, so negative values mean the F mean is higher.
# NOTE(review): this inserts the column in place on the frame returned by
# ml.mean_ratings_by_gender(active=True); confirm that frame is not shared state inside `ml`.
mean_ratings[ml.DIFF] = mean_ratings[ml.M] - mean_ratings[ml.F]

In [101]:
# Smallest (most negative) M - F difference first: titles where the F mean is highest relative to M
mean_ratings.sort_values(by=ml.DIFF).head().round(2)

Unnamed: 0_level_0,gender,F,M,diff
movie_id,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1088,Dirty Dancing (1987),3.79,2.96,-0.83
2468,Jumpin' Jack Flash (1986),3.25,2.58,-0.68
1380,Grease (1978),3.98,3.37,-0.61
261,Little Women (1994),3.87,3.32,-0.55
3844,Steel Magnolias (1989),3.9,3.37,-0.54


In [102]:
# Largest M - F difference first: titles where the M mean is highest relative to F
mean_ratings.sort_values(by=ml.DIFF, ascending=False).head().round(2)

Unnamed: 0_level_0,gender,F,M,diff
movie_id,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1201,"Good, The Bad and The Ugly, The (1966)",3.49,4.22,0.73
3760,"Kentucky Fried Movie, The (1977)",2.88,3.56,0.68
231,Dumb & Dumber (1994),2.7,3.34,0.64
3062,"Longest Day, The (1962)",3.41,4.03,0.62
784,"Cable Guy, The (1996)",2.25,2.86,0.61


In [104]:
# Titles whose ratings disagree the most across all viewers, regardless of gender
disagreement = ml.get_titles_with_disagreement()
disagreement.head().round(2)

movie_id  title                                
231       Dumb & Dumber (1994)                     1.32
2710      Blair Witch Project, The (1999)          1.32
288       Natural Born Killers (1994)              1.31
327       Tank Girl (1995)                         1.28
2657      Rocky Horror Picture Show, The (1975)    1.26
Name: rating, dtype: float64

### 03 Group ratings by genre

In [119]:
# Fresh helper instance for the genre analysis.
# NOTE(review): `ml` is rebuilt at the start of every section; presumably to pick up
# autoreloaded edits to utils_02 — confirm whether the reload is still needed.
ml = utils.MovieLens()

In [120]:
# Preview the per-genre frame: in the output a multi-genre movie appears once per genre.
ml.data_by_genre.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation
2,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Children's
3,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Musical
4,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical


In [121]:
# (rows, columns) of the per-genre frame.
ml.data_by_genre.shape

(2101815, 10)

In [123]:
# Group by genre and age, then compute the mean rating of each (genre, age) pair
mean_ratings_by_genre_age = (
    ml.data_by_genre
    .groupby([ml.GENRES, ml.AGE])[[ml.RATING]]
    .mean()
)
mean_ratings_by_genre_age.head(10).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
genres,age,Unnamed: 2_level_1
Action,1,3.51
Action,18,3.45
Action,25,3.45
Action,35,3.54
Action,45,3.53
Action,50,3.61
Action,56,3.61
Adventure,1,3.45
Adventure,18,3.41
Adventure,25,3.44


In [126]:
# Pivot the age level (second index level) into columns: one row per genre, one column per age
mean_ratings_by_genre_age.unstack(level=ml.AGE).round(2)

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating
age,1,18,25,35,45,50,56
genres,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Action,3.51,3.45,3.45,3.54,3.53,3.61,3.61
Adventure,3.45,3.41,3.44,3.52,3.53,3.63,3.65
Animation,3.48,3.62,3.7,3.74,3.73,3.78,3.76
Children's,3.24,3.29,3.43,3.52,3.53,3.56,3.62
Comedy,3.5,3.46,3.49,3.56,3.59,3.65,3.65
Crime,3.71,3.67,3.68,3.73,3.75,3.81,3.83
Documentary,3.73,3.87,3.95,3.95,3.97,3.91,3.96
Drama,3.79,3.72,3.73,3.78,3.78,3.88,3.93
Fantasy,3.32,3.35,3.45,3.48,3.53,3.58,3.53
Film-Noir,4.15,4.0,4.06,4.06,4.11,4.18,4.13
