### import pandas as pd

In [1]:
import pandas as pd

# Read in the data files into pandas tables

In [2]:
# Data is from https://grouplens.org/datasets/movielens/
# Use "MovieLens 1M Dataset"
unames = ['user_id', 'gender', 'age', 'occupation', 'zip'] 
users = pd.read_table('/ml-1m/users.dat', sep='::', header=None, 
                      names=unames, engine='python')

In [24]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('/ml-1m/movies.dat', sep='::', header=None, encoding='latin1',
                        names=mnames, engine='python')

In [25]:
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('/ml-1m/ratings.dat', sep='::', header=None, 
                        names=rnames, engine='python')

In [26]:
users[:5]

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [27]:
ratings[:5]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [28]:
movies[:5]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [29]:
ratings.size

4000836

# Creating a merged database

In [34]:
movies_ratings = pd.merge(movies, ratings)
movies_ratings[:5]

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [31]:
data = pd.merge(movies_ratings, users)
data[:5]

NameError: name 'movies_ratings' is not defined

# Start to explore the merged database

In [None]:
# Get average ratings of all movies and separate by gender
mean_ratings = data.pivot_table('rating', index=['title'],
                    columns='gender', aggfunc='mean')
mean_ratings[:5]

In [None]:
mean_ratings

In [None]:
mean_ratings2 = data.pivot_table('rating', index=['title', 'genres'],
                    columns='gender', aggfunc='mean')
mean_ratings2[:5]

In [None]:
num_ratings = data.groupby('title').size()
num_ratings[:10]

In [None]:
# meaningful ratings are when we have at least 250 people rate a movie
meaningful_ratings = num_ratings.index[num_ratings >= 250]
meaningful_ratings

In [None]:
meaningful_mean_ratings = mean_ratings.loc[meaningful_ratings]

In [None]:
meaningful_mean_ratings[:10]

In [None]:
top_female_ratings = meaningful_mean_ratings.sort_values(by='F', ascending=False)

In [None]:
top_female_ratings[:10]

In [None]:
top_male_ratings = meaningful_mean_ratings.sort_values(by='M', ascending=False)
top_male_ratings[:10]

In [None]:
meaningful_mean_ratings['diff'] = meaningful_mean_ratings['M'] - meaningful_mean_ratings['F']
sorted_by_diff = meaningful_mean_ratings.sort_values(by='diff')
sorted_by_diff[:10]

In [None]:
sorted_by_diff[-10:]

In [None]:
rating_std = data.groupby('title')['rating'].std()
rating_std = rating_std.loc[meaningful_ratings] # filter only meaningful ones
rating_std.sort_values(ascending=False)[:10]

In [None]:
rating_std.sort_values()[:10]