# MovieLens data set
# Question: How do men and women rate the same movie?

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

### Read three data files

In [2]:
# Users: pipe-separated file; column names are supplied explicitly.
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(
    'u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# Ratings: tab-separated file.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Movies: u.item has a large number of fields, so `usecols` restricts the
# read to the first two columns, which are named movie_id and title here.
m_cols = ['movie_id', 'title']
movies = pd.read_csv(
    'u.item',
    sep='|',
    names=m_cols,
    usecols=[0, 1],
    encoding='latin-1',
)

In [3]:
# Preview the first five user records.
users.head(n=5)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# Preview the first five rating records.
ratings.head(n=5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Preview the first five movie records.
movies.head(n=5)

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
# Build one merged DataFrame: movies -> ratings -> users.
# The join keys are spelled out so that an extra column shared by two frames
# cannot silently change what the merge joins on. `validate` makes pandas
# raise a MergeError if movie_id (in movies) or user_id (in users) is not
# unique, which would otherwise duplicate rating rows without any warning.
movie_ratings = pd.merge(movies, ratings, on='movie_id', how='inner',
                         validate='one_to_many')
data = pd.merge(movie_ratings, users, on='user_id', how='inner',
                validate='many_to_one')

It is always good to start off by examining the data.

In [7]:
# Preview the first five rows of the merged frame.
data.head(n=5)

Unnamed: 0,movie_id,title,user_id,rating,unix_timestamp,age,gender,occupation,zip_code
0,1,Toy Story (1995),308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),308,5,887737890,60,M,retired,95076
2,5,Copycat (1995),308,4,887739608,60,M,retired,95076
3,7,Twelve Monkeys (1995),308,4,887738847,60,M,retired,95076
4,8,Babe (1995),308,5,887736696,60,M,retired,95076


In [8]:
# Same preview, transposed so each column of the merged frame becomes a row.
data.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
movie_id,1,4,5,7,8
title,Toy Story (1995),Get Shorty (1995),Copycat (1995),Twelve Monkeys (1995),Babe (1995)
user_id,308,308,308,308,308
rating,4,5,4,4,5
unix_timestamp,887736532,887737890,887739608,887738847,887736696
age,60,60,60,60,60
gender,M,M,M,M,M
occupation,retired,retired,retired,retired,retired
zip_code,95076,95076,95076,95076,95076


### [Q] Which 10 movies have the most ratings?

#### Approach 1: 
- Form groups based on title
- find the size of each group
- sort the resultant Series with .sort_values

In [9]:
# Group the merged rows by movie title, then look at the rows of one group.
g = data.groupby(by='title')
g.get_group(name="'Til There Was You (1997)")




Unnamed: 0,movie_id,title,user_id,rating,unix_timestamp,age,gender,occupation,zip_code
3533,1300,'Til There Was You (1997),223,1,891550470,19,F,student,47906
20920,1300,'Til There Was You (1997),416,3,886315494,20,F,student,92626
26074,1300,'Til There Was You (1997),299,2,877878382,29,M,doctor,63108
27955,1300,'Til There Was You (1997),532,3,888632446,20,M,student,92705
34521,1300,'Til There Was You (1997),178,3,886678518,26,M,other,49512
72303,1300,'Til There Was You (1997),342,1,875318556,25,F,other,98006
86970,1300,'Til There Was You (1997),152,4,886535827,33,F,educator,68767
94514,1300,'Til There Was You (1997),530,2,890627207,29,M,engineer,94040
95199,1300,'Til There Was You (1997),782,2,891499469,21,F,artist,33205


In [10]:
# Number of rows per title, largest first; keep the first ten.
(
    data
    .groupby(by='title')
    .size()
    .sort_values(ascending=False)
    .head(n=10)
)

title
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
dtype: int64

#### Approach 2:  Use Pandas' .value_counts :-)

In [11]:
# Row counts of the merged frame and of the movies table.
(data.shape[0], movies.shape[0])

(100000, 1682)

In [12]:
# value_counts already returns the counts in descending order,
# so the first ten entries are the ten most frequent titles.
title_col = 'title'
data[title_col].value_counts().head(n=10)



Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
Name: title, dtype: int64

### [Q] What is the average rating for each movie?  Which movies are rated the highest? 

In [13]:
# Long-hand approach: hand each title's sub-frame to a function that
# averages its 'rating' column.
(
    data
    .groupby('title')
    .apply(lambda group: group['rating'].mean())
)




title
'Til There Was You (1997)                                   2.333333
1-900 (1994)                                                2.600000
101 Dalmatians (1996)                                       2.908257
12 Angry Men (1957)                                         4.344000
187 (1997)                                                  3.024390
2 Days in the Valley (1996)                                 3.225806
20,000 Leagues Under the Sea (1954)                         3.500000
2001: A Space Odyssey (1968)                                3.969112
3 Ninjas: High Noon At Mega Mountain (1998)                 1.000000
39 Steps, The (1935)                                        4.050847
8 1/2 (1963)                                                3.815789
8 Heads in a Duffel Bag (1997)                              3.250000
8 Seconds (1994)                                            3.750000
A Chef in Love (1996)                                       4.125000
A koldum klaka (Cold Fever) 

In [14]:
# Mean rating per title without a lambda: pick the 'rating' column of the
# grouped data and call .mean() on it directly.
(
    data
    .groupby('title')['rating']
    .mean()
)


title
'Til There Was You (1997)                                   2.333333
1-900 (1994)                                                2.600000
101 Dalmatians (1996)                                       2.908257
12 Angry Men (1957)                                         4.344000
187 (1997)                                                  3.024390
2 Days in the Valley (1996)                                 3.225806
20,000 Leagues Under the Sea (1954)                         3.500000
2001: A Space Odyssey (1968)                                3.969112
3 Ninjas: High Noon At Mega Mountain (1998)                 1.000000
39 Steps, The (1935)                                        4.050847
8 1/2 (1963)                                                3.815789
8 Heads in a Duffel Bag (1997)                              3.250000
8 Seconds (1994)                                            3.750000
A Chef in Love (1996)                                       4.125000
A koldum klaka (Cold Fever) 

### [Q] Perform the same calculation, but this time only for movies that have been rated at least 100 times

In [None]:
# For every title, pair the number of ratings with the mean rating.
# Selecting 'rating' before .apply means the function only receives that
# column rather than the whole sub-frame.
s1 = data.groupby('title')['rating'].apply(lambda r: (len(r), r.mean()))

# Boolean mask: True for titles that were rated at least 100 times.
bm = s1.apply(lambda v: v[0] >= 100)

# Answer the question: mean rating of the qualifying titles, highest first.
# (Kept as the last expression of the cell so that it is actually displayed.)
s1[bm].apply(lambda v: v[1]).sort_values(ascending=False).head(10)


### Determine the 50 movies that have been reviewed the most
(we've already done it)

In [None]:
# Rating counts of the 50 most-rated titles (value_counts sorts descending).
title_counts = data['title'].value_counts()
top50 = title_counts.iloc[:50]




## Pivot Tables

In [None]:
# Number of distinct titles in the merged frame.
data['title'].nunique()

In [None]:
# Number of rows in the merged frame.
data.shape[0]

In [None]:
# One row per title, one column per gender; each cell holds the mean rating
# ('mean' is pivot_table's default aggregation, written out here).
pvt = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
pvt.head(n=5)





In [None]:
# Number of titles (rows) in the pivot table.
pvt.shape[0]

### Find X to plot the difference in average ratings between men and women

In [None]:
# Restrict the pivot table to the rows whose titles appear in top50.
top_titles = top50.index
rev = pvt.loc[top_titles, :]

In [None]:
# Gap between the 'M' and 'F' columns for each title; a positive value means
# the 'M' column (men) is higher than the 'F' column (women).
rev['diff'] = rev['M'] - rev['F']
X = rev['diff']

# Preview kept as the last expression of the cell so that it is rendered;
# a bare expression in the middle of a cell is evaluated but never shown.
rev.head()

In [None]:
# Horizontal bar chart of the men-minus-women rating gap, sorted by gap size.
# Title and axis labels are set so the figure can be read on its own.
ax = X.sort_values().plot(kind='barh', figsize=[9, 15])
ax.set_title('Difference in mean rating (men - women), 50 most-rated movies')
ax.set_xlabel('Mean rating by men minus mean rating by women')
ax.set_ylabel('Movie title')
# Reference line at zero: bars to the right were rated higher by men,
# bars to the left were rated higher by women.
ax.axvline(0, color='black', linewidth=0.8);