In [3]:
import numpy as np
import pandas as pd

In [4]:
ratings = pd.read_table('data/ml-1m/ratings.dat', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], sep='::')
users = pd.read_table('data/ml-1m/users.dat', header=None, names=['UserID','Gender','Age','Occupation','Zip-code'], sep='::')
movies = pd.read_table('data/ml-1m/movies.dat', header=None, names=['MovieID', 'Title', 'Genres'], sep='::')

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
data = pd.merge(pd.merge(users, ratings), movies)
data.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [9]:
print(data[data.UserID == 1].head())

      UserID Gender  Age  Occupation Zip-code  MovieID  Rating  Timestamp  \
0          1      F    1          10    48067     1193       5  978300760   
1725       1      F    1          10    48067      661       3  978302109   
2250       1      F    1          10    48067      914       3  978301968   
2886       1      F    1          10    48067     3408       4  978300275   
4201       1      F    1          10    48067     2355       5  978824291   

                                       Title                        Genres  
0     One Flew Over the Cuckoo's Nest (1975)                         Drama  
1725        James and the Giant Peach (1996)  Animation|Children's|Musical  
2250                     My Fair Lady (1964)               Musical|Romance  
2886                  Erin Brockovich (2000)                         Drama  
4201                    Bug's Life, A (1998)   Animation|Children's|Comedy  


In [10]:
# 生成透视表
data_gender = data.pivot_table(values='Rating', index='Title', columns='Gender', aggfunc='mean')
data_gender.head()

Gender,F,M
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


In [39]:
data_gender['difference'] = data_gender.F - data_gender.M
data_gender_sorted = data_gender.sort_values(by='difference', ascending=False)
data_gender_sorted

Gender,F,M,difference
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"James Dean Story, The (1957)",4.000000,1.000000,3.000000
"Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919)",4.000000,1.000000,3.000000
Country Life (1994),5.000000,2.000000,3.000000
Babyfever (1994),3.666667,1.000000,2.666667
"Woman of Paris, A (1923)",5.000000,2.428571,2.571429
Cobra (1925),4.000000,1.500000,2.500000
"Other Side of Sunday, The (S鴑dagsengler) (1996)",5.000000,2.928571,2.071429
Theodore Rex (1995),3.000000,1.000000,2.000000
For the Moment (1994),5.000000,3.000000,2.000000
"Separation, The (La S閜aration) (1994)",4.000000,2.000000,2.000000


In [40]:
# 平均分
data_mean_rating = data.pivot_table(values='Rating', index='Title', aggfunc='mean')
data_mean_rating_sorted = data_mean_rating.sort_values(by='Rating', ascending=False)
data_mean_rating_sorted

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
Ulysses (Ulisse) (1954),5.000000
Lured (1947),5.000000
Follow the Bitch (1998),5.000000
Bittersweet Motel (2000),5.000000
Song of Freedom (1936),5.000000
One Little Indian (1973),5.000000
Smashing Time (1967),5.000000
Schlafes Bruder (Brother of Sleep) (1995),5.000000
"Gate of Heavenly Peace, The (1995)",5.000000
"Baby, The (1973)",5.000000


In [41]:
# 评论最热门
data_rating_num = data.groupby('Title').size()
data_rating_num_sorted = data_rating_num.sort_values(ascending=False)
data_rating_num_sorted

Title
American Beauty (1999)                                                   3428
Star Wars: Episode IV - A New Hope (1977)                                2991
Star Wars: Episode V - The Empire Strikes Back (1980)                    2990
Star Wars: Episode VI - Return of the Jedi (1983)                        2883
Jurassic Park (1993)                                                     2672
Saving Private Ryan (1998)                                               2653
Terminator 2: Judgment Day (1991)                                        2649
Matrix, The (1999)                                                       2590
Back to the Future (1985)                                                2583
Silence of the Lambs, The (1991)                                         2578
Men in Black (1997)                                                      2538
Raiders of the Lost Ark (1981)                                           2514
Fargo (1996)                                              

In [43]:
data_gender_hot = data_gender.loc[data_rating_num[data_rating_num > 1000].index] #加入评分次数限制
data_gender_hot.sort_values(by='difference', ascending=False)

Gender,F,M,difference
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Rocky Horror Picture Show, The (1975)",3.673016,3.160131,0.512885
Mary Poppins (1964),4.197740,3.730594,0.467147
Gone with the Wind (1939),4.269841,3.829371,0.440471
"Full Monty, The (1997)",4.113456,3.760976,0.352481
"Little Mermaid, The (1989)",3.975936,3.632375,0.343561
Pretty Woman (1990),3.846914,3.511700,0.335213
Thelma & Louise (1991),3.916268,3.581582,0.334686
Clueless (1995),3.827004,3.514640,0.312365
Ghost (1990),3.698667,3.395194,0.303473
Willy Wonka and the Chocolate Factory (1971),4.063953,3.789474,0.274480


In [45]:
data_mean_rating_number = data_mean_rating.loc[data_rating_num[data_rating_num > 1000].index]
data_mean_rating_number_sorted = data_mean_rating_number.sort_values(by='Rating', ascending=False)
data_mean_rating_number_sorted

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
"Shawshank Redemption, The (1994)",4.554558
"Godfather, The (1972)",4.524966
"Usual Suspects, The (1995)",4.517106
Schindler's List (1993),4.510417
Raiders of the Lost Ark (1981),4.477725
Rear Window (1954),4.476190
Star Wars: Episode IV - A New Hope (1977),4.453694
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963),4.449890
Casablanca (1942),4.412822
"Sixth Sense, The (1999)",4.406263
