# MovieLens의 영화 평점 데이터

- MovieLens 사용자로부터 수집한 방대한 영화 평점 데이터
- 데이터 구성
    - 영화 평점
    - 영화에 대한 정보(장르, 개봉 연도)
    - 사용자 정보 (나이, 우편번호, 성별, 직업)

In [1]:
import pandas as pd

# Cap DataFrame display at 10 rows so printed output stays short.
pd.options.display.max_rows = 10

# Each .dat file is read with header=None, so the column labels come from
# the explicit name lists below. Fields are separated by '::'; a
# multi-character separator is why the Python parsing engine is requested.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(
    './pydata-book-2nd-edition/datasets/movielens/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(
    './pydata-book-2nd-edition/datasets/movielens/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    './pydata-book-2nd-edition/datasets/movielens/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)

In [2]:
# Preview the first five user records.
users.head(5)

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [3]:
# Preview the first five rating records.
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# Preview the first five movie records.
movies.head(5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 1. 나이와 성별에 따른 어떤 영화의 평균 평점 계산

### 1.1 테이블 병합
- ratings과 users 테이블을 병합하고 그 결과를 다시 movies 테이블과 병합

In [5]:
# Join ratings to users, then that result to movies. No join key is given,
# so pandas joins on the column names the two frames share.
data = ratings.merge(users).merge(movies)
data.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [6]:
# Show the first merged row on its own to inspect every column's value.
data.iloc[0]

user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

### 1.2 성별에 따른 각 영화의 평균 평점을 구하기

In [7]:
# Mean rating for each title (rows), split by gender (columns).
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head(5)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


In [8]:
# Titles whose 'M' mean is missing.
mean_ratings[mean_ratings['M'].isna()]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Aiqing wansui (1994),3.00,
"Alley Cats, The (1968)",4.00,
Anna (1996),4.00,
Bittersweet Motel (2000),5.00,
Chain of Fools (2000),3.00,
...,...,...
Soft Toilet Seats (1999),3.00,
Song of Freedom (1936),5.00,
Talk of Angels (1998),2.50,
Target (1995),4.00,


### 1.3 250건 이상의 평점 정보가 있는 영화 추리기

#### 1.3.1 제목별 평점 정보 건수 구하기
- 데이터를 영화 제목으로 그룹화하고 size 함수를 사용해 제목별 평점 정보 건수 구함

In [9]:
# Number of rating rows recorded for each title.
ratings_by_title = data.groupby(by='title').size()
ratings_by_title.head(5)

title
$1,000,000 Duck (1971)            37
'Night Mother (1986)              70
'Til There Was You (1997)         52
'burbs, The (1989)               303
...And Justice for All (1979)    199
dtype: int64

#### 1.3.2 평점 정보 건수가 250 이상인 영화에 대한 색인 구하기
- mean_ratings에서 항목을 선택하기 위함

In [10]:
# Keep only titles with at least 250 ratings, then take their labels.
active_titles = ratings_by_title[ratings_by_title >= 250].index
active_titles

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

#### 1.3.3 250건 이상의 평점 정보가 있는 영화 추리기

In [11]:
# Restrict the per-gender means to the rows labelled by active_titles.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.500000
101 Dalmatians (1996),3.240000,2.911215
12 Angry Men (1957),4.184397,4.328421
...,...,...
Young Guns (1988),3.371795,3.425620
Young Guns II (1990),2.934783,2.904025
Young Sherlock Holmes (1985),3.514706,3.363344
Zero Effect (1998),3.864407,3.723140


### 1.4 여성에게 높은 평점을 받은 영화 목록 확인
- 여성에게 높은 평점을 받은 영화 목록을 확인하기 위해 F 컬럼을 내림차순으로 정렬

In [12]:
# Order titles by the 'F' mean, highest first.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(5)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
Schindler's List (1993),4.562602,4.491415


### 1.5 남녀 평점 차이 구하기
- 남녀 간의 호불호가 갈리는 영화를 찾기 위해 mean_ratings에 평균 평점의 차이를 담을 수 있는 컬럼을 추가하고, 그 컬럼을 기준으로 정렬
- diff로 정렬하면 여성들이 더 선호하는 영화 순

In [13]:
# diff = M mean minus F mean, so a negative value means the F mean is higher.
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
sorted_by_diff = mean_ratings.sort_values('diff', ascending=True)

# Ascending order puts the most negative diffs first: the 10 titles
# rated highest by F relative to M.
sorted_by_diff.head(10)

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,-0.830782
Jumpin' Jack Flash (1986),3.254717,2.578358,-0.676359
Grease (1978),3.975265,3.367041,-0.608224
Little Women (1994),3.870588,3.321739,-0.548849
Steel Magnolias (1989),3.901734,3.365957,-0.535777
Anastasia (1997),3.8,3.281609,-0.518391
"Rocky Horror Picture Show, The (1975)",3.673016,3.160131,-0.512885
"Color Purple, The (1985)",4.158192,3.659341,-0.498851
"Age of Innocence, The (1993)",3.827068,3.339506,-0.487561
Free Willy (1993),2.921348,2.438776,-0.482573


In [14]:
# Reverse the row order and take the first 10, i.e. the 10 largest diffs
# (titles rated highest by M relative to F).
sorted_by_diff.iloc[::-1].head(10)

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,0.726351
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,0.676359
Dumb & Dumber (1994),2.697987,3.336595,0.638608
"Longest Day, The (1962)",3.411765,4.031447,0.619682
"Cable Guy, The (1996)",2.25,2.863787,0.613787
Evil Dead II (Dead By Dawn) (1987),3.297297,3.909283,0.611985
"Hidden, The (1987)",3.137931,3.745098,0.607167
Rocky III (1982),2.361702,2.943503,0.581801
Caddyshack (1980),3.396135,3.969737,0.573602
For a Few Dollars More (1965),3.409091,3.953795,0.544704


## 2. 성별에 관계없이 영화에 대한 호불호가 극명하게 나뉘는 영화 찾기
- 호불호는 평점의 분산이나 표준편차로 측정할 수 있다

In [15]:
# Standard deviation of the ratings for each title.
rating_std_by_title = data.groupby(by='title')['rating'].std()

# Keep only the entries labelled by active_titles.
rating_std_by_title = rating_std_by_title.loc[active_titles]

# Sort the Series by standard deviation, largest first, and show the top 10.
rating_std_by_title.sort_values(ascending=False).head(10)

title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64