### Pandas를 활용하여 영화 평점 데이터 분석하기

- 나이 별 영화 평점 분석하기
- 실습 참조 : 
    - http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/

### 데이터 읽기

- 해당 데이터셋은 평점, 사용자 정보, 영화 정보 테이블로 구성되어 있어서 각 파일을 따로 읽어와야 한다.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Reuse the data that was loaded in the previous session: the three
# MovieLens 1M tables (users, movies, ratings), each stored in its own
# '::'-delimited .dat file. header=None is passed and the column names are
# supplied explicitly through `names`.
#
# A multi-character separator is only handled by pandas' Python parsing
# engine. Requesting that engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the default C engine.
users = pd.read_table(
    '../misc/data/ml-1m/users.dat',
    sep='::',
    header=None,
    names=['user_id', 'gender', 'age', 'occupation', 'zip'],
    engine='python',
)

movies = pd.read_table(
    '../misc/data/ml-1m/movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'title', 'genres'],
    engine='python',
)

ratings = pd.read_table(
    '../misc/data/ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
)

  
  after removing the cwd from sys.path.
  


In [3]:
# Build a single merged DataFrame: first attach the ratings to the movies,
# then attach the user table to that result. No join key is given, so each
# merge joins on the column names the two frames have in common.
movie_ratings = movies.merge(ratings)
data = movie_ratings.merge(users)

# Preview the first five merged rows.
data.head(5)

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp,gender,age,occupation,zip
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268,F,1,10,48067
1,48,Pocahontas (1995),Animation|Children's|Musical|Romance,1,5,978824351,F,1,10,48067
2,150,Apollo 13 (1995),Drama,1,5,978301777,F,1,10,48067
3,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,1,4,978300760,F,1,10,48067
4,527,Schindler's List (1993),Drama|War,1,5,978824195,F,1,10,48067


### 평점이 가장 많은 영화 20개 추려보기

In [4]:
# Find the 20 titles with the most ratings:
#   groupby     - group the merged rows by movie title
#   size        - number of rows in each group
#   sort_values - order the groups by that size, largest first
(
    data.groupby('title')
    .size()
    .sort_values(ascending=False)
    .head(20)
)

title
American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
Men in Black (1997)                                      2538
Raiders of the Lost Ark (1981)                           2514
Fargo (1996)                                             2513
Sixth Sense, The (1999)                                  2459
Braveheart (1995)                                        2443
Shakespeare in Love (1998)                               2369
Pr

### 가장 높은 평점을 받은 영화 20개 추려보기

In [5]:
# Shape the data the way we need it: for every title, compute how many
# ratings it received ('size') and the average of those ratings ('mean').
# The result has two-level columns: ('rating', 'size') and ('rating', 'mean').
#
# The aggregations are named by string rather than passed as NumPy callables
# (np.size / np.mean): recent pandas versions emit a FutureWarning when
# np.mean is handed to GroupBy.agg and recommend the string alias. The
# resulting column labels are the same either way.
best_movies = data.groupby('title').agg({'rating': ['size', 'mean']})

# Preview the first 20 titles (index order, not yet ranked).
best_movies[:20]

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",37,3.027027
'Night Mother (1986),70,3.371429
'Til There Was You (1997),52,2.692308
"'burbs, The (1989)",303,2.910891
...And Justice for All (1979),199,3.713568
1-900 (1994),2,2.5
10 Things I Hate About You (1999),700,3.422857
101 Dalmatians (1961),565,3.59646
101 Dalmatians (1996),364,3.046703
12 Angry Men (1957),616,4.295455


In [6]:
# Sort by the mean rating, highest first, and show the leading five rows.
# The sort key is the ('rating', 'mean') entry of the two-level columns.
best_movies.sort_values(
    by=[('rating', 'mean')],
    ascending=False,
).head(5)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Ulysses (Ulisse) (1954),1,5.0
Lured (1947),1,5.0
Follow the Bitch (1998),1,5.0
Bittersweet Motel (2000),1,5.0
Song of Freedom (1936),1,5.0


In [7]:
# In reality...
# Titles with a single 5-star rating end up at the top of the ranking.
# Restrict the ranking to movies with at least 100 ratings by building a
# boolean index over the titles (True where the rating count is >= 100).
movies_atleast_100 = best_movies['rating']['size'].ge(100)
movies_atleast_100.head(5)

title
$1,000,000 Duck (1971)           False
'Night Mother (1986)             False
'Til There Was You (1997)        False
'burbs, The (1989)                True
...And Justice for All (1979)     True
Name: size, dtype: bool

In [8]:
# Keep only the movies that satisfy the condition above (the boolean mask),
# sort them by mean rating in descending order, and show the top 20.
(
    best_movies[movies_atleast_100]
    .sort_values(by=[('rating', 'mean')], ascending=False)
    .head(20)
)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),628,4.56051
"Shawshank Redemption, The (1994)",2227,4.554558
"Godfather, The (1972)",2223,4.524966
"Close Shave, A (1995)",657,4.520548
"Usual Suspects, The (1995)",1783,4.517106
Schindler's List (1993),2304,4.510417
"Wrong Trousers, The (1993)",882,4.507937
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),470,4.491489
Raiders of the Lost Ark (1981),2514,4.477725
Rear Window (1954),1050,4.47619
