# 영화 평점 분석 실습

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

## 1. 영화 평점 데이터 적재 및 전처리

In [2]:
# 사용자 데이터 읽어오기
users = pd.read_csv('data/movielens/users.dat', sep = '::', engine = 'python',
                   names = ['사용자아이디', '성별','연령','직업','지역'])
users.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [3]:
# 평점 데이터 읽어오기
ratings = pd.read_csv('data/movielens/ratings.dat', sep = '::', engine = 'python',
                   names = ['사용자아이디', '영화아이디','평점','타임스탬프'])
ratings.head()

Unnamed: 0,사용자아이디,영화아이디,평점,타임스탬프
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# 영화데이터 읽어오기
movies = pd.read_csv('data/movielens/movies.dat', sep = '::', engine = 'python',
                   names = ['영화아이디','영화제목','장르'], encoding = 'latin-1')
movies.head()

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
#3개의 데이터프레임을 하나로 합치기
data = pd.merge(users, ratings)
data = pd.merge(data, movies)
data.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [6]:
print('사용자 수:', len(users))
print('리뷰 수:', len(ratings))
print('영화 수:', len(movies))

사용자 수: 6040
리뷰 수: 1000209
영화 수: 3883


In [7]:
# 모든 사용자가 리뷰를 했나?
ratings.사용자아이디.nunique()



6040

In [8]:
# 리뷰가 없는 영화도 있는가?
ratings


Unnamed: 0,사용자아이디,영화아이디,평점,타임스탬프
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [9]:
ratings.nunique()
# 176개의 영화는 리뷰가 없음

사용자아이디      6040
영화아이디       3706
평점             5
타임스탬프     458455
dtype: int64

## 2. 보고 싶은 영화 찾기
영화들의 평점 평균을 구하여, 사람들에게 인정받는 (평점이 높은) 영화 찾기

In [10]:
# 영화들의 평점 평균을 구하여, 평점이 높은 영화 찾기 

In [11]:
data

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


In [12]:
data.pivot_table(index = '영화제목', aggfunc = 'mean', values = '평점').sort_values(by = '평점',
                                            ascending = False).head(10)

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
Ulysses (Ulisse) (1954),5.0
Lured (1947),5.0
Follow the Bitch (1998),5.0
Bittersweet Motel (2000),5.0
Song of Freedom (1936),5.0
One Little Indian (1973),5.0
Smashing Time (1967),5.0
Schlafes Bruder (Brother of Sleep) (1995),5.0
"Gate of Heavenly Peace, The (1995)",5.0
"Baby, The (1973)",5.0


In [13]:
data.pivot_table(index = '영화제목', aggfunc = 'mean', values = '평점').nlargest(10, '평점')

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
"Baby, The (1973)",5.0
Bittersweet Motel (2000),5.0
Follow the Bitch (1998),5.0
"Gate of Heavenly Peace, The (1995)",5.0
Lured (1947),5.0
One Little Indian (1973),5.0
Schlafes Bruder (Brother of Sleep) (1995),5.0
Smashing Time (1967),5.0
Song of Freedom (1936),5.0
Ulysses (Ulisse) (1954),5.0


In [14]:
# 중복된 영화제목이 있는지 확인
movies.nunique()

영화아이디    3883
영화제목     3883
장르        301
dtype: int64

In [15]:
data.pivot_table(index = ['영화아이디', '영화제목'],  aggfunc = 'mean', values = '평점').nlargest(10, '평점')

Unnamed: 0_level_0,Unnamed: 1_level_0,평점
영화아이디,영화제목,Unnamed: 2_level_1
787,"Gate of Heavenly Peace, The (1995)",5.0
989,Schlafes Bruder (Brother of Sleep) (1995),5.0
1830,Follow the Bitch (1998),5.0
3172,Ulysses (Ulisse) (1954),5.0
3233,Smashing Time (1967),5.0
3280,"Baby, The (1973)",5.0
3382,Song of Freedom (1936),5.0
3607,One Little Indian (1973),5.0
3656,Lured (1947),5.0
3881,Bittersweet Motel (2000),5.0


In [16]:
data

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


In [17]:
data.pivot_table(index = '영화제목', aggfunc = ['mean', 'count'], values = '평점').\
                                nlargest(10, ('mean', '평점'))

Unnamed: 0_level_0,mean,count
Unnamed: 0_level_1,평점,평점
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2
"Baby, The (1973)",5.0,1
Bittersweet Motel (2000),5.0,1
Follow the Bitch (1998),5.0,1
"Gate of Heavenly Peace, The (1995)",5.0,3
Lured (1947),5.0,1
One Little Indian (1973),5.0,1
Schlafes Bruder (Brother of Sleep) (1995),5.0,1
Smashing Time (1967),5.0,2
Song of Freedom (1936),5.0,1
Ulysses (Ulisse) (1954),5.0,1


In [18]:
# 평점 평균이 4.5이상이고, 평점의 개수가 1000개 이상인 영화를 보고싶은 영화로 지정
영화평점 = data.pivot_table(index = '영화제목', aggfunc = ['mean', 'count'], values = '평점')

In [19]:
영화평점.columns = ['평점평균', '평점개수']

In [20]:
영화평점

Unnamed: 0_level_0,평점평균,평점개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.027027,37
'Night Mother (1986),3.371429,70
'Til There Was You (1997),2.692308,52
"'burbs, The (1989)",2.910891,303
...And Justice for All (1979),3.713568,199
...,...,...
"Zed & Two Noughts, A (1985)",3.413793,29
Zero Effect (1998),3.750831,301
Zero Kelvin (Kjærlighetens kjøtere) (1995),3.500000,2
Zeus and Roxanne (1997),2.521739,23


In [21]:
영화평점[(영화평점.평점평균 >= 4.4) & (영화평점.평점개수 >= 1000)] 

Unnamed: 0_level_0,평점평균,평점개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
Casablanca (1942),4.412822,1669
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963),4.44989,1367
"Godfather, The (1972)",4.524966,2223
Raiders of the Lost Ark (1981),4.477725,2514
Rear Window (1954),4.47619,1050
Schindler's List (1993),4.510417,2304
"Shawshank Redemption, The (1994)",4.554558,2227
"Sixth Sense, The (1999)",4.406263,2459
Star Wars: Episode IV - A New Hope (1977),4.453694,2991
"Usual Suspects, The (1995)",4.517106,1783


평균 평점이 만점인 영화들이 최상위에 위치함. 
일반적으로 평점이 만점인 경우는 대부분 평점의 개수가 매우 적은 경우이므로, 이를 확인하기 위해 평점의 개수도 함께 구해본다. 

## [실습 #1] 여자들이 좋아하는 영화 찾기 
### - 여성 평점이 4.0 이상이고 여성 평점의 개수가 500개 이상인 영화

In [22]:
# 1) 여성이 매긴 평점 데이터만 활용
data[data.성별 == 'F'].pivot_table(index = '영화제목', values = '평점',
                                aggfunc = ['mean', 'count'])

Unnamed: 0_level_0,mean,count
Unnamed: 0_level_1,평점,평점
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",3.375000,16
'Night Mother (1986),3.388889,36
'Til There Was You (1997),2.675676,37
"'burbs, The (1989)",2.793478,92
...And Justice for All (1979),3.828571,35
...,...,...
Your Friends and Neighbors (1998),2.888889,27
"Zed & Two Noughts, A (1985)",3.500000,8
Zero Effect (1998),3.864407,59
Zeus and Roxanne (1997),2.777778,9


In [23]:
# 2) 영화별 성적 평점을 구하기
ex1 = data.pivot_table(index = '영화제목', columns = '성별', values = '평점',
                                aggfunc = ['mean', 'count'])

In [24]:
ex1[(ex1[('mean', 'F')] >= 4.0) & (ex1[('count', 'F')] >= 500)]

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
American Beauty (1999),4.238901,4.347301,946.0,2482.0
Being John Malkovich (1999),4.15993,4.113636,569.0,1672.0
Braveheart (1995),4.016484,4.297839,546.0,1897.0
Casablanca (1942),4.30099,4.46134,505.0,1164.0
E.T. the Extra-Terrestrial (1982),4.08985,3.920264,601.0,1668.0
Fargo (1996),4.217656,4.26778,657.0,1856.0
Forrest Gump (1994),4.045031,4.105806,644.0,1550.0
L.A. Confidential (1997),4.106007,4.256678,566.0,1722.0
"Matrix, The (1999)",4.128405,4.362235,514.0,2076.0
"Princess Bride, The (1987)",4.342767,4.288942,636.0,1682.0


In [25]:
####### 내가 푼 방법 ########

In [26]:
여영 = data[(data['성별'] == 'F')] \
.pivot_table(index = '영화제목', aggfunc = ['mean', 'count'], values = '평점')

In [27]:
여영

Unnamed: 0_level_0,mean,count
Unnamed: 0_level_1,평점,평점
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",3.375000,16
'Night Mother (1986),3.388889,36
'Til There Was You (1997),2.675676,37
"'burbs, The (1989)",2.793478,92
...And Justice for All (1979),3.828571,35
...,...,...
Your Friends and Neighbors (1998),2.888889,27
"Zed & Two Noughts, A (1985)",3.500000,8
Zero Effect (1998),3.864407,59
Zeus and Roxanne (1997),2.777778,9


In [28]:
여영.columns = ['평점평균', '평점개수']

In [29]:
여영[(여영.평점평균 >= 4.0) & (여영.평점개수 >= 500)]

Unnamed: 0_level_0,평점평균,평점개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
American Beauty (1999),4.238901,946
Being John Malkovich (1999),4.15993,569
Braveheart (1995),4.016484,546
Casablanca (1942),4.30099,505
E.T. the Extra-Terrestrial (1982),4.08985,601
Fargo (1996),4.217656,657
Forrest Gump (1994),4.045031,644
L.A. Confidential (1997),4.106007,566
"Matrix, The (1999)",4.128405,514
"Princess Bride, The (1987)",4.342767,636


In [30]:
###############

## [실습 #2] 실습 #1에서 구한 영화(여성인기영화)의 장르를 분석해 보자.
여성인기영화의 장르 통계 구하기

예를 들어, 여성인기영화 중 Drama 장르의 영화는 10개, Action 영화는 3개, ...

In [31]:
여성인기영화 = ex1[(ex1[('mean', 'F')] >= 4.0) & (ex1[('count', 'F')] >= 500)].index

In [32]:
여성인기영화

Index(['American Beauty (1999)', 'Being John Malkovich (1999)',
       'Braveheart (1995)', 'Casablanca (1942)',
       'E.T. the Extra-Terrestrial (1982)', 'Fargo (1996)',
       'Forrest Gump (1994)', 'L.A. Confidential (1997)', 'Matrix, The (1999)',
       'Princess Bride, The (1987)', 'Pulp Fiction (1994)',
       'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',
       'Schindler's List (1993)', 'Shakespeare in Love (1998)',
       'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)',
       'Sixth Sense, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Toy Story (1995)', 'Wizard of Oz, The (1939)'],
      dtype='object', name='영화제목')

In [33]:
movies

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [34]:
ex2 = movies[movies.영화제목.isin(여성인기영화)]

In [35]:
ex2

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
108,110,Braveheart (1995),Action|Drama|War
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
293,296,Pulp Fiction (1994),Crime|Drama
315,318,"Shawshank Redemption, The (1994)",Drama
352,356,Forrest Gump (1994),Comedy|Romance|War
523,527,Schindler's List (1993),Drama|War
589,593,"Silence of the Lambs, The (1991)",Drama|Thriller
604,608,Fargo (1996),Crime|Drama|Thriller
900,912,Casablanca (1942),Drama|Romance|War


In [36]:
ex2.장르

0              Animation|Children's|Comedy
108                       Action|Drama|War
257        Action|Adventure|Fantasy|Sci-Fi
293                            Crime|Drama
315                                  Drama
352                     Comedy|Romance|War
523                              Drama|War
589                         Drama|Thriller
604                   Crime|Drama|Thriller
900                      Drama|Romance|War
907     Adventure|Children's|Drama|Musical
1081       Children's|Drama|Fantasy|Sci-Fi
1178     Action|Adventure|Drama|Sci-Fi|War
1179       Action|Adventure|Comedy|Romance
1180                      Action|Adventure
1575      Crime|Film-Noir|Mystery|Thriller
1959                      Action|Drama|War
2327                        Comedy|Romance
2502                Action|Sci-Fi|Thriller
2693                              Thriller
2789                          Comedy|Drama
2928                                Comedy
Name: 장르, dtype: object

In [37]:
ex2 = ex2.장르.str.split('|', expand = True)

In [38]:
ex2

Unnamed: 0,0,1,2,3,4
0,Animation,Children's,Comedy,,
108,Action,Drama,War,,
257,Action,Adventure,Fantasy,Sci-Fi,
293,Crime,Drama,,,
315,Drama,,,,
352,Comedy,Romance,War,,
523,Drama,War,,,
589,Drama,Thriller,,,
604,Crime,Drama,Thriller,,
900,Drama,Romance,War,,


In [39]:
# 장르1 = ex2[0].value_counts()
# 장르2 = ex2[1].value_counts()
# 장르3 = ex2[2].value_counts()
# 장르4 = ex2[3].value_counts()
# 장르5 = ex2[4].value_counts()

In [40]:
# 장르1.add(장르4, fill_value = 0).add(장르3, fill_value = 0) ...

In [41]:
sr = Series()
for col in ex2.columns:
    sr = sr.add(ex2[col].value_counts(), fill_value = 0)

  sr = Series()


In [42]:
sr

Action         7.0
Adventure      5.0
Animation      1.0
Children's     3.0
Comedy         6.0
Crime          3.0
Drama         12.0
Fantasy        2.0
Film-Noir      1.0
Musical        1.0
Mystery        1.0
Romance        4.0
Sci-Fi         4.0
Thriller       5.0
War            6.0
dtype: float64

In [43]:
############## 나의 풀이 ###############

In [44]:
여성인기영화 = 여영[(여영.평점평균 >= 4.0) & (여영.평점개수 >= 500)]

In [45]:
여성인기영화.index

Index(['American Beauty (1999)', 'Being John Malkovich (1999)',
       'Braveheart (1995)', 'Casablanca (1942)',
       'E.T. the Extra-Terrestrial (1982)', 'Fargo (1996)',
       'Forrest Gump (1994)', 'L.A. Confidential (1997)', 'Matrix, The (1999)',
       'Princess Bride, The (1987)', 'Pulp Fiction (1994)',
       'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',
       'Schindler's List (1993)', 'Shakespeare in Love (1998)',
       'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)',
       'Sixth Sense, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Toy Story (1995)', 'Wizard of Oz, The (1939)'],
      dtype='object', name='영화제목')

In [46]:
여성인기영화

Unnamed: 0_level_0,평점평균,평점개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
American Beauty (1999),4.238901,946
Being John Malkovich (1999),4.15993,569
Braveheart (1995),4.016484,546
Casablanca (1942),4.30099,505
E.T. the Extra-Terrestrial (1982),4.08985,601
Fargo (1996),4.217656,657
Forrest Gump (1994),4.045031,644
L.A. Confidential (1997),4.106007,566
"Matrix, The (1999)",4.128405,514
"Princess Bride, The (1987)",4.342767,636


In [47]:
여성인기영화.reset_index()

Unnamed: 0,영화제목,평점평균,평점개수
0,American Beauty (1999),4.238901,946
1,Being John Malkovich (1999),4.15993,569
2,Braveheart (1995),4.016484,546
3,Casablanca (1942),4.30099,505
4,E.T. the Extra-Terrestrial (1982),4.08985,601
5,Fargo (1996),4.217656,657
6,Forrest Gump (1994),4.045031,644
7,L.A. Confidential (1997),4.106007,566
8,"Matrix, The (1999)",4.128405,514
9,"Princess Bride, The (1987)",4.342767,636


In [48]:
data2 = pd.merge(여성인기영화.reset_index(), movies)

In [49]:
data2

Unnamed: 0,영화제목,평점평균,평점개수,영화아이디,장르
0,American Beauty (1999),4.238901,946,2858,Comedy|Drama
1,Being John Malkovich (1999),4.15993,569,2997,Comedy
2,Braveheart (1995),4.016484,546,110,Action|Drama|War
3,Casablanca (1942),4.30099,505,912,Drama|Romance|War
4,E.T. the Extra-Terrestrial (1982),4.08985,601,1097,Children's|Drama|Fantasy|Sci-Fi
5,Fargo (1996),4.217656,657,608,Crime|Drama|Thriller
6,Forrest Gump (1994),4.045031,644,356,Comedy|Romance|War
7,L.A. Confidential (1997),4.106007,566,1617,Crime|Film-Noir|Mystery|Thriller
8,"Matrix, The (1999)",4.128405,514,2571,Action|Sci-Fi|Thriller
9,"Princess Bride, The (1987)",4.342767,636,1197,Action|Adventure|Comedy|Romance


In [50]:
#data2.value_counts('장르').split('|')
# data2['장르'].str.split('|').value_count
data2['장르'].str.split('|').explode().value_counts()

Drama         12
Action         7
War            6
Comedy         6
Thriller       5
Adventure      5
Sci-Fi         4
Romance        4
Children's     3
Crime          3
Fantasy        2
Musical        1
Mystery        1
Film-Noir      1
Animation      1
Name: 장르, dtype: int64

In [51]:
data

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


## [실습 #3] 남자와 여자의 호불호가 크게 갈리는 영화 10개 찾기
전체 평점의 개수가 500개 이상인 영화만 대상으로 함.


In [52]:
# 평점의 차이로 계산

In [53]:
ex3 = data.pivot_table(index = '영화제목', columns = '성별', values = '평점',
                                aggfunc = ['mean', 'count'])

In [54]:
ex3

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"$1,000,000 Duck (1971)",3.375000,2.761905,16.0,21.0
'Night Mother (1986),3.388889,3.352941,36.0,34.0
'Til There Was You (1997),2.675676,2.733333,37.0,15.0
"'burbs, The (1989)",2.793478,2.962085,92.0,211.0
...And Justice for All (1979),3.828571,3.689024,35.0,164.0
...,...,...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952,8.0,21.0
Zero Effect (1998),3.864407,3.723140,59.0,242.0
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000,,2.0
Zeus and Roxanne (1997),2.777778,2.357143,9.0,14.0


In [55]:
ex3 = ex3[ex3[('count', 'F')] + ex3[('count', 'M')] >= 500]
# ex3 = ex3[ex3['count'].sum(axis = 1) >= 500]

In [56]:
ex3

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
10 Things I Hate About You (1999),3.646552,3.311966,232.0,468.0
101 Dalmatians (1961),3.791444,3.500000,187.0,378.0
12 Angry Men (1957),4.184397,4.328421,141.0,475.0
"13th Warrior, The (1999)",3.112000,3.168000,125.0,625.0
"20,000 Leagues Under the Sea (1954)",3.670103,3.709205,97.0,478.0
...,...,...,...,...
"X-Files: Fight the Future, The (1998)",3.489474,3.493797,190.0,806.0
X-Men (2000),3.682310,3.851702,277.0,1234.0
You've Got Mail (1998),3.542424,3.275591,330.0,508.0
Young Frankenstein (1974),4.289963,4.239177,269.0,924.0


In [57]:
abs(ex3[('mean', 'F')] - ex3[('mean', 'M')]).nlargest(10)

영화제목
Dirty Dancing (1987)                      0.830782
Good, The Bad and The Ugly, The (1966)    0.726351
Dumb & Dumber (1994)                      0.638608
Evil Dead II (Dead By Dawn) (1987)        0.611985
Grease (1978)                             0.608224
Caddyshack (1980)                         0.573602
Animal House (1978)                       0.538286
Exorcist, The (1973)                      0.529605
Rocky Horror Picture Show, The (1975)     0.512885
Big Trouble in Little China (1986)        0.497078
dtype: float64

In [58]:
ex3['남녀차이'] = abs(ex3[('mean', 'F')] - ex3[('mean', 'M')])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ex3['남녀차이'] = abs(ex3[('mean', 'F')] - ex3[('mean', 'M')])


In [59]:
ex3.nlargest(10, '남녀차이')

Unnamed: 0_level_0,mean,mean,count,count,남녀차이
성별,F,M,F,M,Unnamed: 5_level_1
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dirty Dancing (1987),3.790378,2.959596,291.0,396.0,0.830782
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,99.0,723.0,0.726351
Dumb & Dumber (1994),2.697987,3.336595,149.0,511.0,0.638608
Evil Dead II (Dead By Dawn) (1987),3.297297,3.909283,74.0,474.0,0.611985
Grease (1978),3.975265,3.367041,283.0,534.0,0.608224
Caddyshack (1980),3.396135,3.969737,207.0,760.0,0.573602
Animal House (1978),3.628906,4.167192,256.0,951.0,0.538286
"Exorcist, The (1973)",3.537634,4.067239,186.0,699.0,0.529605
"Rocky Horror Picture Show, The (1975)",3.673016,3.160131,315.0,918.0,0.512885
Big Trouble in Little China (1986),2.987952,3.48503,83.0,501.0,0.497078


In [60]:
data3 = data.pivot_table(index = '영화제목', aggfunc = 'count', values = '평점')

In [61]:
data3

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
"$1,000,000 Duck (1971)",37
'Night Mother (1986),70
'Til There Was You (1997),52
"'burbs, The (1989)",303
...And Justice for All (1979),199
...,...
"Zed & Two Noughts, A (1985)",29
Zero Effect (1998),301
Zero Kelvin (Kjærlighetens kjøtere) (1995),2
Zeus and Roxanne (1997),23


In [62]:
data3 = data3[data3.평점 >= 500]

In [63]:
data3

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
10 Things I Hate About You (1999),700
101 Dalmatians (1961),565
12 Angry Men (1957),616
"13th Warrior, The (1999)",750
"20,000 Leagues Under the Sea (1954)",575
...,...
"X-Files: Fight the Future, The (1998)",996
X-Men (2000),1511
You've Got Mail (1998),838
Young Frankenstein (1974),1193


In [64]:
data4 = data[data.영화제목.isin(data3.index)]

In [65]:
data4

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
905295,5990,F,25,20,90046,1228,4,956868450,Raging Bull (1980),Drama
905296,6002,M,50,0,43231,1228,4,960006075,Raging Bull (1980),Drama
905297,6016,M,45,1,37209,1228,4,956957464,Raging Bull (1980),Drama
905298,6036,F,25,15,32603,1228,5,956711048,Raging Bull (1980),Drama


In [66]:
data4[data4.성별 == 'F'].pivot_table(index = '영화제목', aggfunc = 'mean', values = '평점' )

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
10 Things I Hate About You (1999),3.646552
101 Dalmatians (1961),3.791444
12 Angry Men (1957),4.184397
"13th Warrior, The (1999)",3.112000
"20,000 Leagues Under the Sea (1954)",3.670103
...,...
"X-Files: Fight the Future, The (1998)",3.489474
X-Men (2000),3.682310
You've Got Mail (1998),3.542424
Young Frankenstein (1974),4.289963


In [67]:
data4[data4.성별 == 'M'].pivot_table(index = '영화제목', aggfunc = 'mean', values = '평점' )

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
10 Things I Hate About You (1999),3.311966
101 Dalmatians (1961),3.500000
12 Angry Men (1957),4.328421
"13th Warrior, The (1999)",3.168000
"20,000 Leagues Under the Sea (1954)",3.709205
...,...
"X-Files: Fight the Future, The (1998)",3.493797
X-Men (2000),3.851702
You've Got Mail (1998),3.275591
Young Frankenstein (1974),4.239177


In [68]:
data5= data4[data4.성별 == 'F'].pivot_table(index = '영화제목', aggfunc = 'mean', values = '평점' ) - data4[data4.성별 == 'M'].pivot_table(index = '영화제목', aggfunc = 'mean', values = '평점' )

In [69]:
data5 = data5.abs()

In [70]:
data5.columns = ['평점차이']

In [71]:
data5.sort_values(by = '평점차이', ascending = False).head(10)

Unnamed: 0_level_0,평점차이
영화제목,Unnamed: 1_level_1
Dirty Dancing (1987),0.830782
"Good, The Bad and The Ugly, The (1966)",0.726351
Dumb & Dumber (1994),0.638608
Evil Dead II (Dead By Dawn) (1987),0.611985
Grease (1978),0.608224
Caddyshack (1980),0.573602
Animal House (1978),0.538286
"Exorcist, The (1973)",0.529605
"Rocky Horror Picture Show, The (1975)",0.512885
Big Trouble in Little China (1986),0.497078


## [실습 #4] 연령대 별로 영화 평점 분석하기
연령대(10대 미만, 10대, 20대, ...50대) 컬럼을 추가한 후, 영화별 연령대별 영화평점 구하기

In [72]:
data

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


In [73]:
# 함수 정의
def generate_ages(X):
    if X < 10:
            return '10대 미만'
    elif X < 20:
            return '10대'
    elif X < 30:
            return '20대'
    elif X < 40:    
            return '30대'
    elif X < 50:  
            return '40대'
    else:
            return '50대 이상'

In [74]:
data['연령대'] = data.연령.apply(generate_ages)

In [75]:
data

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르,연령대
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,10대 미만
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama,50대 이상
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama,20대
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama,20대
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama,50대 이상
...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,M,18,17,47901,2198,5,958846401,Modulations (1998),Documentary,10대
1000205,5675,M,35,14,30030,2703,3,976029116,Broken Vessels (1998),Drama,30대
1000206,5780,M,18,17,92886,2845,1,958153068,White Boys (1999),Drama,10대
1000207,5851,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western,10대


In [76]:
ex4 = data.pivot_table(index = '영화제목', columns = '연령대',
                aggfunc = 'mean', values = '평점')

In [77]:
ex4

연령대,10대,10대 미만,20대,30대,40대,50대 이상
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"$1,000,000 Duck (1971)",3.000000,,3.090909,3.133333,2.000000,2.750000
'Night Mother (1986),4.666667,2.000000,3.423077,2.904762,3.833333,3.750000
'Til There Was You (1997),2.500000,3.500000,2.666667,2.900000,2.333333,2.600000
"'burbs, The (1989)",3.244444,4.500000,2.652174,2.818182,2.545455,3.100000
...And Justice for All (1979),3.428571,3.000000,3.724138,3.657143,4.100000,3.674419
...,...,...,...,...,...,...
"Zed & Two Noughts, A (1985)",3.000000,1.000000,3.375000,3.777778,4.000000,3.000000
Zero Effect (1998),3.883333,4.125000,3.715278,3.608696,3.764706,3.769231
Zero Kelvin (Kjærlighetens kjøtere) (1995),,,,3.500000,,
Zeus and Roxanne (1997),2.500000,1.500000,2.833333,3.500000,1.000000,
