In [3]:
import pandas as pd
import os

In [4]:
# Directory holding users.dat, ratings.dat and movies.dat. It is defined once
# so that pointing the notebook at another copy of the data is a one-line
# change; expanduser() allows a '~'-prefixed location to be used here.
DATA_DIR = os.path.expanduser('d:/github/jupyter/python_for_da/data/171005_movies')

# Paths are built with '/' so the resulting strings match the original ones.
upath = DATA_DIR + '/users.dat'
rpath = DATA_DIR + '/ratings.dat'
mpath = DATA_DIR + '/movies.dat'

In [5]:
# Column labels supplied as `names=` when the header-less .dat files are read.
unames = [
    'user_id',
    'gender',
    'age',
    'occupation',
    'zip',
]
rnames = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]
mnames = [
    'movie_id',
    'title',
    'genres',
]

### 판다스 데이터 프레임 사용법

In [15]:
# Options common to all three reads: '::' as the field separator, no header
# row in the file, and the Python parser engine requested explicitly.
dat_read_options = dict(sep='::', header=None, engine='python')

users = pd.read_csv(upath, names=unames, **dat_read_options)
ratings = pd.read_csv(rpath, names=rnames, **dat_read_options)
movies = pd.read_csv(mpath, names=mnames, **dat_read_options)

In [18]:
# Join ratings with users, then the result with movies. No join keys are
# passed, so each merge is performed on the column names the two frames share.
data = ratings.merge(users).merge(movies)

### .loc 사용법 (인덱스 라벨로 행 선택)

In [25]:
# Label-based lookup: select the row whose index label is 0.
data.loc[0]

user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

In [28]:
# Show only the first row, keeping the DataFrame (tabular) layout.
data.head(1)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama


### Pivot table, 원하는대로 짜깁기 (R의 apply와 유사한 기능)

In [31]:
# Average rating for every title (rows), broken out by gender (columns).
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)

In [74]:
# Preview the first five titles.
mean_ratings.head()

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"'burbs, The (1989)",2.793478,2.962085,0.168607
10 Things I Hate About You (1999),3.646552,3.311966,-0.334586
101 Dalmatians (1961),3.791444,3.5,-0.291444
101 Dalmatians (1996),3.24,2.911215,-0.328785
12 Angry Men (1957),4.184397,4.328421,0.144024


### .groupby(열이름) .size() 열이름 별로 수량 파악

In [36]:
# Number of rows in `data` for each title.
ratings_by_title = (
    data
    .groupby('title')
    .size()
)
ratings_by_title.head()

title
$1,000,000 Duck (1971)            37
'Night Mother (1986)              70
'Til There Was You (1997)         52
'burbs, The (1989)               303
...And Justice for All (1979)    199
dtype: int64

### .index[조건] (R에서 조건부 인덱싱할 때와 동일)

In [38]:
# Boolean mask marking titles that have 250 or more rows in `data`;
# indexing the title Index with it keeps just those titles.
has_enough_ratings = ratings_by_title >= 250
active_titles = ratings_by_title.index[has_enough_ratings]
active_titles

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)

### .loc[리스트] 리스트와 동일한 이름을 가진 자료만 return

In [75]:
# Restrict the per-gender means to the rows labelled by `active_titles`,
# keeping every column.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings.head()

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"'burbs, The (1989)",2.793478,2.962085,0.168607
10 Things I Hate About You (1999),3.646552,3.311966,-0.334586
101 Dalmatians (1961),3.791444,3.5,-0.291444
101 Dalmatians (1996),3.24,2.911215,-0.328785
12 Angry Men (1957),4.184397,4.328421,0.144024


### .sort_values(by = '', ascending = True) by 기준으로 정렬

In [76]:
# Order titles by the 'F' column, highest mean first, and preview the top five.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head()

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Close Shave, A (1995)",4.644444,4.473795,-0.17065
"Wrong Trousers, The (1993)",4.588235,4.478261,-0.109974
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589,-0.10806
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075,-0.178032
Schindler's List (1993),4.562602,4.491415,-0.071187


In [77]:
# 'diff' is the 'M' mean minus the 'F' mean, so the most negative values
# (shown first after an ascending sort) are titles rated higher in column 'F'.
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
sorted_by_diff = mean_ratings.sort_values('diff')
sorted_by_diff.head()

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,-0.830782
Jumpin' Jack Flash (1986),3.254717,2.578358,-0.676359
Grease (1978),3.975265,3.367041,-0.608224
Little Women (1994),3.870588,3.321739,-0.548849
Steel Magnolias (1989),3.901734,3.365957,-0.535777


In [63]:
# Standard deviation of the 'rating' column within each title.
ratings_std_by_title = data['rating'].groupby(data['title']).std()
ratings_std_by_title.head(10)

title
$1,000,000 Duck (1971)               1.092563
'Night Mother (1986)                 1.118636
'Til There Was You (1997)            1.020159
'burbs, The (1989)                   1.107760
...And Justice for All (1979)        0.878110
1-900 (1994)                         0.707107
10 Things I Hate About You (1999)    0.989815
101 Dalmatians (1961)                0.982103
101 Dalmatians (1996)                1.098717
12 Angry Men (1957)                  0.812731
Name: rating, dtype: float64

In [64]:
# Keep only the entries labelled by `active_titles`, then preview ten of them.
ratings_std_by_title = ratings_std_by_title.loc[active_titles]
ratings_std_by_title.head(10)

title
'burbs, The (1989)                     1.107760
10 Things I Hate About You (1999)      0.989815
101 Dalmatians (1961)                  0.982103
101 Dalmatians (1996)                  1.098717
12 Angry Men (1957)                    0.812731
13th Warrior, The (1999)               1.140421
2 Days in the Valley (1996)            0.921592
20,000 Leagues Under the Sea (1954)    0.869685
2001: A Space Odyssey (1968)           1.042504
2010 (1984)                            0.946618
Name: rating, dtype: float64

In [66]:
# Display the per-title standard deviations from largest to smallest.
# The sorted result is only shown, not assigned to a name.
ratings_std_by_title.sort_values(ascending=False)

title
Dumb & Dumber (1994)                                                   1.321333
Blair Witch Project, The (1999)                                        1.316368
Natural Born Killers (1994)                                            1.307198
Tank Girl (1995)                                                       1.277695
Rocky Horror Picture Show, The (1975)                                  1.260177
Eyes Wide Shut (1999)                                                  1.259624
Evita (1996)                                                           1.253631
Billy Madison (1995)                                                   1.249970
Fear and Loathing in Las Vegas (1998)                                  1.246408
Bicentennial Man (1999)                                                1.245533
Hellraiser (1987)                                                      1.243046
Babe: Pig in the City (1998)                                           1.239379
Wes Craven's New Nightmare (1994) 