In [2]:
import pandas as pd
import numpy as np

# Series

In [3]:
s = pd.Series([9904312,3448737,2890451,2466052],
              index=['서울','부산','인천','대구'], dtype=np.int32)

In [5]:
s.values

array([9904312, 3448737, 2890451, 2466052], dtype=int32)

In [6]:
s.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

In [8]:
s.name = '인구'
s.index.name = '도시'
s

도시
서울    9904312
부산    3448737
인천    2890451
대구    2466052
Name: 인구, dtype: int32

In [9]:
s / 1000000

도시
서울    9.904312
부산    3.448737
인천    2.890451
대구    2.466052
Name: 인구, dtype: float64

시리즈 인덱싱

In [None]:
s[1], s['부산'], s.부산

In [12]:
s[[0,3,1]]

도시
서울    9904312
대구    2466052
부산    3448737
Name: 인구, dtype: int32

In [13]:
# slicing
s[1:3]

도시
부산    3448737
인천    2890451
Name: 인구, dtype: int32

In [14]:
s['부산':'대구']

도시
부산    3448737
인천    2890451
대구    2466052
Name: 인구, dtype: int32

In [15]:
# 필터링
s[(2500000 < s) & (s < 3000000)] #  100 < s < 200 는 안됨

도시
인천    2890451
Name: 인구, dtype: int32

시리즈와 딕셔너리

In [17]:
'서울' in s, '대전' in s

(True, False)

In [None]:
s2 = pd.Series({"서울":9631482, "부산":3393191,
                "인천":2632035, "대전":1490158})
s2

In [20]:
# 인덱스 기반 연산
s - s2

대구         NaN
대전         NaN
부산     55546.0
서울    272830.0
인천    258416.0
dtype: float64

In [22]:
s.values - s2.values

array([272830,  55546, 258416, 975894])

In [23]:
rs = s - s2

In [25]:
rs.notnull()

대구    False
대전    False
부산     True
서울     True
인천     True
dtype: bool

In [27]:
rs = rs[rs.notnull()]
rs

부산     55546.0
서울    272830.0
인천    258416.0
dtype: float64

In [29]:
rs['부산'] = 1
rs

부산         1.0
서울    272830.0
인천    258416.0
dtype: float64

In [31]:
del rs['부산']

In [32]:
rs

서울    272830.0
인천    258416.0
dtype: float64

DataFame

In [36]:
df = pd.read_csv('pandas_review.csv')
df

Unnamed: 0.1,Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
0,서울,수도권,9904312,9631482,9762546,9853972,0.0283
1,부산,경상권,3448737,3393191,3512547,3655437,0.0163
2,인천,수도권,2890451,2632035,2517680,2466338,0.0982
3,대구,경상권,2466052,2431774,2456016,2473990,0.0141


In [46]:
df.set_index('Unnamed: 0', inplace=True)

In [47]:
df.지역.values

array(['수도권', '경상권', '수도권', '경상권'], dtype=object)

In [48]:
df['2015'].values

array([9904312, 3448737, 2890451, 2466052])

In [49]:
df.columns

Index(['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율'], dtype='object', name='특성')

In [50]:
df.index.name = '도시'
df.columns.name = '특성'
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


In [51]:
df.T

도시,서울,부산,인천,대구
특성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2015,9904312,3448737,2890451,2466052
2010,9631482,3393191,2632035,2431774
2005,9762546,3512547,2517680,2456016
2000,9853972,3655437,2466338,2473990
2010-2015 증가율,0.0283,0.0163,0.0982,0.0141


열 데이터의 갱신, 추가, 삭제

In [52]:
df['2010-2015 증가율'] = df['2010-2015 증가율'] * 100

In [None]:
df['2020'] = [1,2,3,4]
df

In [56]:
# 컬럼삭제
del df['2020']
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83
부산,경상권,3448737,3393191,3512547,3655437,1.63
인천,수도권,2890451,2632035,2517680,2466338,9.82
대구,경상권,2466052,2431774,2456016,2473990,1.41


In [57]:
df['지역']

도시
서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

In [59]:
# 2015년추가하여 데이터프레임
df[['지역', '2015']]

특성,지역,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,수도권,9904312
부산,경상권,3448737
인천,수도권,2890451
대구,경상권,2466052


In [61]:
df[['지역']] # 시리즈도 데이터프레임으로 변경

특성,지역
도시,Unnamed: 1_level_1
서울,수도권
부산,경상권
인천,수도권
대구,경상권


# 행인덱싱

In [65]:
df[:'서울']

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83


In [67]:
df[1:2]

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
부산,경상권,3448737,3393191,3512547,3655437,1.63


In [69]:
df["부산":"인천"]

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
부산,경상권,3448737,3393191,3512547,3655437,1.63
인천,수도권,2890451,2632035,2517680,2466338,9.82


# 개별 데이터 인덱싱

In [71]:
df[:'서울']['2015']

도시
서울    9904312
Name: 2015, dtype: int64

In [72]:
df['2015']['서울']

9904312

In [75]:
df['2015'][0]

9904312

# 판다스 데이터 입출력

In [76]:
# 매직명령어
%%writefile sample1.csv 
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample1.csv


In [None]:
pd.read_csv('sample1.csv')

In [None]:
%%writefile sample2.csv 
1, 1.11, one
2, 2.22, two
3, 3.33, three

In [79]:
pd.read_csv('sample2.csv')

Unnamed: 0,1,1.11,one
0,2,2.22,two
1,3,3.33,three


In [80]:
pd.read_csv('sample2.csv', header=None)

Unnamed: 0,0,1,2
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [81]:
pd.read_csv('sample2.csv', names=['c1', 'c2', 'c3'])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [83]:
pd.read_csv('sample2.csv', names=['c1', 'c2', 'c3'], index_col='c1')

Unnamed: 0_level_0,c2,c3
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.11,one
2,2.22,two
3,3.33,three


In [84]:
%%writefile sample3.txt 
파일 제목 : sample3.txt
데이터 포멧의 설명 :
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample3.txt


In [86]:
pd.read_csv('sample3.txt', skiprows=[0, 1])

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [93]:
%%writefile sample4.csv
c1| c2| c3
1.0|1.11| one
2.0|| two
|3.33| three

Overwriting sample4.csv


In [94]:
!cat sample4.csv

c1| c2| c3
1.0|1.11| one
2.0|| two
|3.33| three


In [95]:
pd.read_csv('sample4.csv', sep='|')

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,,two
2,,3.33,three


# 고급인덱싱

In [96]:
df = pd.DataFrame(np.arange(10, 22).reshape(3,4),
                  index = ['x', 'y', 'z'],
                  columns = ["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
x,10,11,12,13
y,14,15,16,17
z,18,19,20,21


In [98]:
df.loc['x']   # df[:'x']

A    10
B    11
C    12
D    13
Name: x, dtype: int64

In [101]:
df.loc['x':'y']

Unnamed: 0,A,B,C,D
x,10,11,12,13
y,14,15,16,17


In [103]:
df.loc[['x','z']]

Unnamed: 0,A,B,C,D
x,10,11,12,13
z,18,19,20,21


In [105]:
df['A'] > 15

x    False
y    False
z     True
Name: A, dtype: bool

In [107]:
df.loc[df.A > 15]

Unnamed: 0,A,B,C,D
z,18,19,20,21


인덱서 2개를 받는 경우

In [109]:
df.loc['x','A']

10

In [111]:
df.loc['y':, 'B']

y    15
z    19
Name: B, dtype: int64

In [113]:
df.loc[df.A >10, 'A']

y    14
z    18
Name: A, dtype: int64

iloc 인덱서

In [114]:
df.iloc[0,0]

10

In [116]:
df.iloc[:2, 1:]

Unnamed: 0,B,C,D
x,11,12,13
y,15,16,17


In [118]:
df

Unnamed: 0,A,B,C,D
x,10,11,12,13
y,14,15,16,17
z,18,19,20,21


In [117]:
df.iloc[-1]

A    18
B    19
C    20
D    21
Name: z, dtype: int64

# 데이터 조작

데이터 갯수 세기

In [119]:
s = pd.Series(range(10))

In [121]:
s[3] = np.nan

In [123]:
s.count()

9

In [127]:
df = pd.DataFrame(np.arange(16).reshape(4,4), dtype=float)
df.iloc[2,3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,
3,12.0,13.0,14.0,15.0


In [128]:
df.count()

0    4
1    4
2    4
3    3
dtype: int64

In [None]:
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head(2)

In [130]:
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

카테고리값 세기

In [132]:
titanic.sex.value_counts()

male      577
female    314
Name: sex, dtype: int64

In [133]:
titanic.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

정렬

In [None]:
np.random.seed(2022)
s2 = pd.Series(np.random.randint(6, size=100))
s2

In [136]:
s2.value_counts()

5    22
0    20
1    17
2    16
4    13
3    12
dtype: int64

In [138]:
s2.value_counts().sort_values()

3    12
4    13
2    16
1    17
0    20
5    22
dtype: int64

In [139]:
s2.value_counts().sort_values(ascending=False)

5    22
0    20
1    17
2    16
4    13
3    12
dtype: int64

In [141]:
titanic.sort_values(by='fare', ascending=False).head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False


In [142]:
titanic.sort_values(by=['fare','age']).head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True


# 행/열 합계

In [144]:
np.random.seed(2021)
df2 = pd.DataFrame(np.random.randint(10, size=(4,8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,4,5,9,0,6,5,8,6
1,6,6,6,1,5,7,1,1
2,5,2,0,3,1,0,2,6
3,4,8,5,1,6,7,5,6


In [147]:
df2.sum()

0    19
1    21
2    20
3     5
4    18
5    19
6    16
7    19
dtype: int64

In [146]:
df2.sum(axis=1)

0    43
1    33
2    19
3    42
dtype: int64

In [149]:
df2['RowSum'] = df2.sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,4,5,9,0,6,5,8,6,49.0
1,6,6,6,1,5,7,1,1,55.0
2,5,2,0,3,1,0,2,6,46.0
3,4,8,5,1,6,7,5,6,96.0


In [150]:
df2.loc['colSum'] = df2.sum(axis=0)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,4.0,5.0,9.0,0.0,6.0,5.0,8.0,6.0,49.0
1,6.0,6.0,6.0,1.0,5.0,7.0,1.0,1.0,55.0
2,5.0,2.0,0.0,3.0,1.0,0.0,2.0,6.0,46.0
3,4.0,8.0,5.0,1.0,6.0,7.0,5.0,6.0,96.0
colSum,19.0,21.0,20.0,5.0,18.0,19.0,16.0,19.0,246.0


apply 변환

In [151]:
df3 = pd.DataFrame({
    'A' : [1,3,4,3,4],
    'B' : [2,3,1,2,3],
    'C' : [1,5,2,4,4]})
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [152]:
df3.apply(lambda x : x.max() - x.min())

A    3
B    2
C    4
dtype: int64

In [153]:
df3.apply(lambda x : x.max() - x.min(), axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [154]:
titanic['성년'] = titanic.apply(lambda x: "성년" if x.age >= 20 else '미성년', axis=1)

In [None]:
titanic.head()

fillna() method

In [158]:
df3.apply(pd.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [161]:
df3.apply(pd.value_counts).fillna(0.0)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


astype() method

In [163]:
df3.apply(pd.value_counts).fillna(0.0).astype(int)

Unnamed: 0,A,B,C
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


cut , qcut 실수값을 카테고리값으로 변환

In [164]:
ages = [0, 2, 10, 21, 23, 37, 31, 61, 20, 41, 32, 101]

In [165]:
bins = [1, 20, 30, 50, 70, 100]
labels = ['미성년', '청년', '중년', '장년', '노년']
cate = pd.cut(ages, bins, labels=labels)

In [167]:
cate.codes

array([-1,  0,  0,  1,  1,  2,  2,  3,  0,  2,  2, -1], dtype=int8)

In [168]:
# qcut
np.random.seed(2022)
data = np.random.randn(1000)
cate = pd.qcut(data, 4, labels=["Q1", "Q2", "Q3", "Q4"])
cate

['Q2', 'Q2', 'Q2', 'Q4', 'Q3', ..., 'Q3', 'Q3', 'Q3', 'Q2', 'Q4']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [169]:
pd.value_counts(cate)

Q1    250
Q2    250
Q3    250
Q4    250
dtype: int64

# 데이터프레임 합성

merge, join, concatenate

In [None]:
df1 = pd.DataFrame({
    "고객번호" : [1001,1002, 1003, 1004, 1005, 1006, 1007 ],
    "이름": ['둘리', '도우너', '또치', '길동', '희동', '마이콜', '영희']
})

In [None]:
df2 = pd.DataFrame({
    "고객번호" : [1001, 1001, 1005, 1006, 1008, 1001],
    "금액": [10000, 20000, 15000, 5000, 100000, 3000]
})
df2

In [175]:
pd.merge(df1, df2) # how = 'inner'

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000
1,1001,둘리,20000
2,1001,둘리,3000
3,1005,희동,15000
4,1006,마이콜,5000


In [176]:
pd.merge(df1, df2, how='left')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000.0
1,1001,둘리,20000.0
2,1001,둘리,3000.0
3,1002,도우너,
4,1003,또치,
5,1004,길동,
6,1005,희동,15000.0
7,1006,마이콜,5000.0
8,1007,영희,


In [177]:
#full outer join
pd.merge(df1, df2, how='outer')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000.0
1,1001,둘리,20000.0
2,1001,둘리,3000.0
3,1002,도우너,
4,1003,또치,
5,1004,길동,
6,1005,희동,15000.0
7,1006,마이콜,5000.0
8,1007,영희,
9,1008,,100000.0


동일한 컬럼명이 여러개인 경우

In [179]:
df1 = pd.DataFrame({
    '고객명' : ['춘향',  '춘향', '몽룡'],
    '날짜' : ['2018-01-01', '2018-01-02', '2018-01-01'],
    '데이터' : ['20000','30000', '100000']})

In [180]:
df2 = pd.DataFrame({
    '고객명' : ['춘향',  '몽룡'],
    '데이터' : ['여자', '남자']})

In [181]:
pd.merge(df1, df2, on='고객명')

Unnamed: 0,고객명,날짜,데이터_x,데이터_y
0,춘향,2018-01-01,20000,여자
1,춘향,2018-01-02,30000,여자
2,몽룡,2018-01-01,100000,남자


동일한 컬럼명이 없는 경우

In [183]:
df1 = pd.DataFrame(
    {'이름' : ['영희', '철수', '철수'],
     "성적" : [1, 2, 3]})

In [182]:
df2 = pd.DataFrame(
    {'성명' : ['영희', '영희', '철수'],
     "성적2" : [4, 5, 6]})

In [184]:
pd.merge(df1, df2, left_on='이름', right_on='성명')

Unnamed: 0,이름,성적,성명,성적2
0,영희,1,영희,4
1,영희,1,영희,5
2,철수,2,철수,6
3,철수,3,철수,6


join

In [190]:
df1 = pd.DataFrame({
    "고객번호" : [1001,1002, 1003, 1004, 1005, 1006, 1007 ],
    "이름": ['둘리', '도우너', '또치', '길동', '희동', '마이콜', '영희']})
df1.set_index('고객번호', inplace=True)

In [191]:
df2 = pd.DataFrame({
    "고객번호" : [1001, 1001, 1005, 1006, 1008, 1001],
    "금액": [10000, 20000, 15000, 5000, 100000, 3000]})
df2.set_index('고객번호', inplace=True)

In [192]:
df1.join(df2, how='inner')

Unnamed: 0_level_0,이름,금액
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,둘리,10000
1001,둘리,20000
1001,둘리,3000
1005,희동,15000
1006,마이콜,5000


In [194]:
df1.join(df2) # left outer join

Unnamed: 0_level_0,이름,금액
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,둘리,10000.0
1001,둘리,20000.0
1001,둘리,3000.0
1002,도우너,
1003,또치,
1004,길동,
1005,희동,15000.0
1006,마이콜,5000.0
1007,영희,


concat 함수

In [196]:
s1 = pd.Series([0,1], index=["A", "B"])
s2 = pd.Series([2,3,4], index=["A", "B", "C"])

In [200]:
pd.concat([s1, s2])

A    0
B    1
A    2
B    3
C    4
dtype: int64

In [204]:
# 데이터 프레임 concate
df1 = pd.DataFrame(np.arange(6).reshape(3,2),
                   index=['a', 'b', 'c'],
                   columns=['데이터1', '데이터2'])

In [205]:
df2 = pd.DataFrame(5 + np.arange(4).reshape(2,2),
                   index=['a', 'c'],
                   columns=['데이터3', '데이터4'])

In [210]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,데이터1,데이터2,데이터3,데이터4
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


# 그룹분석
groupby 메소드

In [211]:
import seaborn as sns
iris = sns.load_dataset('iris')

In [214]:
iris.shape

(150, 5)

In [None]:
iris.groupby(iris.species).mean()

In [None]:
iris.groupby(iris.species).agg(['mean', 'std'])

내가 만든 함수 적용

In [216]:
def peak_to_peak(x):
    return x.max() / x.min()

In [None]:
iris.groupby(iris.species).agg(peak_to_peak)

In [None]:
iris.groupby(iris.species).agg(lambda x : x.max() / x.min())

In [None]:
iris.groupby(iris.species).apply(lambda x : x.max() / x.min())

In [None]:
mpg = sns.load_dataset('mpg')
mpg

In [222]:
mpg['manufacture'] = mpg.name.apply(lambda x : x.split()[0])
mpg['model'] = mpg.name.apply(lambda x : " ".join(x.split()[1:]))

In [None]:
mpg.head(2)

In [225]:
# iris데이터로 petal_length,  대, 중, 소로 분리 
def q3cut(s):
    return pd.qcut(s, 3, labels=['대', '중', '소']).astype(str)
    

In [None]:
iris['petal_length_class'] = iris.groupby(iris.species)['petal_length'].transform(q3cut)
iris[['petal_length', 'petal_length_class']] 

# pivot_table

In [228]:
tips = sns.load_dataset('tips')
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [229]:
tips['tip_pct'] = np.round(tips.tip / tips.total_bill * 100 ,2)

In [230]:
tips.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
239,29.03,5.92,Male,No,Sat,Dinner,3,20.39
240,27.18,2.0,Female,Yes,Sat,Dinner,2,7.36
241,22.67,2.0,Male,Yes,Sat,Dinner,2,8.82
242,17.82,1.75,Male,No,Sat,Dinner,2,9.82
243,18.78,3.0,Female,No,Thur,Dinner,2,15.97


In [None]:
tips.describe()

In [232]:
tips.groupby('sex').count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size,tip_pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Male,157,157,157,157,157,157,157
Female,87,87,87,87,87,87,87


In [233]:
tips.groupby('sex').size()

sex
Male      157
Female     87
dtype: int64

In [234]:
# 성별에 따른 평균 팁 비율
tips.groupby('sex')[['tip_pct']].mean()

Unnamed: 0_level_0,tip_pct
sex,Unnamed: 1_level_1
Male,15.764713
Female,16.648276


In [235]:
# 흡연 여부에 따른 평균 팁 비율
tips.groupby('smoker')[['tip_pct']].mean()

Unnamed: 0_level_0,tip_pct
smoker,Unnamed: 1_level_1
Yes,16.31914
No,15.932318


In [236]:
tips.groupby(['sex','smoker'])[['tip_pct']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct
sex,smoker,Unnamed: 2_level_1
Male,Yes,15.276667
Male,No,16.066598
Female,Yes,18.214545
Female,No,15.691111


In [238]:
# tips.pivot_table('tip_pct', ['sex','smoker'])
tips.pivot_table('tip_pct', 'sex','smoker')
# tips.groupby(['sex','smoker'])[['tip_pct']].mean()

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,15.276667,16.066598
Female,18.214545,15.691111
