# Pandas 소개

## 모듈 로딩

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

## 시리즈 클래스: Series
- Series = value + index
- numpy의 1차원 배열의 의미를 가진다.

### 시리즈 생성

In [2]:
# 2015년도에 각 도시의 인구 데이터를 시리즈 객체로 생성
s = pd.Series([9904312, 3448737, 2890451, 2466052],
             index=['서울', '부산', '인천', '대구'])
s

서울    9904312
부산    3448737
인천    2890451
대구    2466052
dtype: int64

In [3]:
# 인덱스를 지정하지 않으면 정수인덱스가 생성된다.
# 0부터 시작되는 정수 인덱스가 부여된다.
pd.Series(range(10, 14))

0    10
1    11
2    12
3    13
dtype: int64

In [4]:
# 속성: index, values, dtype
s.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

In [5]:
# 속성에 이름을 부여하는 name 속성
s.name = '인구'
s.index.name='도시'
s

도시
서울    9904312
부산    3448737
인천    2890451
대구    2466052
Name: 인구, dtype: int64

### 시리즈 연산
- 넘파이처럼 벡터화 연산을 수행한다.

In [6]:
s / 1000000

도시
서울    9.904312
부산    3.448737
인천    2.890451
대구    2.466052
Name: 인구, dtype: float64

### 시리즈 인덱싱
- 넘파이 배열의 인덱싱과 슬라이싱이 동일하다.

In [7]:
s[1], s['부산']

(3448737, 3448737)

In [8]:
s[3], s['대구']

(2466052, 2466052)

In [9]:
s[[0, 3, 1]]

도시
서울    9904312
대구    2466052
부산    3448737
Name: 인구, dtype: int64

In [10]:
s[['서울', '대구', '부산']]

도시
서울    9904312
대구    2466052
부산    3448737
Name: 인구, dtype: int64

In [11]:
# 조건식
s[s >= 3000000]

도시
서울    9904312
부산    3448737
Name: 인구, dtype: int64

In [12]:
# 인구가 250만 초과, 500만 미만
s[(s > 2500000) & (s < 5000000)]

도시
부산    3448737
인천    2890451
Name: 인구, dtype: int64

### 시리즈 슬라이싱

In [13]:
s[1:3]

도시
부산    3448737
인천    2890451
Name: 인구, dtype: int64

In [14]:
s['부산':'대구'] # 인덱스라벨링(문자) 되어있을 경우 마지막을 포함한다.

도시
부산    3448737
인천    2890451
대구    2466052
Name: 인구, dtype: int64

**라벨값이 영문 문자열일 경우 인덱스 라벨의 속성인 것처럼 점(.)을 이용하여 인덱스 값에 접근 가능**

In [15]:
s0 = pd.Series(np.arange(3), index=['a', 'b', 'c'])
s0

a    0
b    1
c    2
dtype: int32

In [16]:
s0[0], s0['a']

(0, 0)

In [17]:
s0.a

0

In [18]:
s0['c'], s0.c, s0[2]

(2, 2, 2)

**시리즈와 딕셔너리 자료형**
- 파이썬의 dictionary 자료형을 바로 판다스의 Series 객체로 변환이 가능
- 딕셔너리의 키는 시리즈의 index가 된다.
- 딕셔너리의 주요함수: get() == 사전['키], 사전.keys() => 시리즈객체명.index
- 사전명.values() => 시리즈객체명.values
- 사전명.items(): key와 value을 쌍으로 얻어오는 함수
- in 연산자와 not in 연산자

In [19]:
'서울' in s

True

In [20]:
'대전' in s

False

In [22]:
for key, value in s.items():
    print(key, value)

서울 9904312
부산 3448737
인천 2890451
대구 2466052


In [23]:
s2 = pd.Series({'서울':9631482, '부산':3393191, '인천':2632035, '대전':1490158})
s2

서울    9631482
부산    3393191
인천    2632035
대전    1490158
dtype: int64

In [24]:
ds = s - s2
ds

대구         NaN
대전         NaN
부산     55546.0
서울    272830.0
인천    258416.0
dtype: float64

- 대구와 대전의 경우는 두 시리즈에 공통의 인덱스로 존재하지 않기 때문에 연산을 수행할 수 없어 그 결과값으로 NaN(Not a Number) 값을 가진다.
- NaN값이 float 자료형에서만 적용되므로 연산의 결과가 모두 float형으로 바뀌었다.

In [25]:
# NaN이 아닌 값을 구하려면 notnull 메서드를 사용한다.
ds.notnull()

대구    False
대전    False
부산     True
서울     True
인천     True
dtype: bool

In [26]:
ds[ds.notnull()]

부산     55546.0
서울    272830.0
인천    258416.0
dtype: float64

In [27]:
# 2010년도 자료와 2015년도 자료를 이용하여 인구 증가율(%)을 계산하시오.
res = (s - s2) / s2 * 100
res = res[res.notnull()]
res

부산    1.636984
서울    2.832690
인천    9.818107
dtype: float64

### 데이터의 갱신, 추가, 삭제
- 인덱싱을 이용하면 딕셔너리 처럼 데이터를 갱신(update)하거나 추가(add)할 수 있다.

In [28]:
res['부산'] = 1.63
res

부산    1.630000
서울    2.832690
인천    9.818107
dtype: float64

In [29]:
res['대구'] = 1.41
res

부산    1.630000
서울    2.832690
인천    9.818107
대구    1.410000
dtype: float64

In [30]:
del res['서울']
res

부산    1.630000
인천    9.818107
대구    1.410000
dtype: float64

## 데이터프레임 클래스
- 시리즈는 1차원 배열과 같다.
- 데이터프레임은 2차원 배열(행렬, matrix)과 비슷하다.
- 행렬은 행 인덱스와 열 인덱스가 존재한다.

### 데이터프레임 생성
- 여러 개의 시리즈 객체가 묶어서 생성된다.
- 딕셔너리를 이용(키가 열 인덱스가 된다.)
- 속성: data, index, dtype, columns

In [31]:
data = {
    "2015": [9904312, 3448737, 2890451, 2466052],
    "2010": [9631482, 3393191, 2632035, 2431774],
    "2005": [9762546, 3512547, 2517680, 2456016],
    "2000": [9853972, 3655437, 2466338, 2473990],
    "지역": ["수도권", "경상권", "수도권", "경상권"],
    "2010-2015 증가율": [0.0283, 0.0163, 0.0982, 0.0141]
}
columns = ['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율']
index = ['서울', '부산', '인천', '대구']
df = pd.DataFrame(data, index=index, columns=columns)
df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


In [32]:
# 데이터 값에만 접근 values 속성
# 열 인덱스와 행 인덱스는 각각 columns, index 속성
df.values

array([['수도권', 9904312, 9631482, 9762546, 9853972, 0.0283],
       ['경상권', 3448737, 3393191, 3512547, 3655437, 0.0163],
       ['수도권', 2890451, 2632035, 2517680, 2466338, 0.0982],
       ['경상권', 2466052, 2431774, 2456016, 2473990, 0.0141]], dtype=object)

In [33]:
df.columns

Index(['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율'], dtype='object')

In [34]:
df.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

In [35]:
# 속성들에 이름을 부여할 때 name 속성
df.index.name='도시'
df.columns.name='특성'
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


In [37]:
# 전치(transpose)
df.T

도시,서울,부산,인천,대구
특성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2015,9904312,3448737,2890451,2466052
2010,9631482,3393191,2632035,2431774
2005,9762546,3512547,2517680,2456016
2000,9853972,3655437,2466338,2473990
2010-2015 증가율,0.0283,0.0163,0.0982,0.0141


### 열 데이터의 갱신, 추가, 삭제
- 열 하나가 시리즈 객체와 같다.

In [38]:
df['2010-2015 증가율'] = df['2010-2015 증가율'] * 100
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83
부산,경상권,3448737,3393191,3512547,3655437,1.63
인천,수도권,2890451,2632035,2517680,2466338,9.82
대구,경상권,2466052,2431774,2456016,2473990,1.41


In [40]:
# '2005-2010 증가율' 열 추가
df['2005-2010 증가율'] = (((df['2010'] - df['2005']) / df['2005']) *100).round(2)
df

특성,지역,2015,2010,2005,2000,2010-2015 증가율,2005-2010 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83,-1.34
부산,경상권,3448737,3393191,3512547,3655437,1.63,-3.4
인천,수도권,2890451,2632035,2517680,2466338,9.82,4.54
대구,경상권,2466052,2431774,2456016,2473990,1.41,-0.99


### 열 인덱싱
- 딕셔너리와 비슷
- 열이름이 딕셔너리의 키와 같다

In [41]:
# 데이터프레임에서 하나의 열만 추출하면 시리즈 객체로 반환된다.
df['지역']

도시
서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

In [42]:
# 여러 개의 열 인덱스로 추출하면 데이터프레임으로 반환된다
df[['2010', '2015']]

특성,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,9631482,9904312
부산,3393191,3448737
인천,2632035,2890451
대구,2431774,2466052


In [43]:
df[['지역']]

특성,지역
도시,Unnamed: 1_level_1
서울,수도권
부산,경상권
인천,수도권
대구,경상권


In [45]:
tb1 = pd.DataFrame({
    'weight':[80,50,69,34.5,45],
    'height':[190,170,178,146,150],
    'gender':['m', 'f', 'm', 'f', 'f']
})
tb1

Unnamed: 0,weight,height,gender
0,80.0,190,m
1,50.0,170,f
2,69.0,178,m
3,34.5,146,f
4,45.0,150,f


In [46]:
type(tb1)

pandas.core.frame.DataFrame

In [47]:
tb1[tb1['height'] >= 160]

Unnamed: 0,weight,height,gender
0,80.0,190,m
1,50.0,170,f
2,69.0,178,m


In [48]:
tb1[tb1.height >= 160]

Unnamed: 0,weight,height,gender
0,80.0,190,m
1,50.0,170,f
2,69.0,178,m


In [49]:
# 값을 대상으로 오름차순 정렬, ascending=True 생략된 개념
tb1.sort_values('height')

Unnamed: 0,weight,height,gender
3,34.5,146,f
4,45.0,150,f
1,50.0,170,f
2,69.0,178,m
0,80.0,190,m


In [50]:
tb1.sort_values('height', ascending=False)  # 내림차순

Unnamed: 0,weight,height,gender
0,80.0,190,m
2,69.0,178,m
1,50.0,170,f
4,45.0,150,f
3,34.5,146,f


In [51]:
tb1.keys

<bound method NDFrame.keys of    weight  height gender
0    80.0     190      m
1    50.0     170      f
2    69.0     178      m
3    34.5     146      f
4    45.0     150      f>

In [52]:
tb1.columns

Index(['weight', 'height', 'gender'], dtype='object')

In [53]:
tb1.index

RangeIndex(start=0, stop=5, step=1)

In [54]:
tb1.values

array([[80.0, 190, 'm'],
       [50.0, 170, 'f'],
       [69.0, 178, 'm'],
       [34.5, 146, 'f'],
       [45.0, 150, 'f']], dtype=object)

In [55]:
data={
    "names": ["김철수","이철호","김영희","박민수","송철호"],
    "year": [2014,2015,2016,2017,2018],
    "points": [1.5, 1.7, 3.6, 2.4, 2.9]
}
df2 = pd.DataFrame(data, 
                  columns=['year', 'names', 'points', 'penalty'],
                  index=['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,
two,2015,이철호,1.7,
three,2016,김영희,3.6,
four,2017,박민수,2.4,
five,2018,송철호,2.9,


In [56]:
# 결측값: Na, NaN
# 보간법: 대체법, fillna(값)
df3 = df2.fillna(0)
df3

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,0
two,2015,이철호,1.7,0
three,2016,김영희,3.6,0
four,2017,박민수,2.4,0
five,2018,송철호,2.9,0


In [57]:
# 숫자 데이터의 기초 통계량을 추출하는 함수
df3.describe()
# count, mean, std, min, max, 사분위수

Unnamed: 0,year,points,penalty
count,5.0,5.0,5.0
mean,2016.0,2.42,0.0
std,1.581139,0.864292,0.0
min,2014.0,1.5,0.0
25%,2015.0,1.7,0.0
50%,2016.0,2.4,0.0
75%,2017.0,2.9,0.0
max,2018.0,3.6,0.0


In [58]:
# 데이터프레임에 대한 정보 추출'
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, one to five
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   year     5 non-null      int64  
 1   names    5 non-null      object 
 2   points   5 non-null      float64
 3   penalty  5 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 200.0+ bytes


In [59]:
df3['penalty'] = [0.1,0.2,00.3,0.4,0.5]
df3

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,0.1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


In [60]:
df3['ages'] = np.arange(10, 15)
df3

Unnamed: 0,year,names,points,penalty,ages
one,2014,김철수,1.5,0.1,10
two,2015,이철호,1.7,0.2,11
three,2016,김영희,3.6,0.3,12
four,2017,박민수,2.4,0.4,13
five,2018,송철호,2.9,0.5,14


In [61]:
del df3['ages']

In [63]:
df3

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,0.1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


In [64]:
df3[0:3]

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,0.1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3


In [66]:
df3.loc['two']  # 라벨링 되어 있는 행 인덱스 추출을 위한 .loc

year       2015
names       이철호
points      1.7
penalty     0.2
Name: two, dtype: object

In [67]:
df3.loc['two':'four']

Unnamed: 0,year,names,points,penalty
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4


In [68]:
df3.loc['two' : 'four', 'points']

two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [69]:
# 전체에서 특정 필드(year, names) 추출
df3[['year', 'names']]

Unnamed: 0,year,names
one,2014,김철수
two,2015,이철호
three,2016,김영희
four,2017,박민수
five,2018,송철호


In [70]:
df3.loc[:, ['year', 'names']]

Unnamed: 0,year,names
one,2014,김철수
two,2015,이철호
three,2016,김영희
four,2017,박민수
five,2018,송철호


In [71]:
df3.loc['four', 'names']

'박민수'

In [72]:
df3.iloc[3]  # 숫자 인덱스로 추출할 때 .iloc

year       2017
names       박민수
points      2.4
penalty     0.4
Name: four, dtype: object

In [73]:
df3.iloc[3:5, 0:2]

Unnamed: 0,year,names
four,2017,박민수
five,2018,송철호


In [74]:
df3.iloc[1,1]

'이철호'

In [78]:
df3[df3['year'] >= 2015]

Unnamed: 0,year,names,points,penalty
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


In [79]:
df3.loc[df3['year'] >=2015, :]

Unnamed: 0,year,names,points,penalty
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


In [80]:
df3.loc[df3.year >= 2015, :]

Unnamed: 0,year,names,points,penalty
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


### 데이터프레임 조작

In [88]:
df = pd.DataFrame(np.random.randn(6,4))
df

Unnamed: 0,0,1,2,3
0,0.059631,-0.526357,2.544305,1.362406
1,-0.563498,-1.754153,0.345551,0.248332
2,0.029207,-0.528269,-0.574095,1.670678
3,0.33969,-0.267972,-0.818582,0.310584
4,0.970612,2.003373,0.560018,-0.544538
5,-0.365739,-0.581061,-1.687068,0.179679


In [89]:
df.columns = ['A', 'B', 'C', 'D']
df

Unnamed: 0,A,B,C,D
0,0.059631,-0.526357,2.544305,1.362406
1,-0.563498,-1.754153,0.345551,0.248332
2,0.029207,-0.528269,-0.574095,1.670678
3,0.33969,-0.267972,-0.818582,0.310584
4,0.970612,2.003373,0.560018,-0.544538
5,-0.365739,-0.581061,-1.687068,0.179679


In [90]:
df.index = pd.date_range('20230926', periods=6)
df

Unnamed: 0,A,B,C,D
2023-09-26,0.059631,-0.526357,2.544305,1.362406
2023-09-27,-0.563498,-1.754153,0.345551,0.248332
2023-09-28,0.029207,-0.528269,-0.574095,1.670678
2023-09-29,0.33969,-0.267972,-0.818582,0.310584
2023-09-30,0.970612,2.003373,0.560018,-0.544538
2023-10-01,-0.365739,-0.581061,-1.687068,0.179679


In [84]:
# 삭제: del, drop
# axis = 0: 행 / axis = 1: 열
df.drop('D', axis = 1) # 삭제된 결과가 저장되지 않고 보여지기만 함

Unnamed: 0,A,B,C
2023-09-26,1.380691,-0.670735,-0.384409
2023-09-27,0.621493,1.338711,0.531217
2023-09-28,1.453497,-0.024863,0.069244
2023-09-29,0.981592,-0.387896,-0.350485
2023-09-30,0.385764,-0.770088,-0.866062
2023-10-01,0.806124,-0.781633,-1.038547


In [86]:
df

Unnamed: 0,A,B,C,D
2023-09-26,1.380691,-0.670735,-0.384409,-0.288815
2023-09-27,0.621493,1.338711,0.531217,-1.74762
2023-09-28,1.453497,-0.024863,0.069244,2.758563
2023-09-29,0.981592,-0.387896,-0.350485,-0.691862
2023-09-30,0.385764,-0.770088,-0.866062,0.747255
2023-10-01,0.806124,-0.781633,-1.038547,-0.692377


In [87]:
del df['D']  # 바로 삭제되어 결과가 저장 됨
df

Unnamed: 0,A,B,C
2023-09-26,1.380691,-0.670735,-0.384409
2023-09-27,0.621493,1.338711,0.531217
2023-09-28,1.453497,-0.024863,0.069244
2023-09-29,0.981592,-0.387896,-0.350485
2023-09-30,0.385764,-0.770088,-0.866062
2023-10-01,0.806124,-0.781633,-1.038547


In [91]:
df

Unnamed: 0,A,B,C,D
2023-09-26,0.059631,-0.526357,2.544305,1.362406
2023-09-27,-0.563498,-1.754153,0.345551,0.248332
2023-09-28,0.029207,-0.528269,-0.574095,1.670678
2023-09-29,0.33969,-0.267972,-0.818582,0.310584
2023-09-30,0.970612,2.003373,0.560018,-0.544538
2023-10-01,-0.365739,-0.581061,-1.687068,0.179679


In [92]:
row = df  # 사본

In [93]:
row

Unnamed: 0,A,B,C,D
2023-09-26,0.059631,-0.526357,2.544305,1.362406
2023-09-27,-0.563498,-1.754153,0.345551,0.248332
2023-09-28,0.029207,-0.528269,-0.574095,1.670678
2023-09-29,0.33969,-0.267972,-0.818582,0.310584
2023-09-30,0.970612,2.003373,0.560018,-0.544538
2023-10-01,-0.365739,-0.581061,-1.687068,0.179679


In [94]:
row.drop(['B', 'D'], axis=1)

Unnamed: 0,A,C
2023-09-26,0.059631,2.544305
2023-09-27,-0.563498,0.345551
2023-09-28,0.029207,-0.574095
2023-09-29,0.33969,-0.818582
2023-09-30,0.970612,0.560018
2023-10-01,-0.365739,-1.687068


In [95]:
row

Unnamed: 0,A,B,C,D
2023-09-26,0.059631,-0.526357,2.544305,1.362406
2023-09-27,-0.563498,-1.754153,0.345551,0.248332
2023-09-28,0.029207,-0.528269,-0.574095,1.670678
2023-09-29,0.33969,-0.267972,-0.818582,0.310584
2023-09-30,0.970612,2.003373,0.560018,-0.544538
2023-10-01,-0.365739,-0.581061,-1.687068,0.179679


## 데이터 입출력
- 다양한 파일 포맷을 지원한다.
- csv(쉼표로 구분 된), Excel, HTML, JSON, HDF5, SAS, STATA, SQL

In [96]:
%%writefile sample1.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample1.csv


### csv 파일 로딩
- pd.read_csv()

In [97]:
pd.read_csv('sample1.csv')  # 행 인덱스 자동 생성

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [98]:
%%writefile sample2.csv   # 컬럼명이 없는 csv 파일 생성
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample2.csv


In [100]:
pd.read_csv('sample2.csv', names=['c1', 'c2', 'c3'])  #컬럼명을 부여하지 않으면 첫 행이 컬럼명이 된다.

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [101]:
pd.read_csv('sample1.csv', index_col='c1')  # c1을 행인덱스로 사용하기

Unnamed: 0_level_0,c2,c3
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.11,one
2,2.22,two
3,3.33,three


In [104]:
%%writefile sample3.txt
c1        c2         c3        c4
0.179181  -1.538472  1.347553  0.43381
1.024209  0.087307  -1.281997  0.49265
0.417899  -2.002308  0.255245  -1.10515

Writing sample3.txt


In [106]:
pd.read_table('sample3.txt', sep='\s+')

Unnamed: 0,c1,c2,c3,c4
0,0.179181,-1.538472,1.347553,0.43381
1,1.024209,0.087307,-1.281997,0.49265
2,0.417899,-2.002308,0.255245,-1.10515


In [107]:
%%writefile sample4.txt
파일 제목: sample4.txt
데이터 포맷의 설명:
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

Writing sample4.txt


In [108]:
# skiprows: 해당 행을 가져오지 않음
pd.read_csv('sample4.txt', skiprows=[0,1])  # txt 파일이지만 구분기호를 쉼표를 썼기 때문에 csv파일로 읽어올 수 있다.

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [109]:
%%writefile sample5.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
누락, 3.33, three

Writing sample5.csv


In [110]:
# na_values: NaN으로 취급할 값을 담는 옵션
df = pd.read_csv('sample5.csv', na_values=['누락'])
df

Unnamed: 0,c1,c2,c3
0,1.0,1.11,one
1,2.0,2.22,two
2,,3.33,three


### csv 파일로 저장

In [111]:
df.to_csv('sample6.csv')

### 인터넷 상에 저장된 csv 파일 로딩

In [112]:
titanic = pd.read_csv('https://raw.githubusercontent.com/datascienceschool/docker_rpython/master/data/titanic.csv')

In [113]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [114]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [115]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [116]:
# head, tail
titanic.head()  # default = 5

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [117]:
titanic.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
881,882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## 고급 인덱싱
- 데이터프레임에서 특정 데이터만 추출하는 기능을 인덱싱이라고 부른다.
- 정수인덱싱, 라벨인덱싱
- [행인덱스, 열인덱스]
- loc: 라벨인덱싱
- iloc: 정수인덱싱

In [119]:
df = pd.DataFrame(np.arange(10, 22).reshape(3,4),
                  index = ['a', 'b', 'c'],
                  columns = ['A', 'B', 'C', 'D']
                 )
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [120]:
# 하나의 행 인덱스를 사용하는 경우
df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [121]:
df.loc['b':'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [122]:
# 행인덱스에 슬라이싱을 사용할 때는 loc 생략가능
df['b':'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [123]:
df.loc[['b', 'c']]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [124]:
df['A']

a    10
b    14
c    18
Name: A, dtype: int32

In [125]:
df.A

a    10
b    14
c    18
Name: A, dtype: int32

In [126]:
df[df.A > 15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [128]:
df.loc[df.A > 15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [130]:
df['A']

a    10
b    14
c    18
Name: A, dtype: int32

In [131]:
df[['A', 'B']]

Unnamed: 0,A,B
a,10,11
b,14,15
c,18,19


In [132]:
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [133]:
df.loc['a','A']  # 행 인덱싱이 들어가기 때문에 loc 필수

10

In [135]:
df.loc['b':, 'A']

b    14
c    18
Name: A, dtype: int32

In [137]:
df.iloc[0,1]  # (0,1)

11

### 주요함수

In [138]:
# 데이터프레임의 데이터 갯수: count
s = pd.Series(range(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [139]:
s.count()

10

In [140]:
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [141]:
s.count()  # 결측값을 제외한 개수

9

In [142]:
np.random.seed(2)
df = pd.DataFrame(np.random.randint(5, size = (4,4)), dtype=float)
df.iloc[2, 3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [143]:
df.count()  # 각 컬럼에 대한 행의 수(결측값 제외)

0    4
1    4
2    4
3    3
dtype: int64

In [144]:
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [145]:
# 타이타닉호의 승객 데이터를 활용하여 데이터의 개수를 추출하되, 각 열마다 추출하시오.
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

**value_counts(), 개별 값의 건수를 추출하는 함수**

In [147]:
s2 = pd.Series(np.random.randint(6, size=100))
s2.tail()

95    3
96    0
97    0
98    2
99    5
dtype: int32

In [148]:
# 0,1,2,3,4,5 각 값의 개수
s2.value_counts()

2    22
0    18
3    17
1    15
5    15
4    13
dtype: int64

In [149]:
titanic['survived'].value_counts()  # 0:사망  1:생존

0    549
1    342
Name: survived, dtype: int64

In [150]:
titanic['pclass'].value_counts()

3    491
1    216
2    184
Name: pclass, dtype: int64

**정렬**
- sort_index: 인덱스 값을 기준으로 정렬
- sort_values: 값을 기준으로 정렬

In [151]:
s2.value_counts() # 값에 대한 내림차순 정렬됨

2    22
0    18
3    17
1    15
5    15
4    13
dtype: int64

In [152]:
s2.value_counts().sort_index() # 인덱스값에 대한 오름차순 정렬

0    18
1    15
2    22
3    17
4    13
5    15
dtype: int64

In [153]:
s.sort_values()  # 결측값이 있을 경우 가장 마지막에 결측값 

0    0.0
1    1.0
2    2.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
3    NaN
dtype: float64

In [154]:
s.sort_values(ascending=False)

9    9.0
8    8.0
7    7.0
6    6.0
5    5.0
4    4.0
2    2.0
1    1.0
0    0.0
3    NaN
dtype: float64

In [155]:
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [156]:
df.sort_values(by=2)

Unnamed: 0,0,1,2,3
1,3.0,0.0,2.0,1.0
0,0.0,0.0,3.0,2.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [157]:
df.sort_values(by = [1, 2]) # 열 1을 기준으로 오름차순 정렬하되 값이 같은 경우 열 2를 기준으로 정렬

Unnamed: 0,0,1,2,3
1,3.0,0.0,2.0,1.0
0,0.0,0.0,3.0,2.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [159]:
# sort_values 메서드를 사용하여 타이타닉호 승객에 대한 성별(sex) 인원수, 나이별(age) 인원 수,
# 선실별(class)인원수, 생존/사망(alive)인원 수를 추출하시오.
titanic['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [164]:
titanic['age'].value_counts().sort_index()

0.42     1
0.67     1
0.75     2
0.83     2
0.92     1
        ..
70.00    2
70.50    1
71.00    2
74.00    1
80.00    1
Name: age, Length: 88, dtype: int64

In [161]:
titanic['class'].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

In [162]:
titanic['alive'].value_counts()

no     549
yes    342
Name: alive, dtype: int64

### 행/열 합계: sum(axis = 1(행), axis = 0(열))

In [165]:
df3 = pd.DataFrame(np.random.randint(10, size=(4, 8)))
df3

Unnamed: 0,0,1,2,3,4,5,6,7
0,4,2,9,7,1,9,2,1
1,0,7,1,8,9,0,7,0
2,5,2,5,1,3,3,1,8
3,6,8,1,5,7,0,9,1


In [166]:
# 행방향 합계
df3.sum(axis=1)

0    35
1    32
2    28
3    37
dtype: int64

In [167]:
# 열방향 합계
df3.sum(axis=0)

0    15
1    19
2    16
3    21
4    20
5    12
6    19
7    10
dtype: int64

In [168]:
df3.sum()  # axis 생략 시 열방향 계산(axis = 0)

0    15
1    19
2    16
3    21
4    20
5    12
6    19
7    10
dtype: int64

In [169]:
df3['RowSum'] = df3.sum(axis=1)
df3

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,4,2,9,7,1,9,2,1,35
1,0,7,1,8,9,0,7,0,32
2,5,2,5,1,3,3,1,8,28
3,6,8,1,5,7,0,9,1,37


In [170]:
df3.loc['ColSum'] = df3.sum()

In [171]:
df3

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,4,2,9,7,1,9,2,1,35
1,0,7,1,8,9,0,7,0,32
2,5,2,5,1,3,3,1,8,28
3,6,8,1,5,7,0,9,1,37
ColSum,15,19,16,21,20,12,19,10,132


1. 타이타닉호 승객의 평균 나이는?
2. 타이타닉호 승객 중 여성 승객의 평균 나이는?
3. 타이타닉호 승객 중 1등실에 있는 여성 승객의 평균 나이는?

In [178]:
# 타이타닉호 승객의 평균나이
round(titanic['age'].mean(), 0)

30.0

In [179]:
#2. 타이타닉호 승객 중 여성 승객의 평균 나이
round(titanic[titanic['sex'] == 'female']['age'].mean(), 0)

28.0

In [181]:
#3. 1등실 여성 승객의 평균나이
round(titanic[(titanic['sex'] == 'female') & (titanic['class'] == 'First')]['age'].mean(), 0)

35.0

### apply 변환
- 행 또는 열 단위로 반복해서 특정 함수를 적용하는 기능
- apply(함수, axis)
- lambda 함수로 사용

In [182]:
data = pd.DataFrame({
    'A':[1,2,3,4,3],
    'B':[2,3,1,2,4],
    'C':[5,4,3,2,1]
})
data

Unnamed: 0,A,B,C
0,1,2,5
1,2,3,4
2,3,1,3
3,4,2,2
4,3,4,1


In [183]:
# 각 열의 최대값과 최소값의 차이
data['A'].max() - data['A'].min()

3

In [184]:
# lambda 입력값: 출력값
data.apply(lambda x : x.max() - x.min())

A    3
B    3
C    4
dtype: int64

In [185]:
data.apply(lambda x : x.max() - x.min(), axis = 1)

0    4
1    2
2    2
3    2
4    3
dtype: int64

In [186]:
# 각 열에 어떤 값이 얼마나 사용되었는지를 알고 싶다면
data.apply(pd.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1
2,1.0,2.0,1
3,2.0,1.0,1
4,1.0,1.0,1
5,,,1


In [187]:
# adult/child를 판정하여 컬럼(adult/child)를 추가하시오
# 승객의 나이가 20살 이상이면 성인(adult) 그렇지 않으면 미성년(child)으로 표시할 것

In [188]:
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [189]:
titanic['adult/child'] = titanic.apply(lambda x : 'adult' if x.age >= 20 else 'child', axis=1)
titanic.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,adult/child
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True,adult
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True,child
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False,child
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True,adult
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,adult


In [190]:
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [191]:
df.apply(pd.value_counts).fillna(0.0)

Unnamed: 0,0,1,2,3
0.0,1.0,2.0,0.0,0.0
1.0,0.0,0.0,0.0,1.0
2.0,0.0,1.0,1.0,2.0
3.0,2.0,1.0,1.0,0.0
4.0,1.0,0.0,2.0,0.0


In [192]:
df.apply(pd.value_counts).fillna(0).astype(int)

Unnamed: 0,0,1,2,3
0.0,1,2,0,0
1.0,0,0,0,1
2.0,0,1,1,2
3.0,2,1,1,0
4.0,1,0,2,0


## 데이터프레임의 인덱스 조작

### 데이터프레임의 인덱스 설정 및 제거
- set_index: 기존의 행 인덱스를 제거하고 데이터의 열 중 하나를 인덱스로 설정
- reset_index: 기존의 행 인덱스를 제거하고 인덱스를 데이터 열로 추가

In [195]:
df1 = pd.DataFrame(np.vstack([list('ABCDE'),
                              np.round(np.random.rand(3, 5), 2)]).T,
                   columns = ['C1', 'C2', 'C3', 'C4'])
df1

Unnamed: 0,C1,C2,C3,C4
0,A,0.3,0.29,0.05
1,B,0.19,0.47,0.23
2,C,0.1,0.55,0.64
3,D,0.28,0.85,0.16
4,E,0.21,0.99,0.87


In [196]:
df2 = df1.set_index('C1')
df2

Unnamed: 0_level_0,C2,C3,C4
C1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.3,0.29,0.05
B,0.19,0.47,0.23
C,0.1,0.55,0.64
D,0.28,0.85,0.16
E,0.21,0.99,0.87


In [197]:
df2.reset_index()  # 행 인덱스를 열 인덱스로 바꿈

Unnamed: 0,C1,C2,C3,C4
0,A,0.3,0.29,0.05
1,B,0.19,0.47,0.23
2,C,0.1,0.55,0.64
3,D,0.28,0.85,0.16
4,E,0.21,0.99,0.87


In [198]:
df2.reset_index(drop=True)  # 바뀌지않고 그대로 삭제

Unnamed: 0,C2,C3,C4
0,0.3,0.29,0.05
1,0.19,0.47,0.23
2,0.1,0.55,0.64
3,0.28,0.85,0.16
4,0.21,0.99,0.87


### 데이터프레임 합성
- merge(병합): 두 데이터프레임의 공통의 열 또는 행 인덱스를 기준으로 데이터를 합침
- concat(연결)

In [199]:
df1 = pd.DataFrame({
    '고객번호':[1001, 1002, 1003, 1004, 1005, 1006, 1007],
    '이름':['이소라', '박민수', '둘리', '고길동', '도우너', '또치', '희동']
})
df1

Unnamed: 0,고객번호,이름
0,1001,이소라
1,1002,박민수
2,1003,둘리
3,1004,고길동
4,1005,도우너
5,1006,또치
6,1007,희동


In [200]:
df2 = pd.DataFrame({
    '고객번호':[1001, 1001, 1005, 1006, 1008, 1001],
    '금액':[10000, 200000, 15000, 50000, 120000, 30000]
})
df2

Unnamed: 0,고객번호,금액
0,1001,10000
1,1001,200000
2,1005,15000
3,1006,50000
4,1008,120000
5,1001,30000


In [201]:
# 두 데이터프레임에 공통의 키가 존재하는 데이터만 합치는 것: inner join
pd.merge(df1, df2)

Unnamed: 0,고객번호,이름,금액
0,1001,이소라,10000
1,1001,이소라,200000
2,1001,이소라,30000
3,1005,도우너,15000
4,1006,또치,50000


In [202]:
# 키 값이 한 쪽에만 있어도 추출하는 것: outer join, 외부조인
# left outer join, right outer join
pd.merge(df1, df2, how='outer')  # outer: 양쪽 다 나옴

Unnamed: 0,고객번호,이름,금액
0,1001,이소라,10000.0
1,1001,이소라,200000.0
2,1001,이소라,30000.0
3,1002,박민수,
4,1003,둘리,
5,1004,고길동,
6,1005,도우너,15000.0
7,1006,또치,50000.0
8,1007,희동,
9,1008,,120000.0


In [203]:
pd.merge(df1, df2, how='left')  # left: 왼쪽 다 나옴

Unnamed: 0,고객번호,이름,금액
0,1001,이소라,10000.0
1,1001,이소라,200000.0
2,1001,이소라,30000.0
3,1002,박민수,
4,1003,둘리,
5,1004,고길동,
6,1005,도우너,15000.0
7,1006,또치,50000.0
8,1007,희동,


In [204]:
pd.merge(df1, df2, how='right')  # right: 오른쪽 다 나옴

Unnamed: 0,고객번호,이름,금액
0,1001,이소라,10000
1,1001,이소라,200000
2,1005,도우너,15000
3,1006,또치,50000
4,1008,,120000
5,1001,이소라,30000


**join 메서드**

In [205]:
df3 = pd.DataFrame([[1,2], [3,4], [5,6]],
                  index = ['a', 'c', 'e'],
                  columns = ['서울', '부산'])
df3

Unnamed: 0,서울,부산
a,1,2
c,3,4
e,5,6


In [207]:
df4 = pd.DataFrame([[7, 8], [9, 10], [11, 12], [13, 14]],
                  index=['b', 'c', 'd', 'e'],
                  columns = ['대구', '광주'])
df4

Unnamed: 0,대구,광주
b,7,8
c,9,10
d,11,12
e,13,14


In [209]:
pd.merge(df3, df4, how = 'outer', left_index = True, right_index = True)

Unnamed: 0,서울,부산,대구,광주
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [210]:
df3.join(df4, how='outer')

Unnamed: 0,서울,부산,대구,광주
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [211]:
s1 = pd.Series([0, 1], index=['A', 'B'])
s2 = pd.Series([2,3,4], index = ['A', 'B', 'C'])

In [212]:
s1

A    0
B    1
dtype: int64

In [213]:
s2

A    2
B    3
C    4
dtype: int64

In [214]:
pd.concat([s1,s2])

A    0
B    1
A    2
B    3
C    4
dtype: int64

## 그룹분석
- 특정 키를 조건으로 그룹을 설정하여 그룹에 특성을 이용하여 계산을 수행하는 기능
- groupby 메서드를 이용

**그룹연산 메서드**
- size, count: 그룹 데이터의 갯수
- mean, median, min, max
- sum, prod, std, var, quantile
- first, last

**그룹연산 메서드**
- agg, aggreate: 그룹에 동시에 적용할 함수를 리스트로 전달
- describe: 기초 통계량
- apply: 특정 함수 반복
- transform: 그룹별 계산

In [215]:
df = pd.DataFrame({
    'key1':['A','A', 'B', 'B', 'A'],
    'key2':['one', 'two', 'three', 'four', 'five'],
    'data1':[1,2,3,4,5],
    'data2':[10, 20, 30, 40, 50]
})
df

Unnamed: 0,key1,key2,data1,data2
0,A,one,1,10
1,A,two,2,20
2,B,three,3,30
3,B,four,4,40
4,A,five,5,50


In [216]:
# A그룹과 B그룹으로 구분
groups = df.groupby(df.key1)  # 주소가 담김
groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000232239B47F0>

In [217]:
# groups 속성: 각 그룹별 데이터의 인덱스가 저장됨
groups.groups

{'A': [0, 1, 4], 'B': [2, 3]}

In [218]:
groups.sum()

  groups.sum()


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8,80
B,7,70


In [219]:
df.groupby(df.key1).sum()

  df.groupby(df.key1).sum()


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8,80
B,7,70


In [220]:
df['data1'].groupby(df.key1).sum()

key1
A    8
B    7
Name: data1, dtype: int64

In [221]:
df.groupby(df.key1)['data1'].sum()

key1
A    8
B    7
Name: data1, dtype: int64

In [222]:
df.data1.groupby([df.key1, df.key2]).sum()

key1  key2 
A     five     5
      one      1
      two      2
B     four     4
      three    3
Name: data1, dtype: int64

# tips
- 식당에서 식사를 한 후 내는 팁과 관련된 데이터를 제공하는 데이터 셋
- seaborn 라이브러리에서 제공하는 데이터셋

In [223]:
import seaborn as sns
tips=sns.load_dataset('tips')
tips.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


- total: 총 결제 금액
- tip: 팁
- sex: 결제한 사람의 성별
- smoker: 흡연여부
- day: 요일
- time: 시간대
- size: 인원 수

In [224]:
# 결제금액 대비 팁의 비율
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [225]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   tip_pct     244 non-null    float64 
dtypes: category(4), float64(3), int64(1)
memory usage: 9.3 KB


In [226]:
tips.describe()

Unnamed: 0,total_bill,tip,size,tip_pct
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,0.160803
std,8.902412,1.383638,0.9511,0.061072
min,3.07,1.0,1.0,0.035638
25%,13.3475,2.0,2.0,0.129127
50%,17.795,2.9,2.0,0.15477
75%,24.1275,3.5625,3.0,0.191475
max,50.81,10.0,6.0,0.710345


In [227]:
tips.groupby('sex').count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size,tip_pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Male,157,157,157,157,157,157,157
Female,87,87,87,87,87,87,87


In [228]:
tips.groupby('sex').size()

sex
Male      157
Female     87
dtype: int64

In [230]:
# 성별에 따른 흡연 여부
tips.groupby(['sex','smoker']).size()

sex     smoker
Male    Yes       60
        No        97
Female  Yes       33
        No        54
dtype: int64

In [231]:
# 성별에 따른 팁 비율
tips.groupby('sex')['tip_pct'].mean()

sex
Male      0.157651
Female    0.166491
Name: tip_pct, dtype: float64

In [233]:
# 흡연 여부에 따른 팁 비율
tips.groupby('smoker')[['tip_pct']].mean()

Unnamed: 0_level_0,tip_pct
smoker,Unnamed: 1_level_1
Yes,0.163196
No,0.159328


In [234]:
tips['day'].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [235]:
tips['time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [238]:
# 팁을 가장 많이 낸 손님이 온 요일 과 시간
tips[tips['tip'] == tips['tip'].max()][['tip','day', 'time']]

Unnamed: 0,tip,day,time
170,10.0,Sat,Dinner


In [240]:
# 요일별 팁 비율 분석
tips.groupby('day')[['tip_pct']].describe().T

Unnamed: 0,day,Thur,Fri,Sat,Sun
tip_pct,count,62.0,19.0,87.0,76.0
tip_pct,mean,0.161276,0.169913,0.153152,0.166897
tip_pct,std,0.038652,0.047665,0.051293,0.084739
tip_pct,min,0.072961,0.103555,0.035638,0.059447
tip_pct,25%,0.13821,0.133739,0.123863,0.119982
tip_pct,50%,0.153846,0.155625,0.151832,0.161103
tip_pct,75%,0.192687,0.196637,0.188271,0.187889
tip_pct,max,0.266312,0.26348,0.325733,0.710345


In [241]:
# 시간대 별 팁 비율 분석
tips.groupby('time')[['tip_pct']].describe().T

Unnamed: 0,time,Lunch,Dinner
tip_pct,count,68.0,176.0
tip_pct,mean,0.164128,0.159518
tip_pct,std,0.040242,0.067477
tip_pct,min,0.072961,0.035638
tip_pct,25%,0.139147,0.123192
tip_pct,50%,0.154084,0.1554
tip_pct,75%,0.193917,0.188209
tip_pct,max,0.266312,0.710345


In [242]:
# 인원 수 별 팁 비율
tips.groupby('size')[['tip_pct']].describe().T

Unnamed: 0,size,1,2,3,4,5,6
tip_pct,count,4.0,156.0,38.0,37.0,5.0,4.0
tip_pct,mean,0.217292,0.165719,0.152157,0.145949,0.141495,0.156229
tip_pct,std,0.080342,0.066848,0.045459,0.042395,0.067733,0.042153
tip_pct,min,0.137931,0.035638,0.056433,0.077459,0.06566,0.103799
tip_pct,25%,0.170779,0.135223,0.124758,0.11775,0.106572,0.131654
tip_pct,50%,0.202752,0.156104,0.159323,0.146699,0.121389,0.162891
tip_pct,75%,0.249265,0.195036,0.186135,0.169797,0.172194,0.187466
tip_pct,max,0.325733,0.710345,0.230742,0.280535,0.241663,0.195335


In [245]:
# 요일별 점심시간 평균 팁 비율
tips[tips['time'] == 'Lunch'].groupby('day')[['tip_pct']].describe().T

Unnamed: 0,day,Thur,Fri,Sat,Sun
tip_pct,count,61.0,7.0,0.0,0.0
tip_pct,mean,0.161301,0.188765,,
tip_pct,std,0.038972,0.045885,,
tip_pct,min,0.072961,0.117735,,
tip_pct,25%,0.137741,0.167289,,
tip_pct,50%,0.153846,0.187735,,
tip_pct,75%,0.193424,0.210996,,
tip_pct,max,0.266312,0.259314,,
