# **1. 판다스(Pandas)**
판다스(Pandas)는 데이터 분석을 위한 파이썬 라이브러리 중 하나로, 표 형태의 데이터나 다양한 형태의 데이터를 쉽게 처리하고 분석할 수 있도록 도와주는 도구입니다. 주로 데이터프레임(DataFrame)이라는 자료구조를 제공하며, 이를 통해 테이블 형태의 데이터를 다루기 용이합니다.

```
pip install pandas
```

In [221]:
import pandas as pd

# **2. Series와 DataFrame**

### 1. Series

Series는 1차원 배열과 같은 자료구조로 하나의 열을 나타냅니다. 또한 각 요소는 인덱스(index)와 값(value)으로 구성되어 있습니다. 값은 넘파이의 ndarray 기반으로 저장됩니다. Series는 다양한 데이터 타입을 가질 수 있으며 정수, 실수, 문자열 등 다양한 형태의 데이터를 담을 수 있습니다.


In [222]:
# Student names (used later as index labels) paired with their scores.
score_by_name = {
    '김사과': 67,
    '반하나': 75,
    '오렌지': 90,
    '이메론': 62,
    '배애리': 98,
}
idx = list(score_by_name)
data = list(score_by_name.values())

# With no index given, pandas assigns the default integer index 0..n-1.
pd.Series(data)

Unnamed: 0,0
0,67
1,75
2,90
3,62
4,98


In [223]:
pd.Series(data, idx)


Unnamed: 0,0
김사과,67
반하나,75
오렌지,90
이메론,62
배애리,98


In [224]:
pd.Series(idx, data)

Unnamed: 0,0
67,김사과
75,반하나
90,오렌지
62,이메론
98,배애리


In [225]:
se1 = pd.Series(data, idx)
se1

Unnamed: 0,0
김사과,67
반하나,75
오렌지,90
이메론,62
배애리,98


In [226]:
print(se1.index)
print(se1.values)
print(type(se1.values))

Index(['김사과', '반하나', '오렌지', '이메론', '배애리'], dtype='object')
[67 75 90 62 98]
<class 'numpy.ndarray'>


### 2. DataFrame

데이터프레임(DataFrame)은 판다스(Pandas) 라이브러리에서 제공하는 중요하고 강력한 데이터 구조로, 2차원의 테이블 형태 데이터를 다루는 데 사용됩니다. 또한 각 요소는 인덱스(index), 열(column), 값(value)으로 구성되어 있습니다. 데이터프레임은 행과 열로 이루어져 있으며, 각 열은 다양한 데이터 타입을 가질 수 있습니다. 값은 넘파이의 ndarray 기반으로 저장됩니다.

In [227]:
# Row labels (students) and column labels (subjects) for the score table.
idx = ['김사과', '반하나', '오렌지', '이메론', '배애리']
col = ['국어', '영어', '수학']

# One inner list per student, holding the scores in the same order as `col`.
data = [
    [67, 93, 91],
    [75, 68, 96],
    [87, 81, 82],
    [62, 70, 75],
    [98, 56, 87],
]

In [228]:
pd.DataFrame(data)

Unnamed: 0,0,1,2
0,67,93,91
1,75,68,96
2,87,81,82
3,62,70,75
4,98,56,87


In [229]:
pd.DataFrame(data, idx)

Unnamed: 0,0,1,2
김사과,67,93,91
반하나,75,68,96
오렌지,87,81,82
이메론,62,70,75
배애리,98,56,87


In [230]:
pd.DataFrame(data, idx, col)

Unnamed: 0,국어,영어,수학
김사과,67,93,91
반하나,75,68,96
오렌지,87,81,82
이메론,62,70,75
배애리,98,56,87


In [231]:
pd.DataFrame(index=idx, columns=col, data=data)
# Positional arguments must follow the (data, index, columns) order, but keyword arguments can be passed in any order.

Unnamed: 0,국어,영어,수학
김사과,67,93,91
반하나,75,68,96
오렌지,87,81,82
이메론,62,70,75
배애리,98,56,87


In [232]:
df = pd.DataFrame(index=idx, columns=col, data=data)
print(df.index)
print(df.columns)
print(df.values)

Index(['김사과', '반하나', '오렌지', '이메론', '배애리'], dtype='object')
Index(['국어', '영어', '수학'], dtype='object')
[[67 93 91]
 [75 68 96]
 [87 81 82]
 [62 70 75]
 [98 56 87]]


In [233]:
# 딕셔너리를 사용하여 데이터프레임을 생성하기
dic = {
    '국어':[67, 75, 76, 62, 98],
    '영어':[93, 68, 81, 70, 56],
    '수학':[91, 96, 82, 75, 87]
}

df = pd.DataFrame(data=dic, index=idx)
df

Unnamed: 0,국어,영어,수학
김사과,67,93,91
반하나,75,68,96
오렌지,76,81,82
이메론,62,70,75
배애리,98,56,87


# **3. CSV 파일 읽어오기**
CSV는 Comma-Separated Values(쉼표로 구분된 값)의 약자로, 각 행의 값을 쉼표로 구분하여 일반 텍스트로 저장하는 파일 형식입니다.

In [234]:
df = pd.read_csv('/content/drive/MyDrive/랭체인 AI 영상객체탐지분석 플랫폼/9. 데이터 분석/data/idol.csv')
df

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [235]:
type(df)

# **4. 데이터프레임 기본 정보 알아보기**

In [236]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   이름       20 non-null     object 
 1   그룹       20 non-null     object 
 2   소속사      19 non-null     object 
 3   성별       20 non-null     object 
 4   생년월일     20 non-null     object 
 5   키        19 non-null     float64
 6   혈액형      19 non-null     object 
 7   브랜드평판지수  20 non-null     int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ KB


In [237]:
df.columns

Index(['이름', '그룹', '소속사', '성별', '생년월일', '키', '혈액형', '브랜드평판지수'], dtype='object')

In [238]:
new_columns = ['name', 'group', 'company', 'gender', 'birthday', 'height', 'blood', 'brand']
df.columns = new_columns
df

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [239]:
# describe(): 통계 정보를 반환
df.describe()

Unnamed: 0,height,brand
count,19.0,20.0
mean,170.536842,2700190.0
std,7.225204,1381919.0
min,161.0,1680587.0
25%,164.75,1887423.0
50%,168.0,2074682.0
75%,179.0,2623465.0
max,182.0,6267302.0


In [240]:
df.describe(include=object)  # top: the most frequent value, freq: how many times that value occurs

Unnamed: 0,name,group,company,gender,birthday,blood
count,20,20,19,20,20,19
unique,20,6,5,2,20,4
top,지민,방탄소년단,빅히트,여자,1995-10-13,A
freq,1,5,7,13,1,11


In [241]:
df.head() # 상위 5개

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048


In [242]:
df.head(3)

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081


In [243]:
df.tail()

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
15,윤아,소녀시대,에스엠,여자,1990-05-30,168.0,B,1885297
16,조이,레드벨벳,빅히트,여자,1996-09-03,168.0,A,1830514
17,슬기,레드벨벳,빅히트,여자,1994-02-10,161.0,A,1741767
18,강다니엘,워너원,,남자,1996-12-10,182.0,A,1706444
19,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587


In [244]:
df.tail(2)

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
18,강다니엘,워너원,,남자,1996-12-10,182.0,A,1706444
19,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587


In [245]:
# 정렬
df.sort_index() # index로 오름차순 정렬: 기본값

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [246]:
df.sort_index(ascending=False)

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
19,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587
18,강다니엘,워너원,,남자,1996-12-10,182.0,A,1706444
17,슬기,레드벨벳,빅히트,여자,1994-02-10,161.0,A,1741767
16,조이,레드벨벳,빅히트,여자,1996-09-03,168.0,A,1830514
15,윤아,소녀시대,에스엠,여자,1990-05-30,168.0,B,1885297
14,로제,블랙핑크,와이지,여자,1997-02-11,168.0,B,1888132
13,리사,블랙핑크,와이지,여자,1997-03-27,167.0,A,1912800
12,옹성우,워너원,판타지오,남자,1995-08-25,179.0,A,1954327
11,제니,블랙핑크,와이지,여자,1996-01-16,163.0,B,2069250
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499


In [247]:
df.sort_values(by="height")

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
17,슬기,레드벨벳,빅히트,여자,1994-02-10,161.0,A,1741767
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
11,제니,블랙핑크,와이지,여자,1996-01-16,163.0,B,2069250
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
13,리사,블랙핑크,와이지,여자,1997-03-27,167.0,A,1912800
14,로제,블랙핑크,와이지,여자,1997-02-11,168.0,B,1888132
16,조이,레드벨벳,빅히트,여자,1996-09-03,168.0,A,1830514
15,윤아,소녀시대,에스엠,여자,1990-05-30,168.0,B,1885297


In [248]:
df.sort_values(by="height", ascending=False)

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
18,강다니엘,워너원,,남자,1996-12-10,182.0,A,1706444
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
19,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
12,옹성우,워너원,판타지오,남자,1995-08-25,179.0,A,1954327
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
14,로제,블랙핑크,와이지,여자,1997-02-11,168.0,B,1888132


In [249]:
df.sort_values(by="height", ascending=False, na_position="first")  # last가 기본값

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866
18,강다니엘,워너원,,남자,1996-12-10,182.0,A,1706444
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
19,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
12,옹성우,워너원,판타지오,남자,1995-08-25,179.0,A,1954327
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081


In [250]:
df.sort_values(by=["height", "brand"], ascending=[False, False], na_position="first")

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866
18,강다니엘,워너원,,남자,1996-12-10,182.0,A,1706444
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
12,옹성우,워너원,판타지오,남자,1995-08-25,179.0,A,1954327
19,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081


# **5. 데이터 다루기**

In [251]:
df.head()

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048


In [252]:
df['blood']

Unnamed: 0,blood
0,A
1,A
2,A
3,O
4,AB
5,
6,O
7,A
8,B
9,A


In [253]:
type(df['blood'])

In [254]:
df.blood

Unnamed: 0,blood
0,A
1,A
2,A
3,O
4,AB
5,
6,O
7,A
8,B
9,A


In [255]:
df.head(3)

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081


In [256]:
df[:3]

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081


In [257]:
# loc 인덱싱: 컬럼 인덱싱
df.loc[:, 'name'] # df['name'], df.name

Unnamed: 0,name
0,지민
1,정국
2,민지
3,하니
4,뷔
5,다니엘
6,혜인
7,지수
8,해린
9,태연


In [258]:
df.loc[2:5, 'name'] # 5번을 포함

Unnamed: 0,name
2,민지
3,하니
4,뷔
5,다니엘


In [259]:
df.loc[2:5, ['name', 'gender', 'height']]

Unnamed: 0,name,gender,height
2,민지,여자,169.0
3,하니,여자,161.7
4,뷔,남자,179.0
5,다니엘,여자,165.0


In [260]:
df.loc[[2,5], ['name', 'gender', 'height']]

Unnamed: 0,name,gender,height
2,민지,여자,169.0
5,다니엘,여자,165.0


In [261]:
df.loc[2:5, 'name':'height']

Unnamed: 0,name,group,company,gender,birthday,height
2,민지,뉴진스,어도어,여자,2004-05-07,169.0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0


In [262]:
# df.iloc[행에 대한 조건, 컬럼] (인덱스)
df.iloc[:, 0]

Unnamed: 0,name
0,지민
1,정국
2,민지
3,하니
4,뷔
5,다니엘
6,혜인
7,지수
8,해린
9,태연


In [263]:
df.iloc[:, 0:3] # 3을 포함하지 않음

Unnamed: 0,name,group,company
0,지민,방탄소년단,빅히트
1,정국,방탄소년단,빅히트
2,민지,뉴진스,어도어
3,하니,뉴진스,어도어
4,뷔,방탄소년단,빅히트
5,다니엘,뉴진스,어도어
6,혜인,뉴진스,어도어
7,지수,블랙핑크,와이지
8,해린,뉴진스,어도어
9,태연,소녀시대,에스엠


In [264]:
df.iloc[:, [0, 3]]

Unnamed: 0,name,gender
0,지민,남자
1,정국,남자
2,민지,여자
3,하니,여자
4,뷔,남자
5,다니엘,여자
6,혜인,여자
7,지수,여자
8,해린,여자
9,태연,여자


In [265]:
df.iloc[1:5, 0:2]

Unnamed: 0,name,group
1,정국,방탄소년단
2,민지,뉴진스
3,하니,뉴진스
4,뷔,방탄소년단


In [266]:
df['height'] >= 180

Unnamed: 0,height
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
9,False


In [267]:
df[df['height'] >= 180]

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499
18,강다니엘,워너원,,남자,1996-12-10,182.0,A,1706444


In [268]:
df[df['height'] >= 180]['name'] # df['name'][df['height'] >= 180]

Unnamed: 0,name
10,RM
18,강다니엘


In [269]:
df[df['height'] >= 180][['name', 'gender', 'height']]

Unnamed: 0,name,gender,height
10,RM,남자,181.0
18,강다니엘,남자,182.0


In [270]:
df.loc[df['height'] >= 180, ['name', 'gender', 'height']]

Unnamed: 0,name,gender,height
10,RM,남자,181.0
18,강다니엘,남자,182.0


In [271]:
df

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [272]:
company = ['빅히트', '어도어']
# isin(): True where the element matches one of the given values, False otherwise
df['company'].isin(company)

Unnamed: 0,company
0,True
1,True
2,True
3,True
4,True
5,True
6,True
7,False
8,True
9,False


In [273]:
df[df['company'].isin(company)] # df.loc[df['company'].isin(company),:]

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499
16,조이,레드벨벳,빅히트,여자,1996-09-03,168.0,A,1830514


# **6. 결측값**
결측값은 값이 누락된 데이터를 의미하며, 판다스에서는 일반적으로 NaN으로 표시됩니다.

In [274]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      20 non-null     object 
 1   group     20 non-null     object 
 2   company   19 non-null     object 
 3   gender    20 non-null     object 
 4   birthday  20 non-null     object 
 5   height    19 non-null     float64
 6   blood     19 non-null     object 
 7   brand     20 non-null     int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ KB


In [275]:
df

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [276]:
df.isna()

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,True,False
6,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False
9,False,False,False,False,False,True,False,False


In [277]:
df['height'].isna()

Unnamed: 0,height
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
9,True


In [278]:
df[df['height'].isna()]

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [279]:
df[df['height'].isna()]['name']

Unnamed: 0,name
9,태연


In [280]:
df.isnull()  # same result as df.isna(); df.notnull() gives the opposite mask (True where the value is not NaN)

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,True,False
6,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False
9,False,False,False,False,False,True,False,False


In [281]:
df[df['height'].isnull()]

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [282]:
df[~df['height'].isnull()] #df[df['height'].notnull()]

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499


In [283]:
df.loc[df['company'].notnull(), ['name', 'company', 'group', 'gender']]

Unnamed: 0,name,company,group,gender
0,지민,빅히트,방탄소년단,남자
1,정국,빅히트,방탄소년단,남자
2,민지,어도어,뉴진스,여자
3,하니,어도어,뉴진스,여자
4,뷔,빅히트,방탄소년단,남자
5,다니엘,어도어,뉴진스,여자
6,혜인,어도어,뉴진스,여자
7,지수,와이지,블랙핑크,여자
8,해린,어도어,뉴진스,여자
9,태연,에스엠,소녀시대,여자


In [284]:
# fillna(): fills missing values with the given value and returns a new Series.
# The original column is left unchanged; assign the result back to keep it,
# e.g. df['height'] = df['height'].fillna(0).
df['height'].fillna(0)

Unnamed: 0,height
0,174.0
1,179.0
2,169.0
3,161.7
4,179.0
5,165.0
6,170.0
7,162.0
8,164.5
9,0.0


In [285]:
df['height']

Unnamed: 0,height
0,174.0
1,179.0
2,169.0
3,161.7
4,179.0
5,165.0
6,170.0
7,162.0
8,164.5
9,


In [286]:
df_copy = df.copy()
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [287]:
height = df_copy['height'].mean()
height

np.float64(170.53684210526316)

In [288]:
df_copy['height'] = df_copy['height'].fillna(height)
df_copy['height']

Unnamed: 0,height
0,174.0
1,179.0
2,169.0
3,161.7
4,179.0
5,165.0
6,170.0
7,162.0
8,164.5
9,170.536842


In [289]:
df_copy = df.copy()
df_copy['height']

Unnamed: 0,height
0,174.0
1,179.0
2,169.0
3,161.7
4,179.0
5,165.0
6,170.0
7,162.0
8,164.5
9,


In [290]:
# Fill missing heights with the median. Assign the result back rather than calling
# fillna(..., inplace=True) on df_copy['height']: the in-place call acts on an
# intermediate Series, which newer pandas warns about (chained assignment) and which
# does not update df_copy at all once Copy-on-Write is enabled.
df_copy['height'] = df_copy['height'].fillna(df_copy['height'].median())
df_copy['height']

Unnamed: 0,height
0,174.0
1,179.0
2,169.0
3,161.7
4,179.0
5,165.0
6,170.0
7,162.0
8,164.5
9,168.0


In [291]:
# dropna(): removes rows or columns that contain missing values; with the default
# settings a single missing value is enough for the row/column to be dropped.
# axis=0 (drop rows) is the default.
df_copy.dropna()

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,168.0,A,2079866
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499


In [292]:
df_copy.dropna(axis=1) # 결측값이 있는 열을 제거

Unnamed: 0,name,group,gender,birthday,height,brand
0,지민,방탄소년단,남자,1995-10-13,174.0,6267302
1,정국,방탄소년단,남자,1997-09-01,179.0,5805844
2,민지,뉴진스,여자,2004-05-07,169.0,4437081
3,하니,뉴진스,여자,2004-10-06,161.7,4161153
4,뷔,방탄소년단,남자,1995-12-30,179.0,3470048
5,다니엘,뉴진스,여자,2005-04-11,165.0,2341271
6,혜인,뉴진스,여자,2008-04-21,170.0,2301785
7,지수,블랙핑크,여자,1995-01-03,162.0,2227460
8,해린,뉴진스,여자,2006-05-15,164.5,2173376
9,태연,소녀시대,여자,1989-03-09,168.0,2079866


# **7. 행, 열 추가 및 삭제하기**

In [293]:
df_copy = df.copy()
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [294]:
dic = {
    'name': '김사과',
    'group': '과수원',
    'company': '애플',
    'gender': '여자',
    'birthday': '2000-01-01',
    'height': 160.0,
    'blood': 'A',
    'brand': 1234567
}

In [295]:
# concat(): 데이터를 합침. axis=0 (기본값)
df_copy = pd.concat([df_copy, pd.DataFrame(dic, index=[0])], ignore_index=True)
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [296]:
dic = {
    'name': '반하나',
    'group': '과수원',
    'company': '애플',
    'gender': '여자',
    'birthday': '1995-01-01',
    'height': 165.0,
    'blood': 'B',
    'brand': 1000000
}

In [297]:
df_copy.loc[len(df_copy)] = dic  # 컬럼명이 모두 같아야 추가됨
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [298]:
df_copy['nation'] = '대한민국'
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand,nation
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,대한민국
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,대한민국
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,대한민국
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,대한민국
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,대한민국
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,대한민국
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,대한민국
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,대한민국
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,대한민국
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,대한민국


In [299]:
df_copy.loc[df_copy['name'] == '김사과', 'nation'] = '미국'
df_copy.tail()

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand,nation
17,슬기,레드벨벳,빅히트,여자,1994-02-10,161.0,A,1741767,대한민국
18,강다니엘,워너원,,남자,1996-12-10,182.0,A,1706444,대한민국
19,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587,대한민국
20,김사과,과수원,애플,여자,2000-01-01,160.0,A,1234567,미국
21,반하나,과수원,애플,여자,1995-01-01,165.0,B,1000000,대한민국


In [300]:
# 행 제거하기
df_copy.drop(20, axis=0) # 0: 행, 1: 열

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand,nation
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,대한민국
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,대한민국
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,대한민국
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,대한민국
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,대한민국
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,대한민국
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,대한민국
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,대한민국
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,대한민국
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,대한민국


In [301]:
df_copy.drop([1, 3, 5, 7, 20, 21], axis=0)

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand,nation
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,대한민국
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,대한민국
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,대한민국
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,대한민국
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,대한민국
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,대한민국
10,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499,대한민국
11,제니,블랙핑크,와이지,여자,1996-01-16,163.0,B,2069250,대한민국
12,옹성우,워너원,판타지오,남자,1995-08-25,179.0,A,1954327,대한민국
13,리사,블랙핑크,와이지,여자,1997-03-27,167.0,A,1912800,대한민국


In [302]:
# 열 제거하기
df_copy.drop('nation', axis=1)

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [303]:
df_copy.drop(['nation', 'group', 'gender'], axis=1)

Unnamed: 0,name,company,birthday,height,blood,brand
0,지민,빅히트,1995-10-13,174.0,A,6267302
1,정국,빅히트,1997-09-01,179.0,A,5805844
2,민지,어도어,2004-05-07,169.0,A,4437081
3,하니,어도어,2004-10-06,161.7,O,4161153
4,뷔,빅히트,1995-12-30,179.0,AB,3470048
5,다니엘,어도어,2005-04-11,165.0,,2341271
6,혜인,어도어,2008-04-21,170.0,O,2301785
7,지수,와이지,1995-01-03,162.0,A,2227460
8,해린,어도어,2006-05-15,164.5,B,2173376
9,태연,에스엠,1989-03-09,,A,2079866


# **8. 통계 함수**

In [304]:
df_copy.describe()

Unnamed: 0,height,brand
count,21.0,22.0
mean,169.771429,2556290.0
std,7.311097,1395015.0
min,160.0,1000000.0
25%,164.5,1844210.0
50%,168.0,2069374.0
75%,179.0,2331400.0
max,182.0,6267302.0


In [305]:
df_copy['height'].sum() # 합계

np.float64(3565.2)

In [306]:
df_copy['height'].count() # 개수, NaN은 포함하지 않음

np.int64(21)

In [307]:
df_copy['height'].mean() # 평균

np.float64(169.77142857142857)

In [308]:
df_copy['height'].median() # 중앙값

168.0

※ 평균과 중앙값

평균은 모든 데이터를 더한 후 데이터 개수로 나눈 값으로, 전체 합을 모든 데이터에 고르게 나누었을 때 하나의 데이터가 갖게 되는 값을 의미합니다. 중앙값은 데이터를 크기 순서대로 정렬했을 때 가운데에 위치하는 값이며, 데이터 개수가 짝수이면 가운데 두 값의 평균을 사용합니다. 중앙값은 값의 순서(순위)로 결정되기 때문에 일부 값이 극단적으로 크거나 작아도 거의 변하지 않습니다. 데이터가 대칭에 가깝게 분포된 경우 평균과 중앙값은 비슷하거나 같습니다. 하지만 데이터에 극단값(Outlier)이 있는 경우 평균은 극단값의 영향을 받아 왜곡될 수 있지만, 중앙값은 비교적 안정적입니다.

In [309]:
df_copy['height'].max() # 최대값

182.0

In [310]:
df_copy['height'].min() # 최소값

160.0

In [311]:
df_copy['height'].var() # 분산

53.45214285714286

In [312]:
df_copy['height'].std() # 표준편차

7.311097240301407

※ 분산과 표준편차

분산(Variance)과 표준편차(Standard Deviation)는 데이터가 평균을 중심으로 얼마나 퍼져 있는지를 나타내는 산포도(분포 정도) 지표입니다. 분산은 각 데이터와 평균의 차이(편차)를 제곱한 값들의 평균입니다. 단, 판다스의 var()와 std()는 기본적으로 ddof=1을 사용하므로 편차 제곱의 합을 n이 아닌 n-1로 나눈 표본 분산·표본 표준편차를 반환합니다. 표준편차는 분산의 제곱근입니다. 분산은 편차를 제곱하므로 단위가 원래 데이터 단위의 제곱이 되는데, 제곱근을 취하면 원래 데이터와 같은 단위로 해석할 수 있습니다.

<img src="https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdna%2FlSDKp%2FbtsLh5aU1e8%2FAAAAAAAAAAAAAAAAAAAAALLo-v9lCG02Pjz21LsA2v3fa3ieSoACoMfBNnkw9FCc%2Fimg.png%3Fcredential%3DyqXZFxpELC7KVnFOS48ylbz2pIh7yKj8%26expires%3D1767193199%26allow_ip%3D%26allow_referer%3D%26signature%3DxwMZhWgOxQ2CmRKqwvQnZ%252FlQUro%253D">

# **9. 그룹**

In [313]:
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand,nation
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,대한민국
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,대한민국
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,대한민국
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,대한민국
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,대한민국
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,대한민국
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,대한민국
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,대한민국
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,대한민국
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,대한민국


In [314]:
df_copy.groupby('group')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ea9076b3c20>

In [315]:
df_copy.groupby('group').count()

Unnamed: 0_level_0,name,company,gender,birthday,height,blood,brand,nation
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
과수원,2,2,2,2,2,2,2,2
뉴진스,5,5,5,5,5,4,5,5
레드벨벳,2,2,2,2,2,2,2,2
방탄소년단,5,5,5,5,5,5,5,5
블랙핑크,4,4,4,4,4,4,4,4
소녀시대,2,2,2,2,1,2,2,2
워너원,2,1,2,2,2,2,2,2


In [316]:
df_copy.groupby('group').mean(numeric_only=True)

Unnamed: 0_level_0,height,brand
group,Unnamed: 1_level_1,Unnamed: 2_level_1
과수원,162.5,1117283.5
뉴진스,166.04,3082933.2
레드벨벳,164.5,1786140.5
방탄소년단,178.4,3858656.0
블랙핑크,165.0,2024410.5
소녀시대,168.0,1982581.5
워너원,180.5,1830385.5


In [317]:
df_copy.groupby('group').sum(numeric_only=True)

Unnamed: 0_level_0,height,brand
group,Unnamed: 1_level_1,Unnamed: 2_level_1
과수원,325.0,2234567
뉴진스,830.2,15414666
레드벨벳,329.0,3572281
방탄소년단,892.0,19293280
블랙핑크,660.0,8097642
소녀시대,168.0,3965163
워너원,361.0,3660771


In [318]:
df_copy.groupby('gender').mean(numeric_only=True)

Unnamed: 0_level_0,height,brand
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
남자,179.0,3279150.0
여자,165.157143,2218955.0


In [319]:
df_copy.groupby(['blood','gender']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,height,brand
blood,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
A,남자,179.0,3560683.0
A,여자,164.5,2209151.0
AB,남자,179.0,3470048.0
B,여자,165.7,1803211.0
O,남자,179.0,1680587.0
O,여자,165.85,3231469.0


In [320]:
df_copy.groupby(['blood','gender'])['height'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,height
blood,gender,Unnamed: 2_level_1
A,남자,179.0
A,여자,164.5
AB,남자,179.0
B,여자,165.7
O,남자,179.0
O,여자,165.85


# **10. 중복값 제거하기**

In [321]:
df_copy['blood']

Unnamed: 0,blood
0,A
1,A
2,A
3,O
4,AB
5,
6,O
7,A
8,B
9,A


In [322]:
df_copy['blood'].drop_duplicates()

Unnamed: 0,blood
0,A
3,O
4,AB
5,
8,B


In [323]:
df_copy['blood'].drop_duplicates(keep='last')

Unnamed: 0,blood
4,AB
5,
19,O
20,A
21,B


In [324]:
df_copy['blood'].value_counts()  # NaN is left out by default, so this output cannot tell you whether missing values exist

Unnamed: 0_level_0,count
blood,Unnamed: 1_level_1
A,12
B,5
O,3
AB,1


In [325]:
df_copy['blood'].value_counts(dropna=False)

Unnamed: 0_level_0,count
blood,Unnamed: 1_level_1
A,12
B,5
O,3
AB,1
,1


# **11. 데이터프레임 합치기**

In [326]:
df1 = pd.read_csv('/content/drive/MyDrive/랭체인 AI 영상객체탐지분석 플랫폼/9. 데이터 분석/data/idol.csv')
df2 = pd.read_csv('/content/drive/MyDrive/랭체인 AI 영상객체탐지분석 플랫폼/9. 데이터 분석/data/idol2.csv')

In [327]:
df1

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [328]:
df2

Unnamed: 0,이름,연봉,가족수
0,지민,3000,3
1,정국,3500,3
2,민지,3200,4
3,하니,3050,4
4,뷔,4300,3
5,다니엘,2900,5
6,혜인,3400,6
7,지수,4500,5
8,해린,4200,4
9,태연,4300,4


In [329]:
df_copy = df1.copy()

In [330]:
pd.concat([df1, df_copy]) # axis=0 (기본값)

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [331]:
df_concat = pd.concat([df1, df_copy])
# reset_index(): index를 새롭게 적용
# drop=True 옵션을 사용하여 기존 index가 컬럼으로 만들어지는 것을 방지
df_concat.reset_index(drop=True)

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [332]:
pd.concat([df1, df2], axis=1)  # join side by side, pairing rows that share the same index label

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,이름.1,연봉,가족수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,지민,3000,3
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,정국,3500,3
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,민지,3200,4
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,하니,3050,4
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,뷔,4300,3
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,다니엘,2900,5
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,혜인,3400,6
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,지수,4500,5
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,해린,4200,4
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,태연,4300,4


In [333]:
df3 = df2.drop([1, 3, 5, 7, 9])
df3

Unnamed: 0,이름,연봉,가족수
0,지민,3000,3
2,민지,3200,4
4,뷔,4300,3
6,혜인,3400,6
8,해린,4200,4
10,RM,3700,3
11,제니,3850,5
12,옹성우,3900,4
13,리사,4100,3
14,로제,4150,3


In [334]:
# df3 had index labels 1, 3, 5, 7, 9 dropped, so those rows have no partner
# and df3's columns are NaN there in the displayed result.
pd.concat([df1, df3], axis=1)

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,이름.1,연봉,가족수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,지민,3000.0,3.0
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,,,
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,민지,3200.0,4.0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,,,
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,뷔,4300.0,3.0
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,,,
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,혜인,3400.0,6.0
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,,,
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,해린,4200.0,4.0
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,,,


In [335]:
df_right = df2.drop([1, 3, 5, 7, 9], axis=0)
df_right

Unnamed: 0,이름,연봉,가족수
0,지민,3000,3
2,민지,3200,4
4,뷔,4300,3
6,혜인,3400,6
8,해린,4200,4
10,RM,3700,3
11,제니,3850,5
12,옹성우,3900,4
13,리사,4100,3
14,로제,4150,3


In [336]:
df_right = df_right.reset_index(drop=True)
df_right

Unnamed: 0,이름,연봉,가족수
0,지민,3000,3
1,민지,3200,4
2,뷔,4300,3
3,혜인,3400,6
4,해린,4200,4
5,RM,3700,3
6,제니,3850,5
7,옹성우,3900,4
8,리사,4100,3
9,로제,4150,3


In [337]:
dic = {
    '이름': '김사과',
    '연봉': 9000,
    '가족수': 10
}

In [338]:
# Turn the single-record dict into a one-row frame and append it to df_right;
# ignore_index=True renumbers the combined rows from 0.
df_right = pd.concat(
    objs=[df_right, pd.DataFrame(data=dic, index=[0])],
    ignore_index=True,
)
df_right

Unnamed: 0,이름,연봉,가족수
0,지민,3000,3
1,민지,3200,4
2,뷔,4300,3
3,혜인,3400,6
4,해린,4200,4
5,RM,3700,3
6,제니,3850,5
7,옹성우,3900,4
8,리사,4100,3
9,로제,4150,3


In [339]:
pd.concat([df1, df_right], axis=1)

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,이름.1,연봉,가족수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,지민,3000.0,3.0
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,민지,3200.0,4.0
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,뷔,4300.0,3.0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,혜인,3400.0,6.0
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,해린,4200.0,4.0
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,RM,3700.0,3.0
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,제니,3850.0,5.0
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,옹성우,3900.0,4.0
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,리사,4100.0,3.0
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,로제,4150.0,3.0


In [340]:
# merge(): combine two frames on a key column (typically a unique id)
# merge(left_frame, right_frame, on='key column', how='join type')
# how accepts: left, right, outer, inner, cross
pd.merge(df1, df_right, on='이름', how='left')

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,연봉,가족수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,3000.0,3.0
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,,
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,3200.0,4.0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,,
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,4300.0,3.0
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,,
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,3400.0,6.0
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,,
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,4200.0,4.0
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,,


In [341]:
pd.merge(df1, df_right, on='이름', how='right')

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,연봉,가족수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302.0,3000,3
1,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081.0,3200,4
2,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048.0,4300,3
3,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785.0,3400,6
4,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376.0,4200,4
5,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499.0,3700,3
6,제니,블랙핑크,와이지,여자,1996-01-16,163.0,B,2069250.0,3850,5
7,옹성우,워너원,판타지오,남자,1995-08-25,179.0,A,1954327.0,3900,4
8,리사,블랙핑크,와이지,여자,1997-03-27,167.0,A,1912800.0,4100,3
9,로제,블랙핑크,와이지,여자,1997-02-11,168.0,B,1888132.0,4150,3


In [342]:
pd.merge(df1, df_right, on='이름', how='inner')

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,연봉,가족수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,3000,3
1,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,3200,4
2,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,4300,3
3,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,3400,6
4,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,4200,4
5,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499,3700,3
6,제니,블랙핑크,와이지,여자,1996-01-16,163.0,B,2069250,3850,5
7,옹성우,워너원,판타지오,남자,1995-08-25,179.0,A,1954327,3900,4
8,리사,블랙핑크,와이지,여자,1997-03-27,167.0,A,1912800,4100,3
9,로제,블랙핑크,와이지,여자,1997-02-11,168.0,B,1888132,4150,3


In [343]:
pd.merge(df1, df_right, how='cross')

Unnamed: 0,이름_x,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,이름_y,연봉,가족수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,지민,3000,3
1,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,민지,3200,4
2,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,뷔,4300,3
3,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,혜인,3400,6
4,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,해린,4200,4
...,...,...,...,...,...,...,...,...,...,...,...
315,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587,조이,3500,3
316,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587,슬기,3200,4
317,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587,강다니엘,3050,4
318,진,방탄소년단,빅히트,남자,1992-12-04,179.0,O,1680587,진,4300,3


In [344]:
df_right.columns = ['성함', '연봉', '가족수']
df_right

Unnamed: 0,성함,연봉,가족수
0,지민,3000,3
1,민지,3200,4
2,뷔,4300,3
3,혜인,3400,6
4,해린,4200,4
5,RM,3700,3
6,제니,3850,5
7,옹성우,3900,4
8,리사,4100,3
9,로제,4150,3


In [345]:
# Using on='이름' here raises KeyError: '이름', since df_right's key column is '성함':
# pd.merge(df1, df_right, on='이름', how='inner')
# left_on / right_on name the key column of each frame separately.
pd.merge(df1, df_right, left_on='이름', right_on='성함', how='inner')

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,성함,연봉,가족수
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,지민,3000,3
1,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,민지,3200,4
2,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,뷔,4300,3
3,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,혜인,3400,6
4,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,해린,4200,4
5,RM,방탄소년단,빅히트,남자,1994-09-12,181.0,A,2069499,RM,3700,3
6,제니,블랙핑크,와이지,여자,1996-01-16,163.0,B,2069250,제니,3850,5
7,옹성우,워너원,판타지오,남자,1995-08-25,179.0,A,1954327,옹성우,3900,4
8,리사,블랙핑크,와이지,여자,1997-03-27,167.0,A,1912800,리사,4100,3
9,로제,블랙핑크,와이지,여자,1997-02-11,168.0,B,1888132,로제,4150,3


# **12. 등수 매기기**

In [346]:
# rank(): ranks the values of a DataFrame or Series. The default is ascending,
# so the smallest value gets rank 1 and the largest gets the highest rank.
df1['브랜드순위'] = df1['브랜드평판지수'].rank()
df1

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,브랜드순위
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,20.0
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,19.0
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,18.0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,17.0
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,16.0
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,15.0
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,14.0
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,13.0
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,12.0
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,11.0


In [347]:
# ascending=False: the largest 브랜드평판지수 gets rank 1.
df1['브랜드순위'] = df1['브랜드평판지수'].rank(ascending=False)
df1

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,브랜드순위
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,1.0
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,2.0
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,3.0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,4.0
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,5.0
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,6.0
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,7.0
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,8.0
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,9.0
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,10.0


In [348]:
# astype(): change the dtype of a column
# rank() produced float ranks (1.0, 2.0, ...); cast them to integers.
df1['브랜드순위'] = df1['브랜드순위'].astype(int)
df1

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,브랜드순위
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,1
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,2
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,3
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,4
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,5
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,6
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,7
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,8
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,9
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,10


In [349]:
df1['브랜드순위'].dtypes

dtype('int64')

# **13. 날짜타입 사용하기**

In [350]:
df_copy = df.copy()
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [351]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      20 non-null     object 
 1   group     20 non-null     object 
 2   company   19 non-null     object 
 3   gender    20 non-null     object 
 4   birthday  20 non-null     object 
 5   height    19 non-null     float64
 6   blood     19 non-null     object 
 7   brand     20 non-null     int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ KB


In [352]:
# to_datetime(): convert an object-dtype column to a datetime dtype
df_copy['birthday'] = pd.to_datetime(df_copy['birthday'])
print(type(df_copy['birthday']))
print(df_copy['birthday'].dtypes)

<class 'pandas.core.series.Series'>
datetime64[ns]


In [353]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   name      20 non-null     object        
 1   group     20 non-null     object        
 2   company   19 non-null     object        
 3   gender    20 non-null     object        
 4   birthday  20 non-null     datetime64[ns]
 5   height    19 non-null     float64       
 6   blood     19 non-null     object        
 7   brand     20 non-null     int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 1.4+ KB


In [354]:
df_copy['birthday'].dt.year

Unnamed: 0,birthday
0,1995
1,1997
2,2004
3,2004
4,1995
5,2005
6,2008
7,1995
8,2006
9,1989


In [355]:
df_copy['birthday'].dt.month

Unnamed: 0,birthday
0,10
1,9
2,5
3,10
4,12
5,4
6,4
7,1
8,5
9,3


In [356]:
df_copy['birthday'].dt.day

Unnamed: 0,birthday
0,13
1,1
2,7
3,6
4,30
5,11
6,21
7,3
8,15
9,9


In [357]:
df_copy['birthday'].dt.hour

Unnamed: 0,birthday
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [358]:
df_copy['birthday'].dt.minute

Unnamed: 0,birthday
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [359]:
df_copy['birthday'].dt.second

Unnamed: 0,birthday
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [360]:
df_copy['birthday'].dt.dayofweek # day of week: 0 (Monday) ~ 6 (Sunday)

Unnamed: 0,birthday
0,4
1,0
2,4
3,2
4,5
5,0
6,0
7,1
8,0
9,3


In [361]:
df_copy['birthday'].dt.isocalendar().week

Unnamed: 0,week
0,41
1,36
2,19
3,41
4,52
5,15
6,17
7,1
8,20
9,10


# **14. apply 사용하기**

Pandas의 apply() 함수는 데이터프레임이나 시리즈의 데이터를 사용자 정의 함수 또는 내장 함수에 적용하여 새로운 값을 계산하거나 변환할 때 사용됩니다. 데이터를 행(row) 또는 열(column) 단위로 처리할 수 있는 강력한 도구입니다.


In [362]:
df_copy = df.copy()
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [363]:
# Boolean-mask assignment: overwrite 'gender' with 1 on rows equal to '남자'
# and with 0 on rows equal to '여자'; other rows are left untouched.
df_copy.loc[df_copy['gender'] == '남자', 'gender'] = 1
df_copy.loc[df_copy['gender'] == '여자', 'gender'] = 0
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,1,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,1,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,0,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,0,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,1,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,0,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,0,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,0,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,0,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,0,1989-03-09,,A,2079866


In [364]:
df_copy = df.copy()
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [365]:
def male_or_female(x):
    """Encode a gender label as an integer flag.

    Returns 1 when x equals '남자', 0 when x equals '여자',
    and None for any other value.
    """
    if x == '남자':
        return 1
    if x == '여자':
        return 0
    # Anything that is neither label has no encoding.
    return None

In [366]:
print(male_or_female('남자'))
print(male_or_female('여자'))

1
0


In [367]:
df_copy['gender'].apply(male_or_female)

Unnamed: 0,gender
0,1
1,1
2,0
3,0
4,1
5,0
6,0
7,0
8,0
9,0


In [368]:
df_copy['gender'].apply(lambda x: 1 if x == '남자' else 0)

Unnamed: 0,gender
0,1
1,1
2,0
3,0
4,1
5,0
6,0
7,0
8,0
9,0


In [369]:
# Store the encoded flag in a new column and keep the original 'gender'.
# Note: this lambda maps every value other than '남자' to 0.
df_copy['NewGender'] = df_copy['gender'].apply(lambda x: 1 if x == '남자' else 0)
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand,NewGender
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,1
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,1
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,0
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,1
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,0
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,0
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,0
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,0
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,0


# **15. map 사용하기**

Pandas의 map() 함수는 Series 객체에서 사용할 수 있는 함수로, 각 요소에 대해 함수나 매핑 규칙을 적용하여 새로운 값을 계산하거나 변환할 때 사용됩니다. map()은 데이터의 각 요소를 순회하며 특정 작업을 수행하므로, 데이터를 가공하거나 변환하는 데 유용합니다.

In [370]:
df_copy = df.copy()
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [371]:
map_gender = {'남자':1, '여자':0}

In [372]:
df_copy['gender'].map(map_gender)

Unnamed: 0,gender
0,1
1,1
2,0
3,0
4,1
5,0
6,0
7,0
8,0
9,0


In [373]:
df_copy['NewGender'] = df_copy['gender'].map(map_gender)
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand,NewGender
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,1
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,1
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,0
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,1
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,0
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,0
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,0
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,0
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,0


# **16. 데이터프레임의 산술연산**

In [374]:
df1 = pd.DataFrame({
    '파이썬':[60, 70, 80, 90, 95],
    '데이터분석':[40, 60, 70, 55, 87],
    '머신러닝딥러닝':[35, 40, 30, 70, 55]
})

df1

Unnamed: 0,파이썬,데이터분석,머신러닝딥러닝
0,60,40,35
1,70,60,40
2,80,70,30
3,90,55,70
4,95,87,55


In [375]:
df1['파이썬'].dtypes

dtype('int64')

In [376]:
type(df1['파이썬'])

In [377]:
df1['파이썬'] + df1['데이터분석'] + df1['머신러닝딥러닝']

Unnamed: 0,0
0,135
1,170
2,180
3,215
4,237


In [378]:
# Add two derived columns to df1: 총점 (row-wise total of the three subject
# columns) and 평균 (that total divided by 3, the number of subjects).
df1['총점'] = df1['파이썬'] + df1['데이터분석'] + df1['머신러닝딥러닝']
df1['평균'] = df1['총점'] / 3
df1

Unnamed: 0,파이썬,데이터분석,머신러닝딥러닝,총점,평균
0,60,40,35,135,45.0
1,70,60,40,170,56.666667
2,80,70,30,180,60.0
3,90,55,70,215,71.666667
4,95,87,55,237,79.0


In [379]:
df1['파이썬'].sum() # same as df1['파이썬'].sum(axis=0)

np.int64(395)

In [380]:
df1['파이썬'].mean()

np.float64(79.0)

In [381]:
df1.sum()

Unnamed: 0,0
파이썬,395.0
데이터분석,312.0
머신러닝딥러닝,230.0
총점,937.0
평균,312.333333


In [382]:
df1.mean()

Unnamed: 0,0
파이썬,79.0
데이터분석,62.4
머신러닝딥러닝,46.0
총점,187.4
평균,62.466667


In [383]:
df1 = pd.DataFrame({
    '파이썬':[60, 70, 80, 90, 95],
    '데이터분석':[40, 60, 70, 55, 87],
    '머신러닝딥러닝':[35, 40, 30, 70, 55]
})

df2 = pd.DataFrame({
    '파이썬':['C', 'B', 'B', 'A', 'A'],
    '데이터분석':[40, 60, 70, 55, 87],
    '머신러닝딥러닝':[35, 40, 30, 70, 55]
})

In [384]:
# df1 + df2 raises TypeError: unsupported operand type(s) for +: 'int' and 'str'
# Adding a scalar applies it to every element of the frame.
df1 + 10
# df2 + 10 raises TypeError: can only concatenate str (not "int") to str

Unnamed: 0,파이썬,데이터분석,머신러닝딥러닝
0,70,50,45
1,80,70,50
2,90,80,40
3,100,65,80
4,105,97,65


In [385]:
df1 = pd.DataFrame({
    '데이터분석':[40, 60, 70, 55, 87],
    '머신러닝딥러닝':[35, 40, 30, 70, 55]
})

df2 = pd.DataFrame({
    '데이터분석':[40, 60, 70, 55],
    '머신러닝딥러닝':[35, 40, 30, 70]
})

In [386]:
# Frame addition pairs rows by index label. df2 has no row labelled 4, so
# that row has nothing to add to and comes out as NaN in the result.
df1 + df2 # rows present in only one frame yield NaN

Unnamed: 0,데이터분석,머신러닝딥러닝
0,80.0,70.0
1,120.0,80.0
2,140.0,60.0
3,110.0,140.0
4,,


# **17. select_dtypes**

In [387]:
df_copy = df.copy()
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [388]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      20 non-null     object 
 1   group     20 non-null     object 
 2   company   19 non-null     object 
 3   gender    20 non-null     object 
 4   birthday  20 non-null     object 
 5   height    19 non-null     float64
 6   blood     19 non-null     object 
 7   brand     20 non-null     int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ KB


In [389]:
df_copy.select_dtypes(include='object') # keep only the object-dtype (string) columns

Unnamed: 0,name,group,company,gender,birthday,blood
0,지민,방탄소년단,빅히트,남자,1995-10-13,A
1,정국,방탄소년단,빅히트,남자,1997-09-01,A
2,민지,뉴진스,어도어,여자,2004-05-07,A
3,하니,뉴진스,어도어,여자,2004-10-06,O
4,뷔,방탄소년단,빅히트,남자,1995-12-30,AB
5,다니엘,뉴진스,어도어,여자,2005-04-11,
6,혜인,뉴진스,어도어,여자,2008-04-21,O
7,지수,블랙핑크,와이지,여자,1995-01-03,A
8,해린,뉴진스,어도어,여자,2006-05-15,B
9,태연,소녀시대,에스엠,여자,1989-03-09,A


In [390]:
df_copy.select_dtypes(exclude='object') # keep everything except the object-dtype (string) columns

Unnamed: 0,height,brand
0,174.0,6267302
1,179.0,5805844
2,169.0,4437081
3,161.7,4161153
4,179.0,3470048
5,165.0,2341271
6,170.0,2301785
7,162.0,2227460
8,164.5,2173376
9,,2079866


In [391]:
# Add 10 only to the columns that are not object-dtype (non-string columns).
df_copy.select_dtypes(exclude='object') + 10

Unnamed: 0,height,brand
0,184.0,6267312
1,189.0,5805854
2,179.0,4437091
3,171.7,4161163
4,189.0,3470058
5,175.0,2341281
6,180.0,2301795
7,172.0,2227470
8,174.5,2173386
9,,2079876


In [392]:
# Collect just the names of the object-dtype (string) columns and show them.
str_cols = df_copy.select_dtypes(include='object').columns
str_cols

Index(['name', 'group', 'company', 'gender', 'birthday', 'blood'], dtype='object')

In [393]:
df[str_cols]

Unnamed: 0,name,group,company,gender,birthday,blood
0,지민,방탄소년단,빅히트,남자,1995-10-13,A
1,정국,방탄소년단,빅히트,남자,1997-09-01,A
2,민지,뉴진스,어도어,여자,2004-05-07,A
3,하니,뉴진스,어도어,여자,2004-10-06,O
4,뷔,방탄소년단,빅히트,남자,1995-12-30,AB
5,다니엘,뉴진스,어도어,여자,2005-04-11,
6,혜인,뉴진스,어도어,여자,2008-04-21,O
7,지수,블랙핑크,와이지,여자,1995-01-03,A
8,해린,뉴진스,어도어,여자,2006-05-15,B
9,태연,소녀시대,에스엠,여자,1989-03-09,A


# **18. get_dummies**
get_dummies()는 Pandas에서 범주형 데이터를 원-핫 인코딩(one-hot encoding) 방식으로 변환하는 데 사용됩니다.

### 1. 데이터의 종류
<img src="https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdna%2FvmnOx%2FdJMcacnX56W%2FAAAAAAAAAAAAAAAAAAAAAGpDja0-RlnxiVyllwjgZkpmqk9189V9r98SJ6LIeOsy%2Fimg.png%3Fcredential%3DyqXZFxpELC7KVnFOS48ylbz2pIh7yKj8%26expires%3D1767193199%26allow_ip%3D%26allow_referer%3D%26signature%3DxW93p%252B%252FdQJ6DxVBH5dtFyZ3UwV8%253D">

### 2. 데이터 분석에서 범주형 컬럼 판단 기준
데이터 분석 또는 머신러닝 모델을 만들 때, 각 컬럼(특징)을 범주형 데이터로 변환할지, 또는 삭제해야 할지 판단하는 과정은 매우 중요합니다. 특히 실제 데이터는 숫자처럼 보이지만 의미가 없는 것들도 많고, 범주형으로 처리하지 않으면 분석 결과가 왜곡되는 경우도 많습니다.

1. 값의 종류가 적고 의미 있는 그룹을 형성할 때

- 성별, 지역, 직급, 요일, 날씨

> 이런 값들은 자연스럽게 “그룹 분석”이 가능하고, 평균·합계를 내는 것보다 범주별 비교가 더 의미 있는 케이스입니다.

2. 숫자처럼 보이지만 수학적 의미가 없을 때

- 회원번호, 상품 코드, 우편번호, 전화번호

> 겉으로는 숫자이지만, 덧셈∙평균 같은 연산은 전혀 의미가 없습니다. 따라서 문자열 또는 범주형으로 변환해야 분석에 적합합니다.

3. 머신러닝 입력을 위해 인코딩이 필요한 경우

- 컬러(color): red / blue / green, 브라우저(browser): chrome / edge / safari

> 머신러닝은 문자열을 직접 다루지 못하므로, 범주형으로 변환한 뒤 원-핫 인코딩(One-Hot Encoding) 또는 레이블 인코딩(Label Encoding)을 적용합니다.

4. 연속 값이지만 범주로 나누는 것이 더 의미 있을 때

- 나이 → 10대/20대/30대, 매출 → 고/중/저, 위험등급 → Low / Medium / High

> 이처럼 의미 있는 구간(bin)을 만들어 카테고리화하면 분석 인사이트가 더 명확해집니다.

### 3. 어떤 컬럼을 제거(drop)하는 것이 좋을까?

1. 고유값(Unique)이 너무 많아 패턴이 없는 경우

- 주문번호, 주민번호, UUID, 세션 ID, 상세주소

> 이런 컬럼들은 대부분 값이 전부 다르기 때문에 모델이 패턴을 학습할 수 없고 오히려 노이즈로 작용합니다.

2. 지나치게 상세한 정보(과도한 해상도)

- 초 단위 timestamp, 도로명 + 상세 주소 (건물번호, 호수까지 포함)

> 이런 값들은 정보는 많지만 분석에서 의미 있는 패턴을 제공하지 않습니다. 일반적으로 다음처럼 가공해야 합니다:

```
시간 → year, month, hour, weekday, 주소 → 시/구 단위
```

3. 결측치가 지나치게 많아 의미가 없을 때

- 예: 전체의 80% 이상이 null

> 대부분의 경우 해당 컬럼은 과감히 삭제하는 것이 모델의 성능에 이롭습니다.

### 4. 원-핫 인코딩

원-핫 인코딩은 각 범주를 별도의 열로 변환하고, 해당 범주에 해당하는 곳에 1을, 나머지에는 0을 채우는 방식입니다. 예를 들어, 데이터가 "Red", "Green", "Blue"와 같은 문자열이라면, 모델은 이를 이해하지 못합니다. 범주형 데이터를 숫자로 변환해야 모델이 계산할 수 있습니다. 원-핫 인코딩은 범주형 데이터를 숫자로 변환하면서도 각 범주 간의 순서나 크기를 부여하지 않습니다.

In [408]:
df_copy = df.copy()
df_copy
# Re-copy from df so that re-running this section starts from a fresh frame.

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [409]:
# Label encoding: each blood type is replaced by an arbitrary integer code.
# A row whose blood value is missing has no key in blood_map and maps to NaN,
# which is why blood_code is displayed as floats (0.0, 3.0, ...).
blood_map = {'A':0, 'B':1, 'AB':2, 'O':3}
df_copy['blood_code'] = df_copy['blood'].map(blood_map) # label encoding
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand,blood_code
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302,0.0
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844,0.0
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081,0.0
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153,3.0
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048,2.0
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271,
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785,3.0
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460,0.0
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376,1.0
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866,0.0


In [410]:
# One-hot encode the blood column: one boolean column per blood type.
# The row with a missing blood value (row 5 in the output) is False everywhere.
pd.get_dummies(df_copy['blood'])

Unnamed: 0,A,AB,B,O
0,True,False,False,False
1,True,False,False,False
2,True,False,False,False
3,False,False,False,True
4,False,True,False,False
5,False,False,False,False
6,False,False,False,True
7,True,False,False,False
8,False,False,True,False
9,True,False,False,False


In [411]:
df_copy = df.copy()
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,blood,brand
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,A,6267302
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,A,5805844
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,A,4437081
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,O,4161153
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,AB,3470048
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,,2341271
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,O,2301785
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,A,2227460
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,B,2173376
9,태연,소녀시대,에스엠,여자,1989-03-09,,A,2079866


In [412]:
# With columns=['blood'], the blood column is replaced by prefixed dummy
# columns (blood_A, blood_AB, blood_B, blood_O); the other columns are kept.
df_copy = pd.get_dummies(df_copy, columns=['blood'])
df_copy

Unnamed: 0,name,group,company,gender,birthday,height,brand,blood_A,blood_AB,blood_B,blood_O
0,지민,방탄소년단,빅히트,남자,1995-10-13,174.0,6267302,True,False,False,False
1,정국,방탄소년단,빅히트,남자,1997-09-01,179.0,5805844,True,False,False,False
2,민지,뉴진스,어도어,여자,2004-05-07,169.0,4437081,True,False,False,False
3,하니,뉴진스,어도어,여자,2004-10-06,161.7,4161153,False,False,False,True
4,뷔,방탄소년단,빅히트,남자,1995-12-30,179.0,3470048,False,True,False,False
5,다니엘,뉴진스,어도어,여자,2005-04-11,165.0,2341271,False,False,False,False
6,혜인,뉴진스,어도어,여자,2008-04-21,170.0,2301785,False,False,False,True
7,지수,블랙핑크,와이지,여자,1995-01-03,162.0,2227460,True,False,False,False
8,해린,뉴진스,어도어,여자,2006-05-15,164.5,2173376,False,False,True,False
9,태연,소녀시대,에스엠,여자,1989-03-09,,2079866,True,False,False,False


In [413]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      20 non-null     object 
 1   group     20 non-null     object 
 2   company   19 non-null     object 
 3   gender    20 non-null     object 
 4   birthday  20 non-null     object 
 5   height    19 non-null     float64
 6   brand     20 non-null     int64  
 7   blood_A   20 non-null     bool   
 8   blood_AB  20 non-null     bool   
 9   blood_B   20 non-null     bool   
 10  blood_O   20 non-null     bool   
dtypes: bool(4), float64(1), int64(1), object(5)
memory usage: 1.3+ KB
