# Pandas
- 파이썬에서 데이터를 다룰 수 있게 만들어진 라이브러리
- 2차원 배열 데이터에 대한 집계, 전처리를 굉장히 쉽게 할 수 있게 해준다.
- 정형 데이터를 다루는 라이브러리
- numpy 라이브러리를 내부적으로 그대로 활용한다.
- 대량의 데이터를 읽어들여 분석하는 데 최적화
- 데이터 분석에 특화된 다양한 기능들을 제공
- 다른 시스템 및 라이브러리(데이터베이스, 머신러닝 라이브러리 등)와 쉽게 연결

# Pandas의 데이터 구조
- DataFrame: 2차원 구조로 되어있는 형식(행렬)
- Series: 1차원 구조로 되어있는 한 종류의 데이터(열 방향 벡터)

In [1]:
import numpy as np
import pandas as pd

- DataFrame 만들기

In [2]:
# Each inner list is one row. No column labels are given, so pandas
# numbers the columns 0, 1, 2.
data = [["A군", 43, 170], ["B군", 25, 180]]

pd.DataFrame(data)

Unnamed: 0,0,1,2
0,A군,43,170
1,B군,25,180


In [3]:
# Same two rows, this time with explicit column labels.
# NOTE: the second label is spelled "나 이" (with a space); a later cell
# looks the column up using that exact spelling.
data = [["A군", 43, 170], ["B군", 25, 180]]

df = pd.DataFrame(data=data, columns=["이름", "나 이", "키"])
df

Unnamed: 0,이름,나 이,키
0,A군,43,170
1,B군,25,180


In [4]:
# Fresh sample rows (one inner list per row).
data = [["c군", 43, 170], ["d군", 25, 180]]

In [5]:
# A nested list converts to a DataFrame directly; no intermediate
# np.array(...) call or extra indexer is needed to display the full frame.
pd.DataFrame(data)

Unnamed: 0,0,1,2
0,c군,43,170
1,d군,25,180


In [6]:
type(df)

pandas.core.frame.DataFrame

- Series

In [7]:
df["이름"]

0    A군
1    B군
Name: 이름, dtype: object

In [8]:
type(df["이름"])

pandas.core.series.Series

In [9]:
df["나 이"]

0    43
1    25
Name: 나 이, dtype: int64

In [10]:
# Dict form: each key becomes a column label and each list holds that
# column's values.
data = {"이름": ["A군", "B군"], "나이": [35, 43]}
pd.DataFrame(data=data)

Unnamed: 0,이름,나이
0,A군,35
1,B군,43


# CSV 파일 불러오기

In [11]:
# Google Colab only: uncomment both lines to mount Google Drive at
# /content/drive.
#from google.colab import drive
#drive.mount("/content/drive")

In [12]:
# Folder the CSV files are read from. A raw string keeps the Windows
# backslashes literal without doubling them.
DATA_PATH = r"C:\study\01_numpy&pandas\data"

```
passengerid: 승객ID
survived: 생존여부(0: 사망, 1: 생존)
pclass: 객실 등급(1~3)
name: 이름
gender: 성별
age: 나이
sibsp: 함께 탑승한 형제자매/배우자 수
parch: 함께 탑승한 부모/자식 수
ticket: 티켓번호
fare: 운임료
cabin: 객실번호
embarked: 탑승항구(C, S, Q)
```

- read_csv 함수
    - csv 파일을 읽어서 dataframe 객체로 반환

In [13]:
# Load titanic_train.csv from DATA_PATH into a DataFrame and display it.
df = pd.read_csv(DATA_PATH + "\\titanic_train.csv")
df

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


- to_csv 메서드
    - dataframe 객체를 csv 파일로 저장

In [14]:
# Write the frame to data.csv in the working directory; index=False keeps
# the row index out of the file.
df.to_csv(path_or_buf="data.csv", index=False)

# 데이터 프레임 기초 정보 확인하기

- 컬럼명 확인

In [15]:
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'gender', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [16]:
df.columns.tolist()

['passengerid',
 'survived',
 'pclass',
 'name',
 'gender',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked']

In [17]:
df["pclass"].tolist()

[1,
 3,
 3,
 2,
 1,
 3,
 3,
 3,
 3,
 1,
 3,
 2,
 2,
 2,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 2,
 1,
 3,
 1,
 3,
 3,
 1,
 2,
 1,
 3,
 3,
 3,
 3,
 1,
 1,
 3,
 3,
 2,
 1,
 2,
 3,
 3,
 1,
 2,
 3,
 3,
 2,
 1,
 2,
 2,
 1,
 3,
 2,
 3,
 1,
 3,
 3,
 1,
 3,
 2,
 3,
 1,
 3,
 1,
 1,
 3,
 2,
 3,
 2,
 1,
 3,
 3,
 2,
 3,
 2,
 1,
 3,
 2,
 2,
 2,
 2,
 3,
 1,
 2,
 3,
 3,
 3,
 1,
 3,
 2,
 2,
 3,
 1,
 2,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 1,
 2,
 3,
 3,
 3,
 3,
 3,
 2,
 3,
 2,
 2,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 2,
 1,
 3,
 2,
 1,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 3,
 1,
 3,
 3,
 3,
 2,
 1,
 3,
 3,
 3,
 2,
 2,
 3,
 3,
 1,
 3,
 3,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 3,
 3,
 3,
 2,
 1,
 3,
 1,
 2,
 2,
 3,
 3,
 3,
 1,
 1,
 3,
 3,
 3,
 3,
 1,
 3,
 1,
 3,
 2,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 3,
 3,
 3,
 3,
 2,
 3,
 1,
 3,
 1,
 1,
 2,
 3,
 3,
 3,
 3,
 3,
 2,
 3,
 3,
 1,
 3,
 3,
 3,
 3,
 1,
 3,
 3,
 2,
 3,
 3,
 3,
 3,
 3,


In [18]:
# Series -> Python list -> ndarray. The list step is an extra hop:
# np.array also accepts the Series itself.
np.array(df["pclass"].tolist())

array([1, 3, 3, 2, 1, 3, 3, 3, 3, 1, 3, 2, 2, 2, 1, 1, 3, 3, 3, 3, 3, 3,
       1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 2, 1, 3, 1, 3, 3, 1, 2, 1, 3, 3, 3,
       3, 1, 1, 3, 3, 2, 1, 2, 3, 3, 1, 2, 3, 3, 2, 1, 2, 2, 1, 3, 2, 3,
       1, 3, 3, 1, 3, 2, 3, 1, 3, 1, 1, 3, 2, 3, 2, 1, 3, 3, 2, 3, 2, 1,
       3, 2, 2, 2, 2, 3, 1, 2, 3, 3, 3, 1, 3, 2, 2, 3, 1, 2, 3, 3, 3, 3,
       3, 1, 3, 3, 3, 1, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 1, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 1, 2, 1, 3, 2, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3,
       3, 1, 3, 3, 3, 2, 1, 3, 3, 3, 2, 2, 3, 3, 1, 3, 3, 1, 1, 1, 2, 2,
       1, 2, 1, 3, 3, 3, 2, 1, 3, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 3, 3, 1,
       3, 1, 3, 2, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3,
       3, 2, 3, 1, 3, 1, 1, 2, 3, 3, 3, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 1,
       3, 3, 2, 3, 3, 3, 3, 3, 2, 1, 1, 1, 3, 3, 3, 3, 3, 2, 1, 3, 1, 1,
       2, 2, 3, 2, 3, 1, 3, 3, 1, 3, 2, 3, 3, 3, 2, 1, 1, 1, 1, 2, 1, 3,
       1, 1, 3, 3, 1, 1, 2, 3, 1, 3, 3, 3, 3, 3, 3,

In [19]:
# np.array takes a Series directly; df["pclass"].to_numpy() is the
# pandas-native way to get the same array.
np.array(df["pclass"])

array([1, 3, 3, 2, 1, 3, 3, 3, 3, 1, 3, 2, 2, 2, 1, 1, 3, 3, 3, 3, 3, 3,
       1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 2, 1, 3, 1, 3, 3, 1, 2, 1, 3, 3, 3,
       3, 1, 1, 3, 3, 2, 1, 2, 3, 3, 1, 2, 3, 3, 2, 1, 2, 2, 1, 3, 2, 3,
       1, 3, 3, 1, 3, 2, 3, 1, 3, 1, 1, 3, 2, 3, 2, 1, 3, 3, 2, 3, 2, 1,
       3, 2, 2, 2, 2, 3, 1, 2, 3, 3, 3, 1, 3, 2, 2, 3, 1, 2, 3, 3, 3, 3,
       3, 1, 3, 3, 3, 1, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 1, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 1, 2, 1, 3, 2, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3,
       3, 1, 3, 3, 3, 2, 1, 3, 3, 3, 2, 2, 3, 3, 1, 3, 3, 1, 1, 1, 2, 2,
       1, 2, 1, 3, 3, 3, 2, 1, 3, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 3, 3, 1,
       3, 1, 3, 2, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3,
       3, 2, 3, 1, 3, 1, 1, 2, 3, 3, 3, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 1,
       3, 3, 2, 3, 3, 3, 3, 3, 2, 1, 1, 1, 3, 3, 3, 3, 3, 2, 1, 3, 1, 1,
       2, 2, 3, 2, 3, 1, 3, 3, 1, 3, 2, 3, 3, 3, 2, 1, 1, 1, 1, 2, 1, 3,
       1, 1, 3, 3, 1, 1, 2, 3, 1, 3, 3, 3, 3, 3, 3,

- 데이터 프레임 정보 확인

In [20]:
# Prints the index range, per-column non-null counts and dtypes, and the
# memory usage of the frame.
df.info() # optional keyword arguments: verbose=True, show_counts=True

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  916 non-null    int64  
 1   survived     916 non-null    int64  
 2   pclass       916 non-null    int64  
 3   name         916 non-null    object 
 4   gender       916 non-null    object 
 5   age          736 non-null    float64
 6   sibsp        916 non-null    int64  
 7   parch        916 non-null    int64  
 8   ticket       916 non-null    object 
 9   fare         916 non-null    float64
 10  cabin        210 non-null    object 
 11  embarked     916 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 86.0+ KB


In [21]:
df.shape

(916, 12)

In [22]:
df.head(20)

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.05,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.025,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S
5,952,0,3,"Dika, Mr. Mirko",male,17.0,0,0,349232,7.8958,,S
6,130,0,3,"Ekstrom, Mr. Johan",male,45.0,0,0,347061,6.975,,S
7,331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q
8,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S
9,119,0,1,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,247.5208,B58 B60,C


In [23]:
df.tail()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S
915,146,0,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,S


# 데이터 프레임 다루기

- copy 메서드
    - 데이터프레임을 복사(깊은 복사)

In [24]:
# Independent copy of df (deep=True is the default, spelled out here).
df_cp = df.copy(deep=True)
df_cp

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


- 컬럼명 변경하기

In [25]:
type(df_cp.columns )

pandas.core.indexes.base.Index

In [26]:
# Replace every column label in one assignment. `cols` supplies one new
# label per existing column (12 labels for the 12 columns of the frame).
cols = ["id", "생존", "객실등급", "이름", "성별", "나이", "형제자매_배우자수",
        "부모자식수", "티켓번호", "요금", "객실번호", "탑승항구"]
df_cp.columns = cols
df_cp.head()

Unnamed: 0,id,생존,객실등급,이름,성별,나이,형제자매_배우자수,부모자식수,티켓번호,요금,객실번호,탑승항구
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.05,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.025,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S


In [27]:
# Index objects are immutable, so assigning to a single position fails:
# df_cp.columns[2] = "객실" # raises TypeError

- rename 메서드
    - 지정한 컬럼명들을 변경할 수 있음
    - 딕셔너리 형태로 인수를 전달하면 된다.
        - key: 변경전 컬럼이름
        - value: 변경후 컬럼이름

In [28]:
# Map old label -> new label. rename returns a new frame and leaves df_cp
# itself unchanged.
rename_dict = {"id": "아이디"}
df_cp.rename(mapper=rename_dict, axis="columns")

Unnamed: 0,아이디,생존,객실등급,이름,성별,나이,형제자매_배우자수,부모자식수,티켓번호,요금,객실번호,탑승항구
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


- add_prefix
    - 컬럼명 앞부분에 공통된 문자열을 붙여준다.

In [29]:
# Prefix every column label with "num_"; returns a new frame and leaves df
# unchanged. pandas is already imported as pd in the first code cell, so no
# import is needed here.
df.add_prefix("num_")

Unnamed: 0,num_passengerid,num_survived,num_pclass,num_name,num_gender,num_age,num_sibsp,num_parch,num_ticket,num_fare,num_cabin,num_embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


- add_suffix
    - 컬럼명 뒷부분에 공통된 문자열을 붙여준다.

In [30]:
df.add_suffix("_suf")

Unnamed: 0,passengerid_suf,survived_suf,pclass_suf,name_suf,gender_suf,age_suf,sibsp_suf,parch_suf,ticket_suf,fare_suf,cabin_suf,embarked_suf
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


- 특정 컬럼들 선택

In [31]:
# Select several columns by label; a list selector yields a DataFrame.
cols = ["name", "gender", "age"]
df.loc[:, cols]

Unnamed: 0,name,gender,age
0,"Artagaveytia, Mr. Ramon",male,71.0
1,"Morley, Mr. William",male,34.0
2,"Kink-Heilmann, Mr. Anton",male,29.0
3,"Hiltunen, Miss. Marta",female,18.0
4,"Anderson, Mr. Harry",male,48.0
...,...,...,...
911,"Lesurer, Mr. Gustave J",male,35.0
912,"Ryan, Mr. Patrick",male,
913,"Coleff, Mr. Peju",male,36.0
914,"Rekic, Mr. Tido",male,38.0


In [32]:
target = df["survived"]
target

0      0
1      0
2      0
3      1
4      1
      ..
911    1
912    0
913    0
914    0
915    0
Name: survived, Length: 916, dtype: int64

- 컬럼 삭제하기

In [33]:
# Drop a column by naming the axis ("columns" is the same axis as 1).
# The result is a new frame; df keeps the column.
df.drop(labels="survived", axis="columns")

Unnamed: 0,passengerid,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


In [34]:
# Several columns can be dropped at once by passing a list of labels.
df.drop(labels=["name", "survived"], axis="columns")

Unnamed: 0,passengerid,pclass,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,1,male,71.0,0,0,PC 17609,49.5042,,C
1,462,3,male,34.0,0,0,364506,8.0500,,S
2,1286,3,male,29.0,3,1,315153,22.0250,,S
3,1130,2,female,18.0,1,1,250650,13.0000,,S
4,461,1,male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...
911,738,1,male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,3,male,,0,0,371110,24.1500,,Q
913,664,3,male,36.0,0,0,349210,7.4958,,S
914,109,3,male,38.0,0,0,349249,7.8958,,S


In [35]:
# df.drop(3,axis=0) # axis=0 works along rows: drops the row labelled 3

In [36]:
df.drop(columns=["name","age"])

Unnamed: 0,passengerid,survived,pclass,gender,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,male,0,0,PC 17609,49.5042,,C
1,462,0,3,male,0,0,364506,8.0500,,S
2,1286,0,3,male,3,1,315153,22.0250,,S
3,1130,1,2,female,1,1,250650,13.0000,,S
4,461,1,1,male,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,male,0,0,PC 17755,512.3292,B101,C
912,518,0,3,male,0,0,371110,24.1500,,Q
913,664,0,3,male,0,0,349210,7.4958,,S
914,109,0,3,male,0,0,349249,7.8958,,S


- 컬럼을 기준으로 정렬

In [37]:
# Rows with a missing age are placed last.
df.sort_values(by="age") # ascending order is the default

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
852,1246,1,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.5750,,S
480,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5000,,S
289,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
41,1173,0,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.7750,,S
587,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
881,508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.5500,,S
893,668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.7750,,S
895,66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
897,458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S


In [38]:
# Rows with a missing age still come last when the order is reversed.
df.sort_values(by="age", ascending=False) # descending order

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
694,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0000,A23,S
729,988,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1,0,19877,78.8500,C46,S
375,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
22,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
...,...,...,...,...,...,...,...,...,...,...,...,...
881,508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.5500,,S
893,668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.7750,,S
895,66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
897,458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S


In [39]:
# Sort by age, breaking ties by gender; a single ascending=False applies to
# both keys.
df.sort_values(by=["age","gender"], ascending=False)

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
694,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0000,A23,S
729,988,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1,0,19877,78.8500,C46,S
375,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
22,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
...,...,...,...,...,...,...,...,...,...,...,...,...
755,236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.5500,,S
759,928,1,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.0500,,S
793,110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.1500,,Q
834,416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.0500,,S


In [40]:
# One direction per key: age ascending, gender descending within equal ages.
df.sort_values(by=["age", "gender"], ascending=[True, False])


Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
852,1246,1,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.5750,,S
480,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5000,,S
41,1173,0,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.7750,,S
289,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
587,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
755,236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.5500,,S
759,928,1,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.0500,,S
793,110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.1500,,Q
834,416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.0500,,S


- 데이터 프레임 섞기


In [41]:
# frac=1 samples every row exactly once, i.e. a shuffle of the whole frame;
# random_state=42 makes the shuffled order reproducible.
df2 = df.sample(frac=1,random_state=42)
df2

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
380,78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.0500,,S
879,274,0,1,"Natsch, Mr. Charles H",male,37.0,0,1,PC 17596,29.7000,C118,C
355,782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17.0,1,0,17474,57.0000,B20,S
357,956,0,1,"Ryerson, Master. John Borie",male,13.0,2,2,PC 17608,262.3750,B57 B59 B63 B66,C
362,1234,0,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.5500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
106,1249,0,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S
270,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C
860,1114,1,2,"Cook, Mrs. (Selena Rogers)",female,22.0,0,0,W./C. 14266,10.5000,F33,S
435,86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu...",female,33.0,3,0,3101278,15.8500,,S


# 행렬 슬라이싱

In [42]:
# Read the training CSV again so df is freshly loaded.
df = pd.read_csv(DATA_PATH + "\\titanic_train.csv")
df

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


## iloc
- 행번호, 열번호를 이용한 슬라이싱

In [43]:
# Positional slice: the first four rows and the columns at positions 1 and 2
# (end positions are excluded).
df.iloc[:4,1:3]

Unnamed: 0,survived,pclass
0,0,1
1,0,3
2,0,3
3,1,2


In [44]:
# A list of integer positions selects exactly those rows, in the given order.
index_list = [0,2,3,5]
df.iloc[index_list]

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.025,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0,,S
5,952,0,3,"Dika, Mr. Mirko",male,17.0,0,0,349232,7.8958,,S


In [45]:
# A step of -1 on the column axis reverses the column order; all rows are kept.
df.iloc[:,::-1]

Unnamed: 0,embarked,cabin,fare,ticket,parch,sibsp,age,gender,name,pclass,survived,passengerid
0,C,,49.5042,PC 17609,0,0,71.0,male,"Artagaveytia, Mr. Ramon",1,0,494
1,S,,8.0500,364506,0,0,34.0,male,"Morley, Mr. William",3,0,462
2,S,,22.0250,315153,1,3,29.0,male,"Kink-Heilmann, Mr. Anton",3,0,1286
3,S,,13.0000,250650,1,1,18.0,female,"Hiltunen, Miss. Marta",2,1,1130
4,S,E12,26.5500,19952,0,0,48.0,male,"Anderson, Mr. Harry",1,1,461
...,...,...,...,...,...,...,...,...,...,...,...,...
911,C,B101,512.3292,PC 17755,0,0,35.0,male,"Lesurer, Mr. Gustave J",1,1,738
912,Q,,24.1500,371110,0,0,,male,"Ryan, Mr. Patrick",3,0,518
913,S,,7.4958,349210,0,0,36.0,male,"Coleff, Mr. Peju",3,0,664
914,S,,7.8958,349249,0,0,38.0,male,"Rekic, Mr. Tido",3,0,109


In [46]:
# Last row by position, shown as a Series. A lone row indexer already selects
# every column, so the explicit ":" is not needed.
display(df.iloc[-1])

passengerid                             146
survived                                  0
pclass                                    2
name           Nicholls, Mr. Joseph Charles
gender                                 male
age                                    19.0
sibsp                                     1
parch                                     1
ticket                           C.A. 33112
fare                                  36.75
cabin                                   NaN
embarked                                  S
Name: 915, dtype: object

In [48]:
df.index[:-1]

RangeIndex(start=0, stop=915, step=1)

In [47]:
# "pclass" for every row except the last one.
value = df["pclass"].iloc[:-1]
value

0      1
1      3
2      3
3      2
4      1
      ..
910    3
911    1
912    3
913    3
914    3
Name: pclass, Length: 915, dtype: int64

In [49]:
df.index[-1]

915

## loc
- index 명과 column명을 이용한 행렬 선택
- 마스킹을 이용한 행렬 선택 가능

In [50]:
df[:100]

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
95,266,0,2,"Reeves, Mr. David",male,36.0,0,0,C.A. 17248,10.5000,,S
96,281,0,3,"Duane, Mr. Frank",male,65.0,0,0,336439,7.7500,,Q
97,504,0,3,"Laitinen, Miss. Kristina Sofia",female,37.0,0,0,4135,9.5875,,S
98,576,0,3,"Patchett, Mr. George",male,19.0,0,0,358585,14.5000,,S


In [51]:
df.loc[:,"pclass"]

0      1
1      3
2      3
3      2
4      1
      ..
911    1
912    3
913    3
914    3
915    2
Name: pclass, Length: 916, dtype: int64

In [52]:
df.loc[:, :"pclass"] # label slices include the end label

Unnamed: 0,passengerid,survived,pclass
0,494,0,1
1,462,0,3
2,1286,0,3
3,1130,1,2
4,461,1,1
...,...,...,...
911,738,1,1
912,518,0,3
913,664,0,3
914,109,0,3


In [53]:
df.loc[:,"pclass"]

0      1
1      3
2      3
3      2
4      1
      ..
911    1
912    3
913    3
914    3
915    2
Name: pclass, Length: 916, dtype: int64

In [54]:
# A list selector (even with a single label) returns a DataFrame, whereas a
# bare label returns a Series.
df.loc[:,["pclass"]]

Unnamed: 0,pclass
0,1
1,3
2,3
3,2
4,1
...,...
911,1
912,3
913,3
914,3


In [55]:
# .loc with lists of labels returns the rows and columns in the order listed.
index_list = [458,100,50]
column_list = ["name", "fare"]
df.loc[index_list, column_list]

Unnamed: 0,name,fare
458,"Lingane, Mr. John",12.35
100,"Bostandyeff, Mr. Guentcho",7.8958
50,"Allen, Miss. Elisabeth Walton",211.3375


- 행번호에 맞게 새로운 인덱스 생성하기

In [56]:
# Returns a new frame; df itself is not modified.
df.reset_index(drop=True) #drop=True: the old index is discarded and a new integer index is generated.

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


- 마스킹 해보기

In [57]:
mask = df["survived"] > 0 # True only for survivors
mask

0      False
1      False
2      False
3       True
4       True
       ...  
911     True
912    False
913    False
914    False
915    False
Name: survived, Length: 916, dtype: bool

In [58]:
df.loc[mask]

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
7,331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.2500,,Q
11,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
13,802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
905,982,1,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judi...",female,22.0,1,0,347072,13.9000,,S
906,339,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.0500,,S
907,225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38.0,1,0,19943,90.0000,C93,S
908,377,1,3,"Landergren, Miss. Aurora Adelia",female,22.0,0,0,C 7077,7.2500,,S


In [59]:
# Boolean mask that is True for rows whose pclass equals 1; .loc keeps only
# those rows.
mask = df["pclass"] == 1
df.loc[mask]

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
9,119,0,1,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,247.5208,B58 B60,C
14,914,1,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
15,1215,0,1,"Rowe, Mr. Alfred G",male,33.0,0,0,113790,26.5500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
888,642,1,1,"Sagesser, Mlle. Emma",female,24.0,0,0,PC 17477,69.3000,B35,C
897,458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S
899,497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54.0,1,0,36947,78.2667,D20,C
907,225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38.0,1,0,19943,90.0000,C93,S


# 다중 조건을 주어 마스킹
- DataFrame, Series, numpy 배열에는 파이썬의 논리 연산자(`and`, `or`, `not`)를 사용할 수 없다(요소별 연산을 지원하지 않는다).
- 대신 비트 연산자를 사용하면 bool 값에 대한 요소별 논리 연산을 할 수 있다.
- `and` 대신 `&`: 둘다 True이면 True 아니면 False
- `or` 대신 `|`: 둘중에 하나라도 True면 True 아니면 False
- `not` 대신 `~`: True 면 False, False 면 True

In [60]:
# Survivors who are also younger than 20. Each comparison is parenthesised
# because & binds more tightly than > and <.
mask = (df["survived"] > 0) & (df["age"] < 20)
df.loc[mask]

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
30,1175,1,3,"Touma, Miss. Maria Youssef",female,9.0,1,1,2650,15.2458,,C
45,701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18.0,1,0,PC 17757,227.5250,C62 C64,C
55,1095,1,2,"Quick, Miss. Winifred Vera",female,8.0,1,1,26360,26.0000,,S
72,979,1,3,"Badman, Miss. Emily Louisa",female,18.0,0,0,A/4 31416,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
859,85,1,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S
867,856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.3500,,S
874,692,1,3,"Karun, Miss. Manca",female,4.0,0,1,349256,13.4167,,C
889,531,1,2,"Quick, Miss. Phyllis May",female,2.0,1,1,26360,26.0000,,S


In [61]:
# Either condition is enough: survived OR younger than 20.
mask1 = df["survived"] > 0
mask2 = df["age"] < 20
df.loc[mask1 | mask2]

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
5,952,0,3,"Dika, Mr. Mirko",male,17.0,0,0,349232,7.8958,,S
7,331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.2500,,Q
8,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.2750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
907,225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38.0,1,0,19943,90.0000,C93,S
908,377,1,3,"Landergren, Miss. Aurora Adelia",female,22.0,0,0,C 7077,7.2500,,S
909,1086,0,2,"Drew, Master. Marshall Brines",male,8.0,0,2,28220,32.5000,,S
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C


In [62]:
# Same OR-mask on the rows, with the result restricted to two columns.
mask1 = df["survived"] > 0
mask2 = df["age"] < 20
df.loc[mask1 | mask2,["pclass","parch"]]

Unnamed: 0,pclass,parch
3,2,1
4,1,0
5,3,0
7,3,0
8,3,2
...,...,...
907,1,0
908,3,0
909,2,2
911,1,0


In [63]:
# ~ inverts the mask: keeps the rows where mask1 is False.
df.loc[~mask1]

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S
5,952,0,3,"Dika, Mr. Mirko",male,17.0,0,0,349232,7.8958,,S
6,130,0,3,"Ekstrom, Mr. Johan",male,45.0,0,0,347061,6.9750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
910,535,0,3,"Cacic, Miss. Marija",female,30.0,0,0,315084,8.6625,,S
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S


# 데이터 형식에 기반한 컬럼 선택

In [64]:
df.select_dtypes("float64")

Unnamed: 0,age,fare
0,71.0,49.5042
1,34.0,8.0500
2,29.0,22.0250
3,18.0,13.0000
4,48.0,26.5500
...,...,...
911,35.0,512.3292
912,,24.1500
913,36.0,7.4958
914,38.0,7.8958


In [65]:
# A list selects the columns whose dtype matches any of the entries.
df.select_dtypes(["float64", "int64"])

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare
0,494,0,1,71.0,0,0,49.5042
1,462,0,3,34.0,0,0,8.0500
2,1286,0,3,29.0,3,1,22.0250
3,1130,1,2,18.0,1,1,13.0000
4,461,1,1,48.0,0,0,26.5500
...,...,...,...,...,...,...,...
911,738,1,1,35.0,0,0,512.3292
912,518,0,3,,0,0,24.1500
913,664,0,3,36.0,0,0,7.4958
914,109,0,3,38.0,0,0,7.8958


In [66]:
df.select_dtypes("number")

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare
0,494,0,1,71.0,0,0,49.5042
1,462,0,3,34.0,0,0,8.0500
2,1286,0,3,29.0,3,1,22.0250
3,1130,1,2,18.0,1,1,13.0000
4,461,1,1,48.0,0,0,26.5500
...,...,...,...,...,...,...,...
911,738,1,1,35.0,0,0,512.3292
912,518,0,3,,0,0,24.1500
913,664,0,3,36.0,0,0,7.4958
914,109,0,3,38.0,0,0,7.8958


In [67]:
df.select_dtypes("object")

Unnamed: 0,name,gender,ticket,cabin,embarked
0,"Artagaveytia, Mr. Ramon",male,PC 17609,,C
1,"Morley, Mr. William",male,364506,,S
2,"Kink-Heilmann, Mr. Anton",male,315153,,S
3,"Hiltunen, Miss. Marta",female,250650,,S
4,"Anderson, Mr. Harry",male,19952,E12,S
...,...,...,...,...,...
911,"Lesurer, Mr. Gustave J",male,PC 17755,B101,C
912,"Ryan, Mr. Patrick",male,371110,,Q
913,"Coleff, Mr. Peju",male,349210,,S
914,"Rekic, Mr. Tido",male,349249,,S


# 그동안 배운 문자열 간의 연산과 수치 간의 연산들이 Series 단위로 지원 된다.

In [68]:
# Element-wise sum of two columns stored as a new column; Series.add is the
# method form of the + operator.
df["가족수"] = df["sibsp"].add(df["parch"])
df

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked,가족수
0,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,0
1,462,0,3,"Morley, Mr. William",male,34.0,0,0,364506,8.0500,,S,0
2,1286,0,3,"Kink-Heilmann, Mr. Anton",male,29.0,3,1,315153,22.0250,,S,4
3,1130,1,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0000,,S,2
4,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C,0
912,518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.1500,,Q,0
913,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.4958,,S,0
914,109,0,3,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S,0


In [69]:
# + on string Series concatenates element-wise; the scalar "_" is applied to
# every row.
df["gender"] + "_" + df["name"]

0           male_Artagaveytia, Mr. Ramon
1               male_Morley, Mr. William
2          male_Kink-Heilmann, Mr. Anton
3           female_Hiltunen, Miss. Marta
4               male_Anderson, Mr. Harry
                     ...                
911          male_Lesurer, Mr. Gustave J
912               male_Ryan, Mr. Patrick
913                male_Coleff, Mr. Peju
914                 male_Rekic, Mr. Tido
915    male_Nicholls, Mr. Joseph Charles
Length: 916, dtype: object

In [70]:
# Label -> integer code lookup (presumably meant for encoding the gender
# column; confirm against the cells that use it).
replace_dict = dict(male=1, female=0)

In [71]:
df[["gender"]]

Unnamed: 0,gender
0,male
1,male
2,male
3,female
4,male
...,...
911,male
912,male
913,male
914,male


In [72]:
df["gender"]

0        male
1        male
2        male
3      female
4        male
        ...  
911      male
912      male
913      male
914      male
915      male
Name: gender, Length: 916, dtype: object

In [73]:
# Encode gender through the replace_dict lookup defined in an earlier cell
# ("male" -> 1, "female" -> 0). Unlike an if/else lambda, a dict lookup turns
# any label missing from the dict into NaN instead of silently coding it as 0.
# Returns a new Series; df["gender"] itself is left unchanged.
df["gender"].map(replace_dict)

0      1
1      1
2      1
3      0
4      1
      ..
911    1
912    1
913    1
914    1
915    1
Name: gender, Length: 916, dtype: int64

In [74]:
df["gender"]

0        male
1        male
2        male
3      female
4        male
        ...  
911      male
912      male
913      male
914      male
915      male
Name: gender, Length: 916, dtype: object

In [75]:
df.loc[:,["gender","name"]]

Unnamed: 0,gender,name
0,male,"Artagaveytia, Mr. Ramon"
1,male,"Morley, Mr. William"
2,male,"Kink-Heilmann, Mr. Anton"
3,female,"Hiltunen, Miss. Marta"
4,male,"Anderson, Mr. Harry"
...,...,...
911,male,"Lesurer, Mr. Gustave J"
912,male,"Ryan, Mr. Patrick"
913,male,"Coleff, Mr. Peju"
914,male,"Rekic, Mr. Tido"
