# Indexing, 데이터 조작, Index 조작
>Pandas 데이터의 index에 관한 다양한 조작법을 공부한다.

- loc() : 라벨값 기반의 2차원 인덱싱
- iloc() : 순서를 나타내는 정수 기반의 2차원 인덱싱

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## loc 익히기
>type을 확인해보자.

In [2]:
# df.loc[행 인덱싱값]
# df.loc[행 인덱싱값, 열 인덱싱값]

sample_df = pd.DataFrame(np.arange(10, 22).reshape(3,4),
                        index=['a','b','c'],
                        columns=['A','B','C','D'])
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [3]:
display(sample_df.loc['a'])
display(type(sample_df.loc['a']))
print('*'*50)

display(sample_df.loc['a'].values)
display(type(sample_df.loc['a'].values))

A    10
B    11
C    12
D    13
Name: a, dtype: int32

pandas.core.series.Series

**************************************************


array([10, 11, 12, 13])

numpy.ndarray

In [4]:
# sample_df.loc['b' : 'c']

# sample_df['b' : 'c']

sample_df.loc[['b' , 'c']]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [5]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [6]:
# sample_df['A']
sample_df.A

a    10
b    14
c    18
Name: A, dtype: int32

In [7]:
display(type(sample_df.A))
display(type(sample_df.A > 15))

pandas.core.series.Series

pandas.core.series.Series

In [8]:
sample_df.loc[sample_df.A > 15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [9]:
# error : loc[]에는 존재하는 인덱스를 입력해야한다.
# sample_df.loc[1] or sample_df.loc['d']

In [10]:
sample_df2=pd.DataFrame(np.arange(10,26).reshape(4,4),
                       columns=['A','B','C','D'])
display(sample_df2)
display(sample_df2.loc[1:2])

Unnamed: 0,A,B,C,D
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


Unnamed: 0,A,B,C,D
1,14,15,16,17
2,18,19,20,21


In [11]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [12]:
sample_df.loc['a', 'A']

10

In [13]:
sample_df.loc['b':'c', 'A']

b    14
c    18
Name: A, dtype: int32

In [14]:
display(sample_df.loc['a', ])
display(type(sample_df.loc['a', ]))

A    10
B    11
C    12
D    13
Name: a, dtype: int32

pandas.core.series.Series

In [15]:
sample_df.loc['b':, 'C':]

Unnamed: 0,C,D
b,16,17
c,20,21


In [16]:
sample_df.loc[sample_df.A>10,['C','D']]

Unnamed: 0,C,D
b,16,17
c,20,21


## iloc 익히기

In [17]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [18]:
sample_df.iloc[0, 1]

11

In [19]:
sample_df.iloc[ : , 1]

a    11
b    15
c    19
Name: B, dtype: int32

In [20]:
sample_df.iloc[0, -2:]

C    12
D    13
Name: a, dtype: int32

In [21]:
sample_df.iloc[-1, 1:3]

B    19
C    20
Name: c, dtype: int32

In [81]:
sample_df.iloc[-1] = sample_df.iloc[-1] * 2
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,36,38,40,42


### 데이터 갯수 세어보기 :  count

In [23]:
s = pd.Series(range(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [24]:
s[5] = np.NaN
s[2] = np.NAN
s.count()

8

In [25]:
np.random.seed(2)
count_df = pd.DataFrame(np.random.randint(5, size=(4,4)), dtype=np.float)
count_df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,4.0
3,4.0,3.0,4.0,2.0


In [26]:
count_df.iloc[1, 0] = np.NaN
count_df.iloc[2, 3] = np.NaN
count_df.iloc[2, 3] = np.NaN

In [27]:
count_df.count()

0    3
1    4
2    4
3    3
dtype: int64

In [2]:
import seaborn as sns
titanic = sns.load_dataset('titanic')

In [3]:
display(type(titanic))
print('*'*50)

display(titanic.info())
print('*'*50)

display(titanic.values)

pandas.core.frame.DataFrame

**************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


None

**************************************************


array([[0, 3, 'male', ..., 'Southampton', 'no', False],
       [1, 1, 'female', ..., 'Cherbourg', 'yes', False],
       [1, 3, 'female', ..., 'Southampton', 'yes', True],
       ...,
       [0, 3, 'female', ..., 'Southampton', 'no', False],
       [1, 1, 'male', ..., 'Cherbourg', 'yes', True],
       [0, 3, 'male', ..., 'Queenstown', 'no', True]], dtype=object)

In [4]:
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

### value_counts()

In [5]:
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [6]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [7]:
type(titanic['pclass'])

pandas.core.series.Series

In [8]:
titanic['pclass'].value_counts()
# 특정 Series의 value들을 count

3    491
1    216
2    184
Name: pclass, dtype: int64

In [9]:
titanic['pclass'].value_counts().values

array([491, 216, 184], dtype=int64)

In [10]:
# 새로운 열 추가 age_0 일괄적으로 0 할당
titanic['age_0'] = 0

In [11]:
# age의 각 값에 10을 곱한 age_by_10 컬럼 생성
titanic['age_by_10'] = titanic['age']*10
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,220.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,380.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,260.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,350.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,350.0


In [12]:
# parch와 sibSp의 값과 1을 더한 family_no 컬럼 생성
titanic['family_no'] = titanic['parch'] + titanic['sibsp'] + 1

In [13]:
# age_by_10 컬럼 값에 일괄적으로 +100 처리
titanic['age_by_10'] = titanic['age_by_10'] + 100

In [14]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,320.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,480.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,360.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,450.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,450.0,1


In [15]:
# age_0 열을 삭제하기
titanic_drop_df = titanic.drop('age_0', axis=1)

In [16]:
titanic_drop_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_by_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,320.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,480.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,360.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,450.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,450.0,1


In [17]:
# 원본에서 age_0, age_by_10, family_no 칼럼을 삭제
titanic.drop(['age_0', 'age_by_10', 'family_no'], axis=1, inplace=True)

In [18]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [19]:
# 0, 1, 2 번째 행을 삭제하여 원본 프레임에 반영하기
titanic.drop([0,1,2], axis=0, inplace=True)

In [20]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [21]:
display(type(titanic.index.values))
display(titanic.index.shape)

numpy.ndarray

(888,)

### index에 대한 slicing, indexing

In [22]:
# index 5개를 꺼내오기
titanic.index[:5].values

array([3, 4, 5, 6, 7], dtype=int64)

In [23]:
# 6 index 를 꺼내오기
titanic.index[6]

9

In [24]:
series_fair = titanic['fare']
print('series value', series_fair)
print('type', type(series_fair))

series value 3       53.1000
4        8.0500
5        8.4583
6       51.8625
7       21.0750
8       11.1333
9       30.0708
10      16.7000
11      26.5500
12       8.0500
13      31.2750
14       7.8542
15      16.0000
16      29.1250
17      13.0000
18      18.0000
19       7.2250
20      26.0000
21      13.0000
22       8.0292
23      35.5000
24      21.0750
25      31.3875
26       7.2250
27     263.0000
28       7.8792
29       7.8958
30      27.7208
31     146.5208
32       7.7500
         ...   
861     11.5000
862     25.9292
863     69.5500
864     13.0000
865     13.0000
866     13.8583
867     50.4958
868      9.5000
869     11.1333
870      7.8958
871     52.5542
872      5.0000
873      9.0000
874     24.0000
875      7.2250
876      9.8458
877      7.8958
878      7.8958
879     83.1583
880     26.0000
881      7.8958
882     10.5167
883     10.5000
884      7.0500
885     29.1250
886     13.0000
887     30.0000
888     23.4500
889     30.0000
890      7.7500
Name: fare,

In [39]:
# max, min, sum
print( 'max', series_fair.max())
print( 'min', series_fair.min())
print( 'sum', series_fair.sum())
print( 'sum', np.sum(series_fair))
print("*"*50)

print( 'DC 10%', series_fair * 0.9)

max 512.3292
min 0.0
sum 28607.491
sum 28607.491
**************************************************
DC 10% 3       47.79000
4        7.24500
5        7.61247
6       46.67625
7       18.96750
8       10.01997
9       27.06372
10      15.03000
11      23.89500
12       7.24500
13      28.14750
14       7.06878
15      14.40000
16      26.21250
17      11.70000
18      16.20000
19       6.50250
20      23.40000
21      11.70000
22       7.22628
23      31.95000
24      18.96750
25      28.24875
26       6.50250
27     236.70000
28       7.09128
29       7.10622
30      24.94872
31     131.86872
32       6.97500
         ...    
861     10.35000
862     23.33628
863     62.59500
864     11.70000
865     11.70000
866     12.47247
867     45.44622
868      8.55000
869     10.01997
870      7.10622
871     47.29878
872      4.50000
873      8.10000
874     21.60000
875      6.50250
876      8.86122
877      7.10622
878      7.10622
879     74.84247
880     23.40000
881      7.10622
882      

### reset_index() : 새로운 인덱스를 할당하고, 기존 인덱스는 인덱스

In [40]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [41]:
titanic_reset_index_df = titanic.reset_index()
titanic_reset_index_df.head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
1,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
2,5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
3,6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
4,7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [42]:
titanic_reset_index_df[titanic_reset_index_df['pclass']==3].head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
2,5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
4,7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
5,8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
7,10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False


In [43]:
titanic_reset_index_df.iloc[[4,6,8],[2,4,6]]

Unnamed: 0,pclass,age,parch
4,3,2.0,1
6,2,14.0,0
8,1,58.0,0


In [44]:
# age 가 60 이상인 정보만 추출하고 싶다면?
titanic_reset_index_df[titanic_reset_index_df['age'] > 60 ].head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
30,33,0,2,male,66.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
51,54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False
93,96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
113,116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
167,170,0,1,male,61.0,0,0,33.5,S,First,man,True,B,Southampton,no,True


In [45]:
# age 가 60 이상인 pclass, survived, who 만 추출하기
titanic_reset_index_df.loc[titanic_reset_index_df['age'] > 60,['pclass', 'survived', 'who']].head()

Unnamed: 0,pclass,survived,who
30,2,0,man
51,1,0,man
93,1,0,man
113,3,0,man
167,1,0,man


### 여러개의 복합 조건을 이용해서 볼링인덱스를 만드는 것도 가능
- and -> &
- or -> |
- not ->|,~

In [46]:
#  나이 60 이상, 선실등급이 1등급이고 성별이 여자인 데이터 추출
titanic_reset_index_df[(titanic_reset_index_df['age'] > 60)
                     & (titanic_reset_index_df['pclass'] == 1)
                     & (titanic_reset_index_df['sex'] == 'female')]

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
272,275,1,1,female,63.0,1,0,77.9583,S,First,woman,False,D,Southampton,yes,False
826,829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,,yes,True


## 정렬
- sort_index
- sort_values

In [47]:
np.random.seed(100)
sort_df = pd.DataFrame(np.random.randint(0,10,(6,4)))
sort_df

Unnamed: 0,0,1,2,3
0,8,8,3,7
1,7,0,4,2
2,5,2,2,2
3,1,0,8,4
4,0,9,6,2
5,4,1,5,3


In [48]:
sort_df.columns = ['A','B','C','D']
sort_df.index   = pd.date_range('20201014', periods=6)
sort_df

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-15,7,0,4,2
2020-10-16,5,2,2,2
2020-10-17,1,0,8,4
2020-10-18,0,9,6,2
2020-10-19,4,1,5,3


In [49]:
# error
# np.random.shuffle(sort_df.index)

# 순열 랜덤 치환
random_date = np.random.permutation(sort_df.index)
random_date

array(['2020-10-14T00:00:00.000000000', '2020-10-16T00:00:00.000000000',
       '2020-10-15T00:00:00.000000000', '2020-10-17T00:00:00.000000000',
       '2020-10-19T00:00:00.000000000', '2020-10-18T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [50]:
# index 재설정, column 순서 지정
sort_df2 = sort_df.reindex(index=random_date,
                           columns = ['B','A','C','D'])
sort_df2

Unnamed: 0,B,A,C,D
2020-10-14,8,8,3,7
2020-10-16,2,5,2,2
2020-10-15,0,7,4,2
2020-10-17,0,1,8,4
2020-10-19,1,4,5,3
2020-10-18,9,0,6,2


In [51]:
# axis = 0 : row, axis = 1 : col
sort_df2.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2020-10-14,7,3,8,8
2020-10-16,2,2,2,5
2020-10-15,2,4,0,7
2020-10-17,4,8,0,1
2020-10-19,3,5,1,4
2020-10-18,2,6,9,0


In [52]:
sort_df2.sort_index(axis=0, ascending=True)

Unnamed: 0,B,A,C,D
2020-10-14,8,8,3,7
2020-10-15,0,7,4,2
2020-10-16,2,5,2,2
2020-10-17,0,1,8,4
2020-10-18,9,0,6,2
2020-10-19,1,4,5,3


In [53]:
# 특정 컬럼 값으로 행 정렬
# sort_df2.sort_values(by='B', ascending=True)
sort_df2.sort_values(by=['B','A'], ascending=True)

Unnamed: 0,B,A,C,D
2020-10-17,0,1,8,4
2020-10-15,0,7,4,2
2020-10-19,1,4,5,3
2020-10-16,2,5,2,2
2020-10-14,8,8,3,7
2020-10-18,9,0,6,2


In [54]:
sort_df2.sum(axis=1)

2020-10-14    26
2020-10-16    11
2020-10-15    13
2020-10-17    13
2020-10-19    13
2020-10-18    17
dtype: int64

In [55]:
sort_df2['row_sum'] = sort_df2.sum(axis=1)
sort_df2

Unnamed: 0,B,A,C,D,row_sum
2020-10-14,8,8,3,7,26
2020-10-16,2,5,2,2,11
2020-10-15,0,7,4,2,13
2020-10-17,0,1,8,4,13
2020-10-19,1,4,5,3,13
2020-10-18,9,0,6,2,17


In [56]:
sort_df2.loc['col_sum', : ] = sort_df2.mean(axis=0)
sort_df2

Unnamed: 0,B,A,C,D,row_sum
2020-10-14 00:00:00,8.0,8.0,3.0,7.0,26.0
2020-10-16 00:00:00,2.0,5.0,2.0,2.0,11.0
2020-10-15 00:00:00,0.0,7.0,4.0,2.0,13.0
2020-10-17 00:00:00,0.0,1.0,8.0,4.0,13.0
2020-10-19 00:00:00,1.0,4.0,5.0,3.0,13.0
2020-10-18 00:00:00,9.0,0.0,6.0,2.0,17.0
col_sum,3.333333,4.166667,4.666667,3.333333,15.5


In [57]:
# 타이타닉호 승객의 평균 나이

print('타이타닉호 승객의 평균 나이 \n:',
      titanic_reset_index_df['age'].mean())
print('\n')

# 타이타닉호 승객 중 여성 승객의 평균 나이

female_index = titanic_reset_index_df['sex'] == 'female'
print('타이타닉호 승객 중 여성 승객의 평균 나이 \n:',
      titanic_reset_index_df.loc[female_index, 'age'].mean())
print('\n')

# 타이타닉호 승객 중 1등실 선실의 여성 승객의 평균 나이

pclass_index = titanic_reset_index_df['pclass'] == 1
wanted_index = female_index & pclass_index
print('타이타닉호 승객 중 1등실 선실의 여성 승객의 평균 나이 \n:',
     titanic_reset_index_df.loc[wanted_index, 'age'].mean())

타이타닉호 승객의 평균 나이 
: 29.703473980309422


타이타닉호 승객 중 여성 승객의 평균 나이 
: 27.884169884169886


타이타닉호 승객 중 1등실 선실의 여성 승객의 평균 나이 
: 34.57142857142857


In [58]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


## apply 변환
- 행이나 열 단위로 복잡한 데이터 가공이 필요한 경우 사용하는 함수
- lambda 식
- apply 함수는 인자로 함수를 넘겨 받을 수 있다.

In [59]:
def get_square(a) :
    return a**2

In [60]:
print('제곱근 : ', get_square(3))

제곱근 :  9


In [61]:
# 위 코드를 람다식으로 변경
lambda_square = lambda a : a**2
print("제곱근 : ",lambda_square(3))

제곱근 :  9


In [62]:
np.random.seed(100)
apply_df=pd.DataFrame(np.random.randint(0,10,(6,4)))
apply_df.columns = ['A','B','C','D']
apply_df.index = pd.date_range('20201014', periods=6)
apply_df

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-15,7,0,4,2
2020-10-16,5,2,2,2
2020-10-17,1,0,8,4
2020-10-18,0,9,6,2
2020-10-19,4,1,5,3


In [63]:
# 각 행의 column에 대해서 최대-최솟값을 구해 새로운 column 추가
# 각 column 안에서 최대 - 최솟값을 구해 철력
func = lambda x : x.max() - x.min()

In [64]:
apply_df['row max-min'] = apply_df.apply(func, axis = 1)
apply_df

Unnamed: 0,A,B,C,D,row max-min
2020-10-14,8,8,3,7,5
2020-10-15,7,0,4,2,7
2020-10-16,5,2,2,2,3
2020-10-17,1,0,8,4,8
2020-10-18,0,9,6,2,9
2020-10-19,4,1,5,3,4


In [65]:
# embark_town의 문자열 갯수를 별도의 컬럼인 embark_len 컬럼을 추가

# if ~ else 절을 활용하여 나이가 15세 이하면 child, 그렇지 않으면 adult 로 구분하는
# child_adult 추가하라

In [66]:
titanic_reset_index_df['embark_len'] = titanic_reset_index_df['embark_town'].apply(lambda x : len(str(x)))
titanic_reset_index_df[['embark_town', 'embark_len']].head(3)

Unnamed: 0,embark_town,embark_len
0,Southampton,11
1,Southampton,11
2,Queenstown,10


In [67]:
titanic_reset_index_df['child_adult'] = titanic_reset_index_df['age'].apply(lambda x : 'child' if x < 15 else 'adult')
titanic_reset_index_df[['age', 'child_adult']].head(8)

Unnamed: 0,age,child_adult
0,35.0,adult
1,35.0,adult
2,,adult
3,54.0,adult
4,2.0,child
5,27.0,adult
6,14.0,child
7,4.0,child


- lambda if ~ else 구문 형식
- lambda 매개변수 : true if 조건식 else false

In [71]:
# 15세 이하는 child, 15~60 까지는 adult, 61 이상은 elderly로 분류해서 age_division 컬럼에 저장
# lambda 식으로 apply 함수를 이용해서 구현하기
titanic_reset_index_df['age_division'] = titanic_reset_index_df['age'].apply(lambda x : 'child' if x <= 15 else ('adult' if x<= 60 else 'elderly'))
titanic_reset_index_df['age_division'].value_counts()

adult      606
elderly    199
child       83
Name: age_division, dtype: int64

In [73]:
# 나이에 따라 세분화된 분류를 수행하는 함수 생성
def get_category(age) :
    category = ""
    if age <= 5 :
        category = 'baby'
    elif age <= 12 :
        category = 'child'
    elif age <= 19 :
        category = 'teenager'
    elif age <= 24 :
        category = 'student'
    elif age <= 39 :
        category = 'young adult'
    elif age <= 60 :
        category = 'adult'
    else :
        category = 'elderly'
    return category

In [77]:
titanic_reset_index_df['age_category'] = titanic_reset_index_df['age'].apply(lambda x : get_category(x))
titanic_reset_index_df['age_category'].value_counts()

young adult    272
elderly        199
adult          141
student        112
teenager        95
baby            44
child           25
Name: age_category, dtype: int64

In [79]:
# apply() 함수에 dataframe이 넘어오는 경우
titanic_reset_index_df['child/adult'] = titanic_reset_index_df.apply(lambda f : 'adult' if f.age >= 20 else 'child', axis=1)
titanic_reset_index_df['child/adult'].value_counts()

adult    547
child    341
Name: child/adult, dtype: int64

In [1]:
# 승객에 대한 나이와 성별에 의한 카테고리를 cat 으로 추가하기
# 조건1. 20살이 넘으면 성별을 그대로 사용
# 조건2. 20살 미만이면 성별에 관계없이 'child' 라고 정의
titanic_reset_index_df['cat']

func1 = lambda df : df.sex if df.age >= 20 else 'child'
func2 = lambda df : "I don't know" if pd.isnull(df.age) else func1(df) 

titanic['cat'] = titanic.apply(func2, axis=1)
titanic['cat'].value_counts()

NameError: name 'titanic_reset_index_df' is not defined