In [1]:
from pathlib import Path
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## 1. Lambda

### to Series

In [2]:
# Build a ten-element integer Series (values 0..9) and preview the first five rows.
s1 = pd.Series(np.arange(10))
s1.head()

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [3]:
# Series.map with a lambda: the function is called once per element.
(
    s1
    .map(lambda value: value ** 2)
    .head()
)

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [4]:
# Display the full Series again; the output still shows the original values 0..9.
s1

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [5]:
# Passing a dict to map looks each value up as a key;
# values with no matching key come back as NaN.
z = {
    1: "A",
    2: "B",
    3: "C",
}
s1.map(z)

0    NaN
1      A
2      B
3      C
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: object

In [6]:
# Fetch the wages dataset into ./data/, which is where the following cell reads it.
# Done in pure Python (instead of `!wget`) so it also works where wget is not
# installed, e.g. on Windows. The download is skipped when the file already exists,
# so a manually downloaded copy placed at ./data/wages.csv is used as-is.
WAGES_URL = "https://raw.githubusercontent.com/rstudio/Intro/master/data/wages.csv"
WAGES_CSV = Path("./data/wages.csv")

WAGES_CSV.parent.mkdir(parents=True, exist_ok=True)
if not WAGES_CSV.exists():
    urlretrieve(WAGES_URL, str(WAGES_CSV))


In [7]:
# Read the wages CSV from the local data folder and show the first five rows.
df = pd.read_csv(filepath_or_buffer="./data/wages.csv")
df.head(5)

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [8]:
# Distinct values of the `sex` column, in order of first appearance.
df["sex"].unique()

array(['male', 'female'], dtype=object)

In [9]:
# Same distinct values, converted from a NumPy array to a plain Python list.
df["sex"].unique().tolist()

['male', 'female']

In [10]:
# Add an integer-coded copy of `sex`: Series.map looks every value up in the dict.
df["sex_code"] = df["sex"].map({"male": 0, "female": 1})
df.head(5)

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,male,white,16,49,0
1,96396.988643,66.23,female,white,16,62,1
2,48710.666947,63.77,female,white,16,33,1
3,80478.096153,63.22,female,other,16,95,1
4,82089.345498,63.08,female,white,17,43,1


In [11]:
# Series.replace with a dict returns a new Series with the substitutions applied;
# only the first rows of that result are shown here.
df["sex"].replace(to_replace={"male": 0, "female": 1}).head(5)

0    0
1    1
2    1
3    1
4    1
Name: sex, dtype: int64

In [12]:
# Recode the `sex` column itself (male -> 0, female -> 1).
# The result is assigned back to the column instead of calling
# `df.sex.replace(..., inplace=True)`: an in-place method on a column selected
# from the frame is chained assignment, which pandas warns about and which does
# not update `df` when Copy-on-Write is enabled. The explicit dict also keeps
# the codes independent of the order in which `unique()` returns the values.
df["sex"] = df["sex"].replace({"male": 0, "female": 1})
df.head()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,0,white,16,49,0
1,96396.988643,66.23,1,white,16,62,1
2,48710.666947,63.77,1,white,16,33,1
3,80478.096153,63.22,1,other,16,95,1
4,82089.345498,63.08,1,white,17,43,1


In [13]:
# Remove the helper column again. `errors="ignore"` keeps the cell re-runnable:
# a plain `del df["sex_code"]` raises KeyError once the column is already gone.
df = df.drop(columns=["sex_code"], errors="ignore")
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,0,white,16,49
1,96396.988643,66.23,1,white,16,62
2,48710.666947,63.77,1,white,16,33
3,80478.096153,63.22,1,other,16,95
4,82089.345498,63.08,1,white,17,43


## 2. apply

* `apply`에 전달한 함수는 원소 하나가 아니라 시리즈(열) 하나 전체를 인자로 받는다.

In [18]:
# Select the three numeric columns used in the apply examples below.
df_info = df.loc[:, ["earn", "height", "age"]]
df_info.head(5)

Unnamed: 0,earn,height,age
0,79571.299011,73.89,49
1,96396.988643,66.23,62
2,48710.666947,63.77,33
3,80478.096153,63.22,95
4,82089.345498,63.08,43


In [19]:
# Range (max - min) per column: DataFrame.apply hands each column to f as a Series.
f = lambda col: col.max() - col.min()
df_info.apply(f, axis=0)

earn      318047.708444
height        19.870000
age           73.000000
dtype: float64

In [20]:
# Built-in column-wise sum, for comparison with the apply version.
df_info.sum(axis=0)

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

In [21]:
# Column-wise sum through apply. The string alias "sum" dispatches to Series.sum;
# passing the Python builtin `sum` relied on pandas silently substituting
# Series.sum for it, a substitution that is deprecated since pandas 2.1.
df_info.apply("sum")

earn      4.474344e+07
height    9.183125e+04
age       6.250800e+04
dtype: float64

In [22]:
# Sample standard deviation of each column (ddof=1 is the pandas default).
df_info.std(ddof=1)

earn      31257.070006
height        3.818108
age          15.789715
dtype: float64

In [23]:
def f(x):
    """Return the smallest and largest value of ``x`` as a Series labelled "min" and "max"."""
    labels = ["min", "max"]
    extremes = [x.min(), x.max()]
    return Series(extremes, index=labels)
# f returns a Series per column, so apply assembles them into a DataFrame
# with one row per label ("min", "max") and one column per input column.
df_info.apply(f, axis=0)

Unnamed: 0,earn,height,age
min,-98.580489,57.34,22
max,317949.127955,77.21,95


### applymap for DataFrame

* series 단위가 아닌 element 단위로 함수를 적용
* Series 하나에 `apply`를 적용했을 때와 같은 효과 (원소 단위로 함수가 적용됨)

In [24]:
# Element-wise function application: negate every cell of the frame.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap is
# deprecated there), so use `map` when the installed pandas has it and fall back
# to `applymap` on older versions. The check is made on the DataFrame class so a
# column that happened to be named "map" could not be mistaken for the method.
f = lambda x: -x
elementwise = df_info.map if hasattr(DataFrame, "map") else df_info.applymap
elementwise(f).head()

Unnamed: 0,earn,height,age
0,-79571.299011,-73.89,-49
1,-96396.988643,-66.23,-62
2,-48710.666947,-63.77,-33
3,-80478.096153,-63.22,-95
4,-82089.345498,-63.08,-43


In [25]:
# Series.apply on a single column calls the function once per element.
f = lambda value: -value
df_info["earn"].apply(f).head(5)

0   -79571.299011
1   -96396.988643
2   -48710.666947
3   -80478.096153
4   -82089.345498
Name: earn, dtype: float64