In [1]:
import pandas as pd

## 예제 8-4  사용자 정의 함수를 apply 메소드로  처리하기

In [2]:
import numpy as np

In [3]:
# Build a 6-row demo frame: columns 'a' and 'c' hold standard-normal noise,
# column 'b' alternates between two names. No seed is set, so the numeric
# values differ from run to run.
df = pd.DataFrame(dict(
    a=np.random.randn(6),
    b=['철수', '영희'] * 3,
    c=np.random.randn(6),
))
df

Unnamed: 0,a,b,c
0,1.154171,철수,0.033547
1,-1.565696,영희,-0.606764
2,-1.928421,철수,-0.81019
3,0.165173,영희,0.047483
4,-1.338291,철수,1.569175
5,1.558816,영희,-0.565091


In [4]:
def my_test(a, c):
    """Return the remainder of ``a`` divided by ``c`` (the ``%`` operator)."""
    remainder = a % c
    return remainder

apply에 axis=1을 지정하면 lambda 함수가 행 단위로 호출되므로, 각 행의 값을 사용자 정의 함수에 전달하도록 작성한다.

In [5]:
# Row-wise apply: with axis=1 each call receives a single row, whose 'a' and
# 'c' entries are handed to my_test. The lambda parameter is named `row` so
# it does not shadow the outer frame `df`.
df['Value'] = df.apply(lambda row: my_test(row['a'], row['c']), axis=1)
df

Unnamed: 0,a,b,c,Value
0,1.154171,철수,0.033547,0.013587
1,-1.565696,영희,-0.606764,-0.352167
2,-1.928421,철수,-0.81019,-0.30804
3,0.165173,영희,0.047483,0.022723
4,-1.338291,철수,1.569175,0.230884
5,1.558816,영희,-0.565091,-0.136456


In [6]:
df.a%df.c

0    0.013587
1   -0.352167
2   -0.308040
3    0.022723
4    0.230884
5   -0.136456
dtype: float64

일반 함수를 apply(axis=1)에 넘겨도 lambda와 마찬가지로 행(Series)이 하나씩 전달된다. 매개변수 이름은 df이지만 실제로는 한 행을 받아, 그 행의 a와 c 값으로 계산한 결과를 반환하는 형태로 작성한다.

In [7]:
def my_test2(df):
    """Return ``df['a'] % df['c']``.

    ``df`` can be anything indexable by the keys 'a' and 'c'.
    """
    dividend, divisor = df['a'], df['c']
    return dividend % divisor

In [8]:
df['value2'] = df.apply(my_test2,axis = 1)
df

Unnamed: 0,a,b,c,Value,value2
0,1.154171,철수,0.033547,0.013587,0.013587
1,-1.565696,영희,-0.606764,-0.352167,-0.352167
2,-1.928421,철수,-0.81019,-0.30804,-0.30804
3,0.165173,영희,0.047483,0.022723,0.022723
4,-1.338291,철수,1.569175,0.230884,0.230884
5,1.558816,영희,-0.565091,-0.136456,-0.136456


%timeit을 사용하면 서로 다른 방식의 성능을 비교할 수 있다.

In [9]:
%timeit df['value3'] = df.apply(lambda df:my_test(df['a'],df['c']),axis = 1)

607 µs ± 3.12 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
%timeit df['value4'] = df.apply(my_test2,axis = 1)

610 µs ± 1.84 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
sample = pd.DataFrame({'임의의값':[10,100,40]})
sample

Unnamed: 0,임의의값
0,10
1,100
2,40


기존 데이터 3개를 10만 번 반복해 이어 붙여 30만 개의 데이터를 생성한다.  
이어 붙인 뒤에는 인덱스가 갱신되어야 하므로, reset_index 메소드에 drop=True를 지정하여 기존 인덱스를 버리고 새로운 인덱스를 부여한다.

In [12]:
sample = pd.concat([sample]*100000).reset_index(drop=True)

In [13]:
sample.shape

(300000, 1)

In [14]:
sample.head(10)

Unnamed: 0,임의의값
0,10
1,100
2,40
3,10
4,100
5,40
6,10
7,100
8,40
9,10


In [15]:
%timeit sample['임의의값'].apply(lambda x:np.nan if x < 90 else x)

68.7 ms ± 440 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


apply는 새로운 결과를 반환할 뿐 원본 자체의 값은 변경하지 않으므로, 이후에 복제본을 만들어 결과를 적용해 본다.

In [16]:
sample.head()

Unnamed: 0,임의의값
0,10
1,100
2,40
3,10
4,100


In [17]:
%timeit sample['임의의값'].mask(sample['임의의값']<90,np.nan)

2.99 ms ± 59.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
sample1 = sample.copy()

In [19]:
sample1 = sample1['임의의값'].apply(lambda x:np.nan if x < 90 else x)

In [20]:
sample1.head()

0      NaN
1    100.0
2      NaN
3      NaN
4    100.0
Name: 임의의값, dtype: float64

In [21]:
sample2 = sample.copy()

In [22]:
# Replace values below 90 with NaN. The condition is built from the same
# frame that is being masked (sample2) instead of reaching back to `sample`.
sample2 = sample2['임의의값'].mask(sample2['임의의값'] < 90, np.nan)

In [23]:
sample2.head()

0      NaN
1    100.0
2      NaN
3      NaN
4    100.0
Name: 임의의값, dtype: float64

In [24]:
(sample1 == sample2).shape

(300000,)

In [25]:
sample1.isnull().sum(), sample1.notnull().sum()

(200000, 100000)

In [26]:
# NaN never compares equal to NaN, so `==` yields False wherever both Series
# hold NaN. The sum therefore counts only the positions where both hold the
# same non-NaN value. For a NaN-aware check of the whole Series, use
# sample1.equals(sample2).
(sample1 == sample2).sum()

100000

## 예제 8-4 사용자 정의 함수를 map, applymap 메소드로 처리하기

In [27]:
import numpy as np

In [28]:
ser = pd.Series(np.random.randn(6))
ser

0    0.579362
1   -0.428834
2   -1.507811
3    1.369020
4   -1.053264
5   -0.119618
dtype: float64

In [29]:
df = pd.DataFrame(ser,columns=['관측값_A'])
df

Unnamed: 0,관측값_A
0,0.579362
1,-0.428834
2,-1.507811
3,1.36902
4,-1.053264
5,-0.119618


abs => 절대값

In [30]:
def map_test(a):
    """Print a marker line, then return the absolute value of ``a``."""
    print('원소별 처리')  # emitted once per call
    magnitude = np.abs(a)
    return magnitude

In [31]:
ser.map(map_test)

원소별 처리
원소별 처리
원소별 처리
원소별 처리
원소별 처리
원소별 처리


0    0.579362
1    0.428834
2    1.507811
3    1.369020
4    1.053264
5    0.119618
dtype: float64

In [33]:
# Series.map already returns a Series, so the result is used directly
# instead of being re-wrapped in pd.Series.
s = ser.map(map_test)
s

원소별 처리
원소별 처리
원소별 처리
원소별 처리
원소별 처리
원소별 처리


0    0.579362
1    0.428834
2    1.507811
3    1.369020
4    1.053264
5    0.119618
dtype: float64

In [34]:
df['관측값_A_절대값']=s
df

Unnamed: 0,관측값_A,관측값_A_절대값
0,0.579362,0.579362
1,-0.428834,0.428834
2,-1.507811,1.507811
3,1.36902,1.36902
4,-1.053264,1.053264
5,-0.119618,0.119618


## 예제 8-5 pipe 메소드 처리하기

In [35]:
import numpy as np

In [36]:
df = pd.DataFrame({"name": ['김상갑', '임종문', '조현웅'],
                   "program language": [np.nan, 'Python', 'Scala'],
                   "born": [pd.NaT, pd.Timestamp("1966-04-25"),
                             pd.NaT]})
df

Unnamed: 0,name,program language,born
0,김상갑,,NaT
1,임종문,Python,1966-04-25
2,조현웅,Scala,NaT


In [37]:
def name_length(df):
    """Add a 'length' column holding the character count of each name.

    The column is written onto the frame that was passed in (no copy is
    made), and that same frame is returned so the call can be chained.
    """
    char_counts = df.name.str.len()
    df['length'] = char_counts
    return df

판다스에서는 pipe 메소드로도 사용자 정의 함수를 데이터프레임에 적용할 수 있다.

In [38]:
df.pipe(name_length)

Unnamed: 0,name,program language,born,length
0,김상갑,,NaT,3
1,임종문,Python,1966-04-25,3
2,조현웅,Scala,NaT,3


In [39]:
def born_fillna(df):
    """Fill missing 'born' entries with the timestamp 1967-04-25.

    The 'born' column of the passed-in frame is overwritten, and that same
    frame is returned.
    """
    default_birth = pd.Timestamp('1967-04-25')
    filled = df['born'].fillna(default_birth)
    df['born'] = filled
    return df

In [40]:
df.pipe(born_fillna)

Unnamed: 0,name,program language,born,length
0,김상갑,,1967-04-25,3
1,임종문,Python,1966-04-25,3
2,조현웅,Scala,1967-04-25,3


In [41]:
def pl_fillna(df):
    """Fill missing 'program language' entries with the string "Java".

    The column of the passed-in frame is overwritten, and that same frame
    is returned.
    """
    column = 'program language'
    df[column] = df[column].fillna("Java")
    return df

In [42]:
df.pipe(pl_fillna)

Unnamed: 0,name,program language,born,length
0,김상갑,Java,1967-04-25,3
1,임종문,Python,1966-04-25,3
2,조현웅,Scala,1967-04-25,3


In [43]:
df1 = pd.DataFrame({"name": ['김상갑', '임종문', '조현웅'],
                   "program language": [np.nan, 'Python', 'Scala'],
                   "born": [pd.NaT, pd.Timestamp("1966-04-25"),
                             pd.NaT]})
df1

Unnamed: 0,name,program language,born
0,김상갑,,NaT
1,임종문,Python,1966-04-25
2,조현웅,Scala,NaT


In [44]:
df1.pipe(name_length).pipe(born_fillna).pipe(pl_fillna)

Unnamed: 0,name,program language,born,length
0,김상갑,Java,1967-04-25,3
1,임종문,Python,1966-04-25,3
2,조현웅,Scala,1967-04-25,3


In [45]:
# Three-person demo frame, built in one step from a column -> values mapping.
df2 = pd.DataFrame({
    'name': ['은옥찬', '은석찬', '은옥주'],
    'gender': ['Male', 'Male', 'Female'],
    'age': [31, 32, 19],
})
df2

Unnamed: 0,name,gender,age
0,은옥찬,Male,31
1,은석찬,Male,32
2,은옥주,Female,19


In [46]:
def mean_age_by_group(dataframe, col):
    """Return the per-group mean of the numeric columns of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to aggregate.
    col : label or list of labels
        Column(s) to group by; they become the index of the result.

    Returns
    -------
    pandas.DataFrame
        One row per group and one column per numeric column.
    """
    # numeric_only=True skips string columns such as 'name'. Without it,
    # pandas 2.0+ raises TypeError when asked to average non-numeric data.
    return dataframe.groupby(col).mean(numeric_only=True)

In [47]:
mean_age_by_group(df2,'gender')

Unnamed: 0_level_0,age
gender,Unnamed: 1_level_1
Female,19.0
Male,31.5


In [48]:
def uppercase_column_name(dataframe):
    """Return a copy of ``dataframe`` whose column labels are upper-cased.

    The input frame is left untouched, so code that still refers to the
    original labels keeps working when it is run again.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the same data and upper-cased column labels.
    """
    renamed = dataframe.copy()
    # Relabel the copy only; assigning to dataframe.columns would change the
    # caller's object as a side effect.
    renamed.columns = renamed.columns.str.upper()
    return renamed

In [49]:
uppercase_column_name(df2)

Unnamed: 0,NAME,GENDER,AGE
0,은옥찬,Male,31
1,은석찬,Male,32
2,은옥주,Female,19


In [50]:
# Same layout as the earlier three-person frame, with different names,
# built in one step from a column -> values mapping.
df3 = pd.DataFrame({
    'name': ['구옥찬', '구석찬', '구옥주'],
    'gender': ['Male', 'Male', 'Female'],
    'age': [31, 32, 19],
})
df3

Unnamed: 0,name,gender,age
0,구옥찬,Male,31
1,구석찬,Male,32
2,구옥주,Female,19


In [51]:
(df3.pipe(mean_age_by_group,col = 'gender').pipe(uppercase_column_name))

Unnamed: 0_level_0,AGE
gender,Unnamed: 1_level_1
Female,19.0
Male,31.5
