# Part 6. 데이터프레임의 다양한 응용

### 1. 함수 매핑

##### <예제 6-1> 시리즈의 원소에 apply() 적용

In [2]:
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session (presumably to keep
# the printed cell output clean). NOTE(review): this also hides pandas
# deprecation and SettingWithCopy warnings raised by later cells.
warnings.filterwarnings('ignore')

In [3]:
# Load seaborn's 'titanic' dataset and keep only the 'age' and 'fare' columns.
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age', 'fare']]
# Add a constant column 'ten' holding 10 in every row, then preview 3 rows.
df['ten'] = 10
print(df.head(3))

    age     fare  ten
0  22.0   7.2500   10
1  38.0  71.2833   10
2  26.0   7.9250   10


In [5]:
def add_10(n):
    """Return ``n + 10``.

    Works for anything that supports ``+ 10``; the notebook uses it with
    plain integers and with individual pandas values.
    """
    increased = n + 10
    return increased

def add_two_obj(a, b):
    """Return the result of ``a + b`` for any two operands that support ``+``."""
    total = a + b
    return total

# Sanity-check both helpers with plain integers; each call prints 20.
print(add_10(10))
print(add_two_obj(10, 10))

20
20


In [14]:
# Series.apply calls add_10 once per value of the 'age' column;
# only the first 3 results are printed.
print(df['age'].apply(add_10).head(3))

0    32.0
1    48.0
2    36.0
Name: age, dtype: float64


##### <예제 6-2> 데이터프레임 원소에 applymap() 적용

In [13]:
# Rebuild df from titanic with only 'age' and 'fare'
# (the 'ten' column added to the earlier df is not part of this one).
df = titanic.loc[:, ['age', 'fare']]
print(df.head(3))

    age     fare
0  22.0   7.2500
1  38.0  71.2833
2  26.0   7.9250


In [12]:
# Element-wise mapping: add_10 is called once per cell of df.
# DataFrame.applymap has been deprecated since pandas 2.1 in favour of
# DataFrame.map, so use .map when this pandas version provides it and fall
# back to .applymap on older releases. The printed result is the same.
elementwise = df.map if hasattr(df, 'map') else df.applymap
print(elementwise(add_10).head(3))

    age     fare
0  32.0  17.2500
1  48.0  81.2833
2  36.0  17.9250


##### <예제 6-3> 데이터프레임에 apply(axis=0) 적용

In [16]:
def missing_value(series):
    """Return ``series.isnull()``.

    For a pandas object this is a boolean mask of the same shape: True where
    a value is missing, False elsewhere.
    """
    null_mask = series.isnull()
    return null_mask

# With axis=0 each column is handed to missing_value as a Series, so the result
# is a boolean frame with the same columns as df (first 3 rows printed).
print(df.apply(missing_value, axis=0).head(3))

     age   fare
0  False  False
1  False  False
2  False  False


##### <예제 6-4> 데이터프레임에 apply(axis=0) 적용

In [24]:
def min_max(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs ``max()`` and ``min()`` methods; the notebook passes
    DataFrame columns.
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest

# No axis is given, so apply uses its default (axis=0): min_max runs once per
# column and the per-column results come back as a Series indexed by column name.
print(df.apply(min_max))

age      79.5800
fare    512.3292
dtype: float64


##### <예제 6-5> 데이터프레임에 apply(axis=1) 적용

In [30]:
def add_two_obj(a, b):
    """Add two objects with ``+`` and return the result.

    Same definition as the helper introduced in example 6-1, repeated here so
    this example's cell is self-contained.
    """
    combined = a + b
    return combined

# axis=1 passes each row to the lambda, which adds the constant 10 to that
# row's 'age' through add_two_obj; only the first 3 results are printed.
print(df.apply(lambda x: add_two_obj(x['age'], 10), axis=1).head(3))

0    32.0
1    48.0
2    36.0
dtype: float64


##### <예제 6-6> 데이터프레임에 pipe() 적용

In [31]:
def missing_value(x):
    """Return ``x.isnull()``: a boolean mask flagging the missing entries of ``x``."""
    null_mask = x.isnull()
    return null_mask

def missing_count(x):
    """Sum the mask from ``missing_value(x)``.

    For a DataFrame this gives the number of missing entries per column.
    """
    null_mask = missing_value(x)
    return null_mask.sum()

def total_number_missing(x):
    """Sum the result of ``missing_count(x)``.

    For a DataFrame this gives the total number of missing entries.
    """
    per_column = missing_count(x)
    return per_column.sum()

In [33]:
# DataFrame.pipe(func) calls func(df). The three helpers return, respectively,
# a boolean frame, a per-column count Series, and a single total.
result_df = df.pipe(missing_value)
result_series = df.pipe(missing_count)
result_value = df.pipe(total_number_missing)

In [34]:
# Show the three pipe() results: first 3 rows of the mask, the per-column
# counts, and the overall count.
print(result_df.head(3))
print(result_series)
print(result_value)

     age   fare
0  False  False
1  False  False
2  False  False
age     177
fare      0
dtype: int64
177


### 2. 열 재구성

##### <예제 6-7> 열 순서 바꾸기

In [1]:
import seaborn as sns

In [3]:
# Reload the titanic dataset and keep the columns from 'survived' through 'age'
# (a .loc label slice includes both endpoints).
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, 'survived':'age']

In [8]:
# Preview the first three rows of the selected columns.
print(df.head(3))

   survived  pclass     sex   age
0         0       3    male  22.0
1         1       1  female  38.0
2         1       3  female  26.0


In [6]:
# Collect the column labels into a plain Python list, in their current order.
columns = list(df.columns.values)
print(columns)

['survived', 'pclass', 'sex', 'age']


In [7]:
# sorted() orders the labels alphabetically; indexing df with that list
# returns the columns in the new order.
columns_sorted = sorted(columns)
df_sorted = df[columns_sorted]
print(df_sorted.head(3))

    age  pclass     sex  survived
0  22.0       3    male         0
1  38.0       1  female         1
2  26.0       3  female         1


In [13]:
# Reverse the original label order and reorder the columns accordingly.
columns_reversed = list(reversed(columns))
df_reversed = df[columns_reversed]
print(df_reversed.head(3))

    age     sex  pclass  survived
0  22.0    male       3         0
1  38.0  female       1         1
2  26.0  female       3         1


##### <예제 6-8> 열 분리하기

In [14]:
import pandas as pd

In [31]:
# Load the stock-price workbook (path is relative to the working directory)
# and keep only the date and volume columns.
stock = pd.read_excel('part6/주가데이터.xlsx')
# .copy() makes df an independent frame. Later cells assign columns to df,
# which on a bare column selection of `stock` raises pandas'
# SettingWithCopyWarning because the selection may be treated as a view.
df = stock[['연월일', '거래량']].copy()
print(df.head(3))
print(df.dtypes)

         연월일     거래량
0 2018-07-02  137977
1 2018-06-29  170253
2 2018-06-28  155769
연월일    datetime64[ns]
거래량             int64
dtype: object


In [33]:
# Cast the datetime64 date column to strings so that string methods (.str)
# can be applied to it.
df['연월일'] = df['연월일'].astype('str')

In [34]:
# Split each date string on '-' into a list of [year, month, day] strings.
dates = df['연월일'].str.split('-')
print(dates.head(3))

0    [2018, 07, 02]
1    [2018, 06, 29]
2    [2018, 06, 28]
Name: 연월일, dtype: object


In [35]:
# Pull the list elements out by position into separate year / month / day
# columns; the values stay strings (e.g. '07'), not integers.
df['연'] = dates.str.get(0)
df['월'] = dates.str.get(1)
df['일'] = dates.str.get(2)
print(df.head(3))

          연월일     거래량     연   월   일
0  2018-07-02  137977  2018  07  02
1  2018-06-29  170253  2018  06  29
2  2018-06-28  155769  2018  06  28


### 3. 필터링

##### <예제 6-9> 불린 인덱싱

In [41]:
# Boolean indexing: one mask per accepted 'sibsp' value (3, 4, 5), OR-ed
# together to select the matching rows.
mask1 = titanic['sibsp'].eq(3)
mask2 = titanic['sibsp'].eq(4)
mask3 = titanic['sibsp'].eq(5)
df_boolean = titanic.loc[
    mask1 | mask2 | mask3
]
print(df_boolean.head(3))

    survived  pclass     sex  age  sibsp  parch    fare embarked  class  \
7          0       3    male  2.0      3      1  21.075        S  Third   
16         0       3    male  2.0      4      1  29.125        Q  Third   
24         0       3  female  8.0      3      1  21.075        S  Third   

      who  adult_male deck  embark_town alive  alone  
7   child       False  NaN  Southampton    no  False  
16  child       False  NaN   Queenstown    no  False  
24  child       False  NaN  Southampton    no  False  


##### <예제 6-10> isin() 필터링

In [42]:
# isin() builds the boolean mask in one call: True where 'sibsp' is 3, 4 or 5.
isin_filter = titanic['sibsp'].isin([3, 4, 5])
df_isin = titanic[isin_filter]
print(df_isin.head(3))

    survived  pclass     sex  age  sibsp  parch    fare embarked  class  \
7          0       3    male  2.0      3      1  21.075        S  Third   
16         0       3    male  2.0      4      1  29.125        Q  Third   
24         0       3  female  8.0      3      1  21.075        S  Third   

      who  adult_male deck  embark_town alive  alone  
7   child       False  NaN  Southampton    no  False  
16  child       False  NaN   Queenstown    no  False  
24  child       False  NaN  Southampton    no  False  


### 4. 데이터프레임 합치기

##### <예제 6-11> 데이터프레임 연결(concatenate)

In [44]:
# Two small frames that overlap on index labels 2 and 3; df2 has an extra
# column 'd'. They are written row by row so the literals mirror the printed table.
df1 = pd.DataFrame(
    [['a0', 'b0', 'c0'],
     ['a1', 'b1', 'c1'],
     ['a2', 'b2', 'c2'],
     ['a3', 'b3', 'c3']],
    columns=['a', 'b', 'c'],
    index=[0, 1, 2, 3],
)
df2 = pd.DataFrame(
    [['a2', 'b2', 'c2', 'd2'],
     ['a3', 'b3', 'c3', 'd3'],
     ['a4', 'b4', 'c4', 'd4'],
     ['a5', 'b5', 'c5', 'd5']],
    columns=['a', 'b', 'c', 'd'],
    index=[2, 3, 4, 5],
)
print(df1, df2, sep='\n\n')

    a   b   c
0  a0  b0  c0
1  a1  b1  c1
2  a2  b2  c2
3  a3  b3  c3

    a   b   c   d
2  a2  b2  c2  d2
3  a3  b3  c3  d3
4  a4  b4  c4  d4
5  a5  b5  c5  d5


In [45]:
# Default concat stacks the frames row-wise and keeps the union of columns:
# df1 has no 'd', so its rows get NaN there. The original index labels are
# kept, so 2 and 3 appear twice.
result1 = pd.concat([df1, df2])
print(result1)

    a   b   c    d
0  a0  b0  c0  NaN
1  a1  b1  c1  NaN
2  a2  b2  c2  NaN
3  a3  b3  c3  NaN
2  a2  b2  c2   d2
3  a3  b3  c3   d3
4  a4  b4  c4   d4
5  a5  b5  c5   d5


In [46]:
# axis=1 places the frames side by side, aligned on index labels; labels
# missing from one frame are filled with NaN on that side.
result2 = pd.concat([df1, df2], axis=1)
print(result2)

     a    b    c    a    b    c    d
0   a0   b0   c0  NaN  NaN  NaN  NaN
1   a1   b1   c1  NaN  NaN  NaN  NaN
2   a2   b2   c2   a2   b2   c2   d2
3   a3   b3   c3   a3   b3   c3   d3
4  NaN  NaN  NaN   a4   b4   c4   d4
5  NaN  NaN  NaN   a5   b5   c5   d5


In [47]:
# join='inner' keeps only the index labels present in both frames (2 and 3).
result3 = pd.concat([df1, df2], axis=1, join='inner')
print(result3)

    a   b   c   a   b   c   d
2  a2  b2  c2  a2  b2  c2  d2
3  a3  b3  c3  a3  b3  c3  d3


In [57]:
# A named Series is appended as a new column 'e'. sr has no entry for index
# label 2, so that cell is left empty in the result.
sr = pd.Series(['e0', 'e1', 'e2'], name='e', index=[3, 4, 5])
pd.concat([df2, sr], axis=1, sort=True)

Unnamed: 0,a,b,c,d,e
2,a2,b2,c2,d2,
3,a3,b3,c3,d3,e0
4,a4,b4,c4,d4,e1
5,a5,b5,c5,d5,e2


In [58]:
# Same concatenation with sort=False; for these inputs the displayed result
# matches the sort=True call.
pd.concat([df2, sr], axis=1, sort=False)

Unnamed: 0,a,b,c,d,e
2,a2,b2,c2,d2,
3,a3,b3,c3,d3,e0
4,a4,b4,c4,d4,e1
5,a5,b5,c5,d5,e2


##### <예제 6-12> 데이터프레임 합치기(merge)

In [59]:
# Display settings for the printed frames in this example, set through the
# attribute-style options API.
pd.options.display.max_columns = 10   # show at most 10 columns
pd.options.display.max_colwidth = 20  # cap the printed width of a column's contents
# Account for double-width East Asian characters when aligning printed columns.
pd.options.display.unicode.east_asian_width = True

In [60]:
# Load the price and valuation workbooks (paths are relative to the working
# directory). This rebinds df1 and df2 from the concat example.
df1 = pd.read_excel('part6/stock price.xlsx')
df2 = pd.read_excel('part6/stock valuation.xlsx')

In [61]:
# Print both frames, separated by blank lines.
print(df1)
print('\n')
print(df2)

       id    stock_name          value   price
0  128940      한미약품   59385.666667  421000
1  130960        CJ E&M   58540.666667   98900
2  138250    엔에스쇼핑   14558.666667   13200
3  139480        이마트  239230.833333  254500
4  142280  녹십자엠에스     468.833333   10200
5  145990        삼양사   82750.000000   82000
6  185750        종근당   40293.666667  100500
7  192400    쿠쿠홀딩스  179204.666667  177500
8  199800          툴젠   -2514.333333  115400
9  204210  모두투어리츠    3093.333333    3475


       id              name           eps     bps        per       pbr
0  130960            CJ E&M   6301.333333   54068  15.695091  1.829178
1  136480              하림    274.166667    3551  11.489362  0.887074
2  138040    메리츠금융지주   2122.333333   14894   6.313806  0.899691
3  139480            이마트  18268.166667  295780  13.931338  0.860437
4  145990            삼양사   5741.000000  108090  14.283226  0.758627
5  161390        한국타이어   5648.500000   51341   7.453306  0.820007
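6  181710   NHN엔터테인먼트   2110.166667   78434  30.755864  0.827447
7  185750            종근당   3990.333333   40684  25.185866  2.470259
8  204210      모두투어리츠     85.166667    5335  40.802348  0.651359
9  207940  삼성바이오로직스   4644.166667   60099  89.790059  6.938551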

In [62]:
# No keys or join type are given, so merge uses its defaults: an inner join on
# the column names the two frames share (only 'id' for these two frames).
merge_inner = pd.merge(df1, df2)
print(merge_inner)

       id    stock_name          value   price          name           eps  \
0  130960        CJ E&M   58540.666667   98900        CJ E&M   6301.333333   
1  139480        이마트  239230.833333  254500        이마트  18268.166667   
2  145990        삼양사   82750.000000   82000        삼양사   5741.000000   
3  185750        종근당   40293.666667  100500        종근당   3990.333333   
4  204210  모두투어리츠    3093.333333    3475  모두투어리츠     85.166667   

      bps        per       pbr  
0   54068  15.695091  1.829178  
1  295780  13.931338  0.860437  
2  108090  14.283226  0.758627  
3   40684  25.185866  2.470259  
4    5335  40.802348  0.651359  


In [74]:
# Join on two key pairs: 'id' with 'id', and df1's 'stock_name' with df2's 'name'.
# NOTE(review): the result is named merge_left, but how='inner' is requested,
# so only rows matching in both frames are kept. Confirm which was intended.
merge_left = df1.merge(
    df2,
    how='inner',
    left_on=['id', 'stock_name'],
    right_on=['id', 'name'],
)
print(merge_left)

       id    stock_name          value   price          name           eps  \
0  130960        CJ E&M   58540.666667   98900        CJ E&M   6301.333333   
1  139480        이마트  239230.833333  254500        이마트  18268.166667   
2  145990        삼양사   82750.000000   82000        삼양사   5741.000000   
3  185750        종근당   40293.666667  100500        종근당   3990.333333   
4  204210  모두투어리츠    3093.333333    3475  모두투어리츠     85.166667   

      bps        per       pbr  
0   54068  15.695091  1.829178  
1  295780  13.931338  0.860437  
2  108090  14.283226  0.758627  
3   40684  25.185866  2.470259  
4    5335  40.802348  0.651359  


In [68]:
# Bare expression: the notebook renders df2 as this cell's output.
df2

Unnamed: 0,id,name,eps,bps,per,pbr
0,130960,CJ E&M,6301.333333,54068,15.695091,1.829178
1,136480,하림,274.166667,3551,11.489362,0.887074
2,138040,메리츠금융지주,2122.333333,14894,6.313806,0.899691
3,139480,이마트,18268.166667,295780,13.931338,0.860437
4,145990,삼양사,5741.0,108090,14.283226,0.758627
5,161390,한국타이어,5648.5,51341,7.453306,0.820007
6,181710,NHN엔터테인먼트,2110.166667,78434,30.755864,0.827447
7,185750,종근당,3990.333333,40684,25.185866,2.470259
8,204210,모두투어리츠,85.166667,5335,40.802348,0.651359
9,207940,삼성바이오로직스,4644.166667,60099,89.790059,6.938551
