# pandas part 2 

## DataFrame 불러오기

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/data-header.csv')
df.index = ['a','b','c','d']
df

Unnamed: 0,name,age,money,sex
a,둘리,100,100.0,M
b,길동,40,500000.0,M
c,또치,15,50.0,F
d,희동,3,,M


## 행 삭제 

In [3]:
df.drop(['a'])

Unnamed: 0,name,age,money,sex
b,길동,40,500000.0,M
c,또치,15,50.0,F
d,희동,3,,M


In [4]:
df

Unnamed: 0,name,age,money,sex
a,둘리,100,100.0,M
b,길동,40,500000.0,M
c,또치,15,50.0,F
d,희동,3,,M


## `df.index`를 이용한 삭제 

In [5]:
df.columns.values

array(['name', 'age', 'money', 'sex'], dtype=object)

In [6]:
df.index.values

array(['a', 'b', 'c', 'd'], dtype=object)

In [7]:
df.index[0]

'a'

In [8]:
df.drop([df.index[0]])

Unnamed: 0,name,age,money,sex
b,길동,40,500000.0,M
c,또치,15,50.0,F
d,희동,3,,M


In [9]:
df.columns[2]

'money'

In [10]:
df.columns[1:3]

Index(['age', 'money'], dtype='object')

In [11]:
df[df.columns[1:3]]

Unnamed: 0,age,money
a,100,100.0
b,40,500000.0
c,15,50.0
d,3,


In [12]:
df[['name','age','sex']]

Unnamed: 0,name,age,sex
a,둘리,100,M
b,길동,40,M
c,또치,15,F
d,희동,3,M


In [13]:
# df.drop(['money']) error

In [14]:
df.drop(['money'], axis=1)

Unnamed: 0,name,age,sex
a,둘리,100,M
b,길동,40,M
c,또치,15,F
d,희동,3,M


In [15]:
df.drop([df.columns[2]], axis=1)

Unnamed: 0,name,age,sex
a,둘리,100,M
b,길동,40,M
c,또치,15,F
d,희동,3,M


## 컬럼이 많을 경우 특정 컬럼을 제거하고 싶다면 

In [16]:
features = set(df.columns.values) - set(['age', 'money'])

In [17]:
# features is a set
features

{'name', 'sex'}

In [18]:
features = list(features)

In [19]:
df[features]

Unnamed: 0,sex,name
a,M,둘리
b,M,길동
c,F,또치
d,M,희동


## 조건에 의한 삭제 
- 필터링과 같다 

In [20]:
df

Unnamed: 0,name,age,money,sex
a,둘리,100,100.0,M
b,길동,40,500000.0,M
c,또치,15,50.0,F
d,희동,3,,M


In [21]:
df['money'] > 1500

a    False
b     True
c    False
d    False
Name: money, dtype: bool

In [22]:
df[df['money'] > 1500]

Unnamed: 0,name,age,money,sex
b,길동,40,500000.0,M


## 새로운 컬럼 생성 

In [23]:
df['salary'] = 100
df

Unnamed: 0,name,age,money,sex,salary
a,둘리,100,100.0,M,100
b,길동,40,500000.0,M,100
c,또치,15,50.0,F,100
d,희동,3,,M,100


## 조건부 업데이트 

In [24]:
import numpy as np
df

Unnamed: 0,name,age,money,sex,salary
a,둘리,100,100.0,M,100
b,길동,40,500000.0,M,100
c,또치,15,50.0,F,100
d,희동,3,,M,100


In [25]:
df['Young'] = np.where(df['age'] < 20, '젊은이', '노인') 
df

Unnamed: 0,name,age,money,sex,salary,Young
a,둘리,100,100.0,M,100,노인
b,길동,40,500000.0,M,100,노인
c,또치,15,50.0,F,100,젊은이
d,희동,3,,M,100,젊은이


## 리스트를 이용한 업데이트 

In [26]:
df['fname'] = ['공', '고', '김','김']
df

Unnamed: 0,name,age,money,sex,salary,Young,fname
a,둘리,100,100.0,M,100,노인,공
b,길동,40,500000.0,M,100,노인,고
c,또치,15,50.0,F,100,젊은이,김
d,희동,3,,M,100,젊은이,김


In [27]:
df['fullname'] = df['fname'] + ' ' + df['name']
df

Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname
a,둘리,100,100.0,M,100,노인,공,공 둘리
b,길동,40,500000.0,M,100,노인,고,고 길동
c,또치,15,50.0,F,100,젊은이,김,김 또치
d,희동,3,,M,100,젊은이,김,김 희동


## apply 사용법 

In [28]:
import random
random.random()

0.07922336981800882

In [29]:
import random 

def bonus(salary):
    bonus = 0.4 * random.random()
    return int(salary * bonus)

In [30]:
bonus(100)

17

In [31]:
df['bonus'] = df['salary'].apply(bonus)
df

Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname,bonus
a,둘리,100,100.0,M,100,노인,공,공 둘리,30
b,길동,40,500000.0,M,100,노인,고,고 길동,17
c,또치,15,50.0,F,100,젊은이,김,김 또치,31
d,희동,3,,M,100,젊은이,김,김 희동,34


## apply + lambda 

In [32]:
df['salary2019'] = df['salary'].apply(lambda x: x * 1.1)
df

Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname,bonus,salary2019
a,둘리,100,100.0,M,100,노인,공,공 둘리,30,110.0
b,길동,40,500000.0,M,100,노인,고,고 길동,17,110.0
c,또치,15,50.0,F,100,젊은이,김,김 또치,31,110.0
d,희동,3,,M,100,젊은이,김,김 희동,34,110.0


In [33]:
df['salary2019'] = df['salary2019'].apply(int)
df

Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname,bonus,salary2019
a,둘리,100,100.0,M,100,노인,공,공 둘리,30,110
b,길동,40,500000.0,M,100,노인,고,고 길동,17,110
c,또치,15,50.0,F,100,젊은이,김,김 또치,31,110
d,희동,3,,M,100,젊은이,김,김 희동,34,110


## row 추가하기

In [34]:
df2 = pd.DataFrame([{'name': 'snoopy', 'age': 66}], index = ['e'])
df2

Unnamed: 0,age,name
e,66,snoopy


In [35]:
#defrecated
df.append(df2)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Young,age,bonus,fname,fullname,money,name,salary,salary2019,sex
a,노인,100,30.0,공,공 둘리,100.0,둘리,100.0,110.0,M
b,노인,40,17.0,고,고 길동,500000.0,길동,100.0,110.0,M
c,젊은이,15,31.0,김,김 또치,50.0,또치,100.0,110.0,F
d,젊은이,3,34.0,김,김 희동,,희동,100.0,110.0,M
e,,66,,,,,snoopy,,,


In [36]:
df.append(df2, sort=False)

Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname,bonus,salary2019
a,둘리,100,100.0,M,100.0,노인,공,공 둘리,30.0,110.0
b,길동,40,500000.0,M,100.0,노인,고,고 길동,17.0,110.0
c,또치,15,50.0,F,100.0,젊은이,김,김 또치,31.0,110.0
d,희동,3,,M,100.0,젊은이,김,김 희동,34.0,110.0
e,snoopy,66,,,,,,,,


## group by 
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html
- http://pandas.pydata.org/pandas-docs/stable/groupby.html
- 데이터를 나누고(split), 조작하고(apply), 합치는(combining) 작업을 수행

In [37]:
df

Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname,bonus,salary2019
a,둘리,100,100.0,M,100,노인,공,공 둘리,30,110
b,길동,40,500000.0,M,100,노인,고,고 길동,17,110
c,또치,15,50.0,F,100,젊은이,김,김 또치,31,110
d,희동,3,,M,100,젊은이,김,김 희동,34,110


In [38]:
young_group = df.groupby('Young')

In [39]:
young_group

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x11727fc50>

In [40]:
print(young_group.groups)
print(type(young_group.groups))

{'노인': Index(['a', 'b'], dtype='object'), '젊은이': Index(['c', 'd'], dtype='object')}
<class 'dict'>


In [41]:
young_group.groups['노인']

Index(['a', 'b'], dtype='object')

In [42]:
df.loc[young_group.groups['노인']]

Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname,bonus,salary2019
a,둘리,100,100.0,M,100,노인,공,공 둘리,30,110
b,길동,40,500000.0,M,100,노인,고,고 길동,17,110


In [43]:
for key, group in young_group:
    print(key, len(group))
    display(group)
    # print(group)

노인 2


Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname,bonus,salary2019
a,둘리,100,100.0,M,100,노인,공,공 둘리,30,110
b,길동,40,500000.0,M,100,노인,고,고 길동,17,110


젊은이 2


Unnamed: 0,name,age,money,sex,salary,Young,fname,fullname,bonus,salary2019
c,또치,15,50.0,F,100,젊은이,김,김 또치,31,110
d,희동,3,,M,100,젊은이,김,김 희동,34,110


### count 

In [44]:
young_group.size()

Young
노인     2
젊은이    2
dtype: int64

In [45]:
pd.DataFrame({'나이별 통계': young_group.size()})

Unnamed: 0_level_0,나이별 통계
Young,Unnamed: 1_level_1
노인,2
젊은이,2


In [46]:
young_group.mean()

Unnamed: 0_level_0,age,money,salary,bonus,salary2019
Young,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
노인,70.0,250050.0,100.0,23.5,110.0
젊은이,9.0,50.0,100.0,32.5,110.0
