### json 문서
- 웹 형식 문서 표현 방법 중 하나.  
- 가볍고 속도가 빠름. 
- 딕셔너리 구조

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json

In [14]:
# 이건 문자열 (json형식의 문자열)
obj="""
{"name": "Wes",
"places_lived":["United States", "Spain", "Germany"],
"pet":null,
"siblings":[{"name":"Kim", "age":25, "pets":["ba", "ka"]},
            {"name":"Lee", "age":22, "pets":["aa", "bb", "cc"]}]
}
"""
# json에서는 key, value 모두 " " 로 묶어야 한다. ' '는 에러
print(obj)


{"name": "Wes",
"places_lived":["United States", "Spain", "Germany"],
"pet":null,
"siblings":[{"name":"Kim", "age":25, "pets":["ba", "ka"]},
            {"name":"Lee", "age":22, "pets":["aa", "bb", "cc"]}]
}



In [15]:
#json 문자열을 파이썬 형태로 변환
res=json.loads(obj) 
res

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Kim', 'age': 25, 'pets': ['ba', 'ka']},
  {'name': 'Lee', 'age': 22, 'pets': ['aa', 'bb', 'cc']}]}

In [17]:
#파이썬 형태로 읽어진 객체를 진짜 json형식으로 변환
asjson=json.dumps(res) 
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Kim", "age": 25, "pets": ["ba", "ka"]}, {"name": "Lee", "age": 22, "pets": ["aa", "bb", "cc"]}]}'

In [18]:
res['siblings']

[{'name': 'Kim', 'age': 25, 'pets': ['ba', 'ka']},
 {'name': 'Lee', 'age': 22, 'pets': ['aa', 'bb', 'cc']}]

In [28]:
# 데이터프레임 안에 리스트 출력
pd.DataFrame(res['siblings'])

Unnamed: 0,name,age,pets
0,Kim,25,"[ba, ka]"
1,Lee,22,"[aa, bb, cc]"


In [30]:
#원하는 컬럼만 얻고 싶은경우
df=pd.DataFrame(res['siblings'], columns=['name', 'age']) 
df

Unnamed: 0,name,age
0,Kim,25
1,Lee,22


In [31]:
# DataFrame을 json 문서 형식으로 변환
df.to_json()

'{"name":{"0":"Kim","1":"Lee"},"age":{"0":25,"1":22}}'

In [32]:
#json문서 저장
df.to_json("myjson.json") 

In [33]:
#json 문서 불러오기
pd.read_json("myjson.json")

Unnamed: 0,name,age
0,Kim,25
1,Lee,22


In [35]:
#데이터 정제
stringData=pd.Series(['aaa','bbb',np.nan,'ccc'])
stringData

0    aaa
1    bbb
2    NaN
3    ccc
dtype: object

In [36]:
stringData.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [37]:
stringData[stringData.isnull()]

2    NaN
dtype: object

In [38]:
stringData[0]=None #None값은 NA와 같음

In [40]:
stringData.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### NA 처리 메서드 : 
- dropna: 누락된 데이터가 있는 축(행,열)을 제외
- fillna: 누락 데이터를 대신하는 값으로 채움 or ffill, bfill
- isnull: 누락 데이터를 추출
- notnull: isnull에 반대되는 데이터를 추출


In [45]:
from numpy import nan as NA
data=pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [46]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [48]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [60]:
data=pd.DataFrame([[1, 6, 3],
                 [1, NA, NA],
                 [NA, NA, NA],
                 [NA, 5, 2]])
data

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
2,,,
3,,5.0,2.0


In [61]:
data.dropna(axis=0)

Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [62]:
data.dropna(axis=1)

0
1
2
3


In [63]:
data.dropna() # default: axis=0

Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [64]:
data.dropna(how="all") #전채 행이 NaN 일경우 제거

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
3,,5.0,2.0


In [70]:
#중복제거
data=pd.DataFrame({'a':['one','two']*3+['two'],
                  'b':[1,1,2,3,3,4,4]})
data.info()
data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       7 non-null      object
 1   b       7 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 240.0+ bytes


Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [71]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [73]:
# 전체 컬럼에 대한 중복값을 제외
data.drop_duplicates() #duplicated 함수 결과가 False인 데이터프레임을 리턴

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [74]:
data['v1']=range(7)
data

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [79]:
#특정 컬럼에 대한 중복값을 제외
data.drop_duplicates(['a'])

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1


In [80]:
#특정 컬럼들에 대한 중복값을 제외
data.drop_duplicates(['a' ,'b'])

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [81]:
#특정 컬럼들에 대한 중복값을 제외
data.drop_duplicates(['a' ,'b'], keep='last')

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [None]:
#데이터치환 : map함수, replace함수(딕셔너리)

In [90]:
ages=[-3, 15,20,25,28,30,20,22,37,61,44,46,33, 111]
bins=[0,10,20,30,40,60,100]
# 연령(...여러)데이터 -> 연령대로 나눔 -> 연령대별로 카테고리화

In [91]:
res=pd.cut(ages, bins) #구간을 벗어나는 경우에는 NA가 나온다.
res #구간 : 초과~이하

[NaN, (10.0, 20.0], (10.0, 20.0], (20.0, 30.0], (20.0, 30.0], ..., (60.0, 100.0], (40.0, 60.0], (40.0, 60.0], (30.0, 40.0], NaN]
Length: 14
Categories (6, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] < (40, 60] < (60, 100]]

In [92]:
res.codes # -1은 nan을 의마한다

array([-1,  1,  1,  2,  2,  2,  1,  2,  3,  5,  4,  4,  3, -1], dtype=int8)

In [93]:
res.categories

IntervalIndex([(0, 10], (10, 20], (20, 30], (30, 40], (40, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [94]:
res.value_counts()
# pd.value_counts(res)

(0, 10]      0
(10, 20]     3
(20, 30]     4
(30, 40]     2
(40, 60]     2
(60, 100]    1
dtype: int64

In [101]:
ages=[15,20,25,28,30,20,22,37,61,44,46,33]
pd.cut(ages, [15,26,36,61,100], right=False) #15이상 16미만 
#구간: 이상~미만

[[15, 26), [15, 26), [15, 26), [26, 36), [26, 36), ..., [36, 61), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[15, 26) < [26, 36) < [36, 61) < [61, 100)]

In [102]:
gn=['youth','youngyouth','middleaged','senior']
pd.cut(ages, [15,26,36,61,100], labels=gn)

[NaN, youth, youth, youngyouth, youngyouth, ..., middleaged, middleaged, middleaged, middleaged, youngyouth]
Length: 12
Categories (4, object): [youth < youngyouth < middleaged < senior]

In [104]:
res=pd.qcut(ages,4)
res

[(14.999, 21.5], (14.999, 21.5], (21.5, 29.0], (21.5, 29.0], (29.0, 38.75], ..., (29.0, 38.75], (38.75, 61.0], (38.75, 61.0], (38.75, 61.0], (29.0, 38.75]]
Length: 12
Categories (4, interval[float64]): [(14.999, 21.5] < (21.5, 29.0] < (29.0, 38.75] < (38.75, 61.0]]

In [105]:
res.value_counts()

(14.999, 21.5]    3
(21.5, 29.0]      3
(29.0, 38.75]     3
(38.75, 61.0]     3
dtype: int64

그룹별 집계: groupby() 그룹단위로 집계 (요약)
- 전체데이터 -> 그룹별로 분할 -> 각 그룹별로 집계(요약)함수 적용 -> 각 그룹별 집계 결과들 -> 합침

http://archive.ics.uci.edu/ml/datasets/Abalone

In [None]:
Sex / nominal / -- / M, F, and I (infant) 
Length / continuous / mm / Longest shell measurement 
Diameter / continuous / mm / perpendicular to length 
Height / continuous / mm / with meat in shell 
Whole weight / continuous / grams / whole abalone 
Shucked weight / continuous / grams / weight of meat 
Viscera weight / continuous / grams / gut weight (after bleeding) 
Shell weight / continuous / grams / after being dried 
Rings / integer / -- / +1.5 gives the age in years 

In [121]:
abalone=pd.read_csv("abalone.txt", header=None, sep=",", 
            names=['Sex','Length','Diameter','Height','Whole weight','Shucked weight',
                   'Viscera weight','Shell weight','Rings'])
abalone

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [124]:
np.sum(abalone.isnull())

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64

In [127]:
# 전복 성별 그룹별 전체 무게 변수에 대해 집계
grouped=abalone['Whole weight'].groupby(abalone['Sex']) 
#abalone 데이터의 sex에 따른 그룹화

In [128]:
grouped.size()
# abalone의 'Sex' 컬럼 그룹화 -> 각 그룹별 Whole_weight 컬럼값의 size()호출 결과

Sex
F    1307
I    1342
M    1528
Name: Whole weight, dtype: int64

In [130]:
grouped.sum()
# abalone의 'Sex' 컬럼 그룹화 -> 각 그룹별 Whole_weight 컬럼값의 sum()호출 결과
grouped.mean()
# abalone의 'Sex' 컬럼 그룹화 -> 각 그룹별 Whole_weight 컬럼값의 mean()호출 결과

Sex
F    1.046532
I    0.431363
M    0.991459
Name: Whole weight, dtype: float64

In [136]:
abalone.groupby(abalone['Sex']).sum()
abalone.groupby('Sex').sum()

abalone.groupby(abalone['Sex']).mean()
abalone.groupby('Sex').mean()

Unnamed: 0_level_0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


In [138]:
abalone.Length
# 범주형(2가지) 변수 추가 : length 값이 length열의 중앙값보다 크면 length_long, 그렇지 않으면 length_short
# length    length_med
# 0.455     length_short
# 0.350     length_short
# 0.95      length_long

0       0.455
1       0.350
2       0.530
3       0.440
4       0.330
        ...  
4172    0.565
4173    0.590
4174    0.600
4175    0.625
4176    0.710
Name: Length, Length: 4177, dtype: float64

In [145]:
abalone.Length.median() #0.545

#내 답
# abalone['Length_label']=pd.cut(abalone.Length, [0,abalone.Length.median(),1], labels=['length_short', 'length_long'])

# 이동재
# abalone['length_med'] = (abalone.length>abalone.length.median()).map({True:'length_long',False:'length_short'})

In [146]:
abalone

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Length_label
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,length_short
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short
...,...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,length_long
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long


In [149]:
abalone["Length_med"] = np.where(abalone["Length"] > abalone["Length"].median(), "length_long", "length_short")

abalone[["Length", "Length_med"]]

Unnamed: 0,Length,Length_med
0,0.455,length_short
1,0.350,length_short
2,0.530,length_short
3,0.440,length_short
4,0.330,length_short
...,...,...
4172,0.565,length_long
4173,0.590,length_long
4174,0.600,length_long
4175,0.625,length_long


In [155]:
mean_weight=abalone['Whole weight'].groupby([abalone['Sex'], abalone['Length_med']]).mean()
mean_weight

Sex  Length_med  
F    length_long     1.261330
     length_short    0.589702
I    length_long     0.923215
     length_short    0.351234
M    length_long     1.255182
     length_short    0.538157
Name: Whole weight, dtype: float64

In [158]:
mean_weight.unstack()

Length_med,length_long,length_short
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.26133,0.589702
I,0.923215,0.351234
M,1.255182,0.538157


In [160]:
# 그룹별로 특정 작업을 반복
# abalone 성별로 그룹화 -> for loop -> 그룹별 데이터셋을 출력

In [161]:
abalone[['Sex','Length_med','Whole weight', 'Rings']]

Unnamed: 0,Sex,Length_med,Whole weight,Rings
0,M,length_short,0.5140,15
1,M,length_short,0.2255,7
2,F,length_short,0.6770,9
3,M,length_short,0.5160,10
4,I,length_short,0.2050,7
...,...,...,...,...
4172,F,length_long,0.8870,11
4173,M,length_long,0.9660,10
4174,M,length_long,1.1760,9
4175,F,length_long,1.0945,10


In [162]:
abalone[['Sex','Length_med','Whole weight', 'Rings']].groupby('Sex')
# 그룹화 객체를 for문으로 반복하면, 그룹 이름(M, F, I)과 그룹별 데이터를 리턴할 수 있음

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ffb0763f040>

In [164]:
# for 변수들 in abalone[['Sex','Length_med','Whole weight', 'Rings']].groupby('Sex'):
for sex, group_data in abalone[['Sex','Length_med','Whole weight', 'Rings']].groupby('Sex'):
    print(sex), print(group_data[:5])

F
   Sex    Length_med  Whole weight  Rings
2    F  length_short        0.6770      9
6    F  length_short        0.7775     20
7    F  length_short        0.7680     16
9    F   length_long        0.8945     19
10   F  length_short        0.6065     14
I
   Sex    Length_med  Whole weight  Rings
4    I  length_short        0.2050      7
5    I  length_short        0.3515      8
16   I  length_short        0.2905      7
21   I  length_short        0.2255     10
42   I  length_short        0.0700      5
M
   Sex    Length_med  Whole weight  Rings
0    M  length_short        0.5140     15
1    M  length_short        0.2255      7
3    M  length_short        0.5160     10
8    M  length_short        0.5095      9
11   M  length_short        0.4060     10


In [172]:
# 그룹 (Sex, Length_med 조합)별 데이터셋 출력
for (sex, Length_med), group_data in abalone[['Sex','Length_med','Whole weight', 'Rings']].groupby(['Sex', 'Length_med']):
    print(sex, Length_med), print(group_data[:5])

F length_long
   Sex   Length_med  Whole weight  Rings
9    F  length_long        0.8945     19
22   F  length_long        0.9395     12
23   F  length_long        0.7635      9
24   F  length_long        1.1615     10
25   F  length_long        0.9285     11
F length_short
   Sex    Length_med  Whole weight  Rings
2    F  length_short        0.6770      9
6    F  length_short        0.7775     20
7    F  length_short        0.7680     16
10   F  length_short        0.6065     14
13   F  length_short        0.6845     10
I length_long
    Sex   Length_med  Whole weight  Rings
509   I  length_long        0.8735     16
510   I  length_long        1.1095     10
549   I  length_long        0.8750     11
550   I  length_long        1.1625     17
551   I  length_long        0.9885     13
I length_short
   Sex    Length_med  Whole weight  Rings
4    I  length_short        0.2050      7
5    I  length_short        0.3515      8
16   I  length_short        0.2905      7
21   I  length_short    

In [180]:
#딕셔너리 형태
# {키: 값, 키:값, 키:값}
# {'F': F그룹 데이터셋, 'M': M그룹 데이터셋, 'I': I그룹 데이터셋}
list(abalone[:10][['Sex','Length_med','Whole weight', 'Rings']].groupby('Sex'))
# 성별로 그룹화 -> 성별 그룹을 key로 설정 -> 데이터셋은 value로 설정
aba_group=dict(list(abalone[:10][['Sex','Length_med','Whole weight', 'Rings']].groupby('Sex')))
aba_group

{'F':   Sex    Length_med  Whole weight  Rings
 2   F  length_short        0.6770      9
 6   F  length_short        0.7775     20
 7   F  length_short        0.7680     16
 9   F   length_long        0.8945     19,
 'I':   Sex    Length_med  Whole weight  Rings
 4   I  length_short        0.2050      7
 5   I  length_short        0.3515      8,
 'M':   Sex    Length_med  Whole weight  Rings
 0   M  length_short        0.5140     15
 1   M  length_short        0.2255      7
 3   M  length_short        0.5160     10
 8   M  length_short        0.5095      9}

In [181]:
aba_group
# 그룹이름을 가지고 데이터셋을 인덱싱
aba_group['M']

Unnamed: 0,Sex,Length_med,Whole weight,Rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9


In [187]:
# abalone의 상위 10개 데이터에 대해 'Sex'이 'M'인 자료 추출(불린참조)
abalone[:10][abalone['Sex']=='M'][['Sex','Length_med','Whole weight', 'Rings']]

  abalone[:10][abalone['Sex']=='M'][['Sex','Length_med','Whole weight', 'Rings']]


Unnamed: 0,Sex,Length_med,Whole weight,Rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9


In [189]:
# 특정 문자열을 매핑 규칙에 따른 변환 -> dict.get()
# ex) Lee, lee, LEE => lee
# ex) Choi, choi, Cho, CHO,... -> others

df=pd.DataFrame({'name':['kim','KIM','Kim','lee','LEE','Lee','cho','choi'],
             'value1':[1,2,3,4,5,6,7,8],
             'vlaue2':[100,200,300,100,200,100,300,500]})
df

Unnamed: 0,name,value1,vlaue2
0,kim,1,100
1,KIM,2,200
2,Kim,3,300
3,lee,4,100
4,LEE,5,200
5,Lee,6,100
6,cho,7,300
7,choi,8,500


In [202]:
# kim, lee, others 로 분류
nameMapping={
    "KIM":"kim",
    "Kim":"kim",
    "LEE":"lee",
    "Lee":"lee",
    "cho":"others",
    "choi":"others",
    "woo":"others"
}

In [205]:
# np.char.lower("KIM")
func=lambda x:nameMapping.get(x) #이경우에는 mapping규칙에 정의되지 않은 키가 전달되면 None이 리턴된다.
df.name.map(func)

0      None
1       kim
2       kim
3      None
4       lee
5       lee
6    others
7    others
Name: name, dtype: object

In [210]:
func=lambda x:nameMapping.get(x, "etc") # mapping 규칙에 정의되지 않은 값이와도 기존값 리턴
df.name.map(func)

0       etc
1       kim
2       kim
3       etc
4       lee
5       lee
6    others
7    others
Name: name, dtype: object

In [211]:
func=lambda x:nameMapping.get(x, x) # mapping 규칙에 정의되지 않은 값이와도 기존값 리턴
df.name.map(func)

0       kim
1       kim
2       kim
3       lee
4       lee
5       lee
6    others
7    others
Name: name, dtype: object

In [212]:
df['name2']=df.name.map(func)

In [214]:
df
# name2 컬럼값을 그룹화 -> 그룹별 합계
df.groupby('name2').sum()

Unnamed: 0_level_0,value1,vlaue2
name2,Unnamed: 1_level_1,Unnamed: 2_level_1
kim,6,600
lee,15,400
others,15,800


In [217]:
df.groupby(['name2','name']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,vlaue2
name2,name,Unnamed: 2_level_1,Unnamed: 3_level_1
kim,KIM,2,200
kim,Kim,3,300
kim,kim,1,100
lee,LEE,5,200
lee,Lee,6,100
lee,lee,4,100
others,cho,7,300
others,choi,8,500


In [218]:
df=pd.DataFrame({'id':[1,2,10,20,100,200],
                'name':['aa','aa2','aa3','aa4','aa5','aa6']})
df

Unnamed: 0,id,name
0,1,aa
1,2,aa2
2,10,aa3
3,20,aa4
4,100,aa5
5,200,aa6


In [235]:
# 퀴즈2
# df에 id2열을 추가
# id2 열은 id열 값을 5자리로 만들었을때, 앞에 빈자리를 0으로 채움

# df['id2'] = df.id.map(lambda x : '0'*(5-len(str(x)))+str(x)) #이동재님
# df["id2"] = df["id"].apply(lambda x : "%05d" % x) #이지윤님
df['id2'] = df['id'].astype(str).apply(lambda x: x.zfill(5)) #나님
#선생님
df['id'].apply(lambda x: "{:0>5d}".format(x)) #이렇게 하면 문자열로 바꾸는작업 생략 가능

0    00001
1    00002
2    00010
3    00020
4    00100
5    00200
Name: id, dtype: object

In [234]:
df

Unnamed: 0,id,name,id2
0,1,aa,1
1,2,aa2,2
2,10,aa3,10
3,20,aa4,20
4,100,aa5,100
5,200,aa6,200


### https://mkaz.blog/code/python-string-format-cookbook/

abalone #4177 건
- 7:3 비율로 train set/ test set 데이터를 분할 (random state = 20201005)
- train set -> 모델 -> test set -> 나이를 예측 (rings) => rmsle적용 => 점수 출력
- random forest regressor 이용