# DataFrame 정제
: NaN으로 빠지 값이나 정상적이지 않은값(결측치(missing value),이상치)의 정제

In [122]:
import numpy as np
import pandas as pd

In [123]:
# numpy의 난수를 이용하여 DataFrame 만들기
df =pd.DataFrame(np.random.rand(6,4))
df

Unnamed: 0,0,1,2,3
0,0.790265,0.27516,0.285163,0.17805
1,0.902344,0.938709,0.364931,0.522571
2,0.504315,0.378631,0.007609,0.922931
3,0.143369,0.128286,0.135531,0.445941
4,0.974649,0.179835,0.224839,0.33941
5,0.491764,0.98737,0.64665,0.843043


In [124]:
# Column과 index이름 넣기
df.columns = ["A", "B", "C", "D"]
df.index = pd.date_range("20220701", periods=6) # periods 날짜
df

Unnamed: 0,A,B,C,D
2022-07-01,0.790265,0.27516,0.285163,0.17805
2022-07-02,0.902344,0.938709,0.364931,0.522571
2022-07-03,0.504315,0.378631,0.007609,0.922931
2022-07-04,0.143369,0.128286,0.135531,0.445941
2022-07-05,0.974649,0.179835,0.224839,0.33941
2022-07-06,0.491764,0.98737,0.64665,0.843043


In [125]:
# index type 확인
df.index

DatetimeIndex(['2022-07-01', '2022-07-02', '2022-07-03', '2022-07-04',
               '2022-07-05', '2022-07-06'],
              dtype='datetime64[ns]', freq='D')

In [126]:
# 새로운 F열 생성과 값 입력
df['F'] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2022-07-01,0.790265,0.27516,0.285163,0.17805,1.0
2022-07-02,0.902344,0.938709,0.364931,0.522571,
2022-07-03,0.504315,0.378631,0.007609,0.922931,3.5
2022-07-04,0.143369,0.128286,0.135531,0.445941,6.1
2022-07-05,0.974649,0.179835,0.224839,0.33941,
2022-07-06,0.491764,0.98737,0.64665,0.843043,7.0


In [127]:
# NaN이 하나라도 있는 Data행 삭제
df.dropna()# NaN 값이 있으면 처리
df.dropna(how='any')# how='any' NaN이 하나라고 있으면 행이 지워진다.

Unnamed: 0,A,B,C,D,F
2022-07-01,0.790265,0.27516,0.285163,0.17805,1.0
2022-07-03,0.504315,0.378631,0.007609,0.922931,3.5
2022-07-04,0.143369,0.128286,0.135531,0.445941,6.1
2022-07-06,0.491764,0.98737,0.64665,0.843043,7.0


In [128]:
# NaN이 모든 열에 Data로 있는 경우의 행 삭제 ex) NaN,NaN,NaN,NaN,NaN,NaN
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2022-07-01,0.790265,0.27516,0.285163,0.17805,1.0
2022-07-02,0.902344,0.938709,0.364931,0.522571,
2022-07-03,0.504315,0.378631,0.007609,0.922931,3.5
2022-07-04,0.143369,0.128286,0.135531,0.445941,6.1
2022-07-05,0.974649,0.179835,0.224839,0.33941,
2022-07-06,0.491764,0.98737,0.64665,0.843043,7.0


In [129]:
# Nan값을 특정 값으로 변경
df.fillna(value=5.0)

Unnamed: 0,A,B,C,D,F
2022-07-01,0.790265,0.27516,0.285163,0.17805,1.0
2022-07-02,0.902344,0.938709,0.364931,0.522571,5.0
2022-07-03,0.504315,0.378631,0.007609,0.922931,3.5
2022-07-04,0.143369,0.128286,0.135531,0.445941,6.1
2022-07-05,0.974649,0.179835,0.224839,0.33941,5.0
2022-07-06,0.491764,0.98737,0.64665,0.843043,7.0


In [130]:
# Boolean Index를 통해 검색 및 변경
df.isnull()

Unnamed: 0,A,B,C,D,F
2022-07-01,False,False,False,False,False
2022-07-02,False,False,False,False,True
2022-07-03,False,False,False,False,False
2022-07-04,False,False,False,False,False
2022-07-05,False,False,False,False,True
2022-07-06,False,False,False,False,False


In [131]:
# F열에서 NaN을 포함하고 있는 행 찾기, 무언가를 찾을때는 loc,iloc를 바로 생각해야한다.
df.loc[df.F.isnull(),:]
df.loc[df.isnull()['F'],:]

Unnamed: 0,A,B,C,D,F
2022-07-02,0.902344,0.938709,0.364931,0.522571,
2022-07-05,0.974649,0.179835,0.224839,0.33941,


In [132]:
# index를 이용하여 행 제거
df.drop(pd.to_datetime("20220701"))

Unnamed: 0,A,B,C,D,F
2022-07-02,0.902344,0.938709,0.364931,0.522571,
2022-07-03,0.504315,0.378631,0.007609,0.922931,3.5
2022-07-04,0.143369,0.128286,0.135531,0.445941,6.1
2022-07-05,0.974649,0.179835,0.224839,0.33941,
2022-07-06,0.491764,0.98737,0.64665,0.843043,7.0


In [133]:
df.drop([pd.to_datetime("20220701"), pd.to_datetime("20220702")])
df.drop(["2022-07-01","2022-07-02"])


Unnamed: 0,A,B,C,D,F
2022-07-03,0.504315,0.378631,0.007609,0.922931,3.5
2022-07-04,0.143369,0.128286,0.135531,0.445941,6.1
2022-07-05,0.974649,0.179835,0.224839,0.33941,
2022-07-06,0.491764,0.98737,0.64665,0.843043,7.0


In [134]:
# # 열 삭제
del df["F"]
#df.drop('F', axis="columns")

In [135]:
#df.drop(["B","F"], axis = 1)

In [136]:
df

Unnamed: 0,A,B,C,D
2022-07-01,0.790265,0.27516,0.285163,0.17805
2022-07-02,0.902344,0.938709,0.364931,0.522571
2022-07-03,0.504315,0.378631,0.007609,0.922931
2022-07-04,0.143369,0.128286,0.135531,0.445941
2022-07-05,0.974649,0.179835,0.224839,0.33941
2022-07-06,0.491764,0.98737,0.64665,0.843043


In [137]:
data = [[1.40, np.nan],
        [7.10, -4.5, ],
        [np.nan, np.nan],
        [0.75, -1.3]]
        
df = pd.DataFrame(data,index=['a',"b",'c','d'],columns=["one",'two']
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [138]:
# 열방향 합계구하기
df.sum(axis='rows')

one    9.25
two   -5.80
dtype: float64

In [139]:
# 행방향 합계 구하기
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [140]:
df.iloc[1,:].sum()
#df.loc['b'].sum()
# loc,iloc는 행#

2.5999999999999996

In [141]:
#df['one'].sum() # loc,iloc는 열

In [142]:
# 평균
df.mean(axis='rows')

one    3.083333
two   -2.900000
dtype: float64

In [143]:
# 군산
df.var(axis='rows')

one    12.205833
two     5.120000
dtype: float64

In [144]:
df.mean(axis="columns")

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [145]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [146]:
# NaN값이 있으면 계산하지 못하게 하기
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [154]:
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.083333,-4.5
d,0.75,-1.3


In [155]:
# one의 NaN은 남은 값들의 평균으로 대체, two의 NaN은 가장 작은값으로 대체
one_mean = df.mean(axis=0)['one']
two_min = df.min(axis=0)['two']

In [157]:
df['one'] =df['one'].fillna(value=one_mean)
df['two'] =df['two'].fillna(value=two_min)
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.1,-4.5
c,3.083333,-4.5
d,0.75,-1.3


---
# DataFrame Merging(병합)

In [158]:
df1 = pd.DataFrame({"key": list("bbacaab"), "data1":range(7)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [160]:
df2 = pd.DataFrame({"key" : list('abd'),"data2" : range(3)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [161]:
# 병합하기
pd.merge(df1,df2, on="key")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


> key c와 d는 서로 match되지 않으므로 출력되지 않음

In [163]:
# 모두 보이기
pd.merge(df1, df2, on='key', how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [164]:
# df1 기준으로 합치기
pd.merge(df1, df2, on='key', how ='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [165]:
# df2 기준으로 합치기
pd.merge(df1, df2, on='key', how ='right')

Unnamed: 0,key,data1,data2
0,a,2.0,0
1,a,4.0,0
2,a,5.0,0
3,b,0.0,1
4,b,1.0,1
5,b,6.0,1
6,d,,2


### 두개의 dataframe에 중복된 값이 있을 경우

In [167]:
df1 =pd.DataFrame({'key':list('bbacab'), 'data1' :range(6)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [169]:
df2 =pd.DataFrame({'key':list('ababd'), 'data2' :range(5)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [170]:
pd.merge(df1, df2, on ='key', how = 'inner')

Unnamed: 0,key,data2_x,data2_y
0,a,0,0
1,a,0,2
2,a,2,0
3,a,2,2
4,b,1,1
5,b,1,3
6,b,3,1
7,b,3,3
8,d,4,4


In [171]:
# dataframe의 key값이 다를때
df1 =pd.DataFrame({'lkey':list('bbacab'), 'data1' :range(6)})
df2 =pd.DataFrame({'rkey':list('ababd'), 'data2' :range(5)})

In [173]:
pd.merge(df1, df2, left_on='lkey',right_on='rkey', how ='inner')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,0,b,3
2,b,1,b,1
3,b,1,b,3
4,b,5,b,1
5,b,5,b,3
6,a,2,a,0
7,a,2,a,2
8,a,4,a,0
9,a,4,a,2


The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


---
### 하나의 key값으로 병합하는 경우

In [175]:
df1 =pd.DataFrame({'key':list('bbacab'), 'data1' :range(6)}) # 키값으로 쓸거임
df2 =pd.DataFrame({'group_val':[3.5,7]}, index=['a','b']) # 인덱스값으로 쓸거임

In [176]:
pd.merge(df1, df2, left_on='key', right_index=True)

Unnamed: 0,key,data1,group_val
0,b,0,7.0
1,b,1,7.0
5,b,5,7.0
2,a,2,3.5
4,a,4,3.5


---
### Data Concatenating(연결)

In [178]:
s1 = pd.Series([0,1], index=['a','b'])
s1

a    0
b    1
dtype: int64

In [179]:
s2 =pd.Series([2,3,4], index=['c','d','e'])
s2

c    2
d    3
e    4
dtype: int64

In [180]:
s3=pd.Series([5,6], index=['f','g'])
s3

f    5
g    6
dtype: int64

In [181]:
# s1, s2, s3 합치기
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [182]:
# Series를 합치면서 DataFrame 만들기
pd.concat([s1,s2,s3], axis=1, sort =True) # sort =True : 정렬

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [183]:
# column에 이름 넣기
pd.concat([s1,s2,s3], axis=1, sort =True, keys=['s1','s2','s3'])

Unnamed: 0,s1,s2,s3
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


---
### DataFrame의 Concatenation

In [185]:
np.arange(6)

array([0, 1, 2, 3, 4, 5])

In [189]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a','b','c'], columns =['one','two'])
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [190]:
df2 = pd.DataFrame(np.arange(4).reshape(2, 2), index=['a','c'], columns =['three','four'])
df2

Unnamed: 0,three,four
a,0,1
c,2,3


In [191]:
# df1과 df2를 합치기
pd.concat([df1,df2], axis=1, sort =True )

Unnamed: 0,one,two,three,four
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [192]:
# ignore_index=True : 인덱스 없애버리기
pd.concat([df1,df2], axis=1, sort =True, ignore_index=True ) 

Unnamed: 0,0,1,2,3
a,0,1,0.0,1.0
b,2,3,,
c,4,5,2.0,3.0


In [193]:
#0번 기준으로 합치기
pd.concat([df1,df2], axis=0, sort =True,ignore_index=True )

Unnamed: 0,four,one,three,two
0,,0.0,,1.0
1,,2.0,,3.0
2,,4.0,,5.0
3,1.0,,0.0,
4,3.0,,2.0,


---
### 데이터프레임 중복값 제거

In [196]:
df = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[1,1,2,3,3,4,4]})
df

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [198]:
# 중복값 확인
df.duplicated()
#전체 행에 대한 F:중복아님 T:중복임 위에 데이터보고 확인하기 첫번째는 앞에 아무것도 없으니 중복이 아님

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [199]:
# 중복값 제거 : 전체 행에 대한
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [200]:
# 새로운 열 추가
df['v1'] = np.arange(7)
df

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [201]:
df.drop_duplicates()

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [202]:
# 컬럼하나의 중복값 제거
df['k1'].drop_duplicates()

0    one
3    two
Name: k1, dtype: object

In [203]:
# k1의 값들로 중복값 제거
df.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [204]:
# k1의 값들로 마지막 값으로 출력
df.drop_duplicates(['k1'], keep='last')

Unnamed: 0,k1,k2,v1
2,one,2,2
6,two,4,6


In [205]:
### Category 사용하기

In [206]:
df3 =pd.DataFrame({id:[1,2,3,4,5,6],
                    'raw_grade':['a','b','b','a','a','e']
})
df3

Unnamed: 0,<built-in function id>,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [207]:
# category 자료형으로 변환하기
df3['grade'] = df3['raw_grade'].astype('category')
df3

Unnamed: 0,<built-in function id>,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [209]:
df3['raw_grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: raw_grade, dtype: object

In [210]:
df3['grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [211]:
# category를 이용하여 자료 변형
df3['grade'].cat.categories = ['very good', 'good', 'very bad']
df3

Unnamed: 0,<built-in function id>,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [213]:
df3.sort_values(by='grade') # by='grade' :기준점

Unnamed: 0,<built-in function id>,raw_grade,grade
0,1,a,very good
3,4,a,very good
4,5,a,very good
1,2,b,good
2,3,b,good
5,6,e,very bad


In [214]:
df3['grade']

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): ['very good', 'good', 'very bad']

---
### 데이터 범위 정하기

In [216]:
ages=[20,22,25,27,21,23,37,31,61,45,41,32]
bins=[18,25,35,60,100] # 범주이다.18~25,25~35,35~60,60~100 4단계로 되어있다.

In [217]:
cats = pd.cut(ages, bins) # bins기준으로 ages를 쪼갠다.
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [218]:
# 범주 관련 code 보기
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [220]:
# 범주 관련 빈도수 보기
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [221]:
# category 이름 정하기
group_names = ['Youth', 'youngAdult', 'MiddleAged', 'Senior']
cat2 = pd.cut(ages, bins, labels=group_names)
cat2.value_counts()

Youth         5
youngAdult    3
MiddleAged    3
Senior        1
dtype: int64