In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a 5x4 frame of standard-normal samples, then blank out a few cells so
# the missing-value examples have data to work on.
# A seeded generator keeps the values identical on every Restart & Run All.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((5, 4)),
                  columns=['c1', 'c2', 'c3', 'c4'])
# np.nan is the explicit missing-value marker for float columns.
df.loc[[0, 1], 'c1'] = np.nan
df.loc[2, 'c2'] = np.nan
df

Unnamed: 0,c1,c2,c3,c4
0,,0.066347,0.31622,-1.806054
1,,-0.048397,2.313371,-0.242331
2,-0.191246,,0.524496,0.517808
3,1.447968,0.929414,1.028958,1.7636
4,0.137752,-1.400955,-1.168055,-1.34204


In [3]:
# Drop every row that contains at least one missing value.
df.dropna(axis='index')

Unnamed: 0,c1,c2,c3,c4
3,1.447968,0.929414,1.028958,1.7636
4,0.137752,-1.400955,-1.168055,-1.34204


In [4]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,c3,c4
0,0.31622,-1.806054
1,2.313371,-0.242331
2,0.524496,0.517808
3,1.028958,1.7636
4,-1.168055,-1.34204


In [5]:
# Drop the missing values of column c1 only: the result is the c1 Series
# without its NaN entries, not the full rows of df.
# (df.dropna(subset=['c1']) would instead filter whole rows of the frame.)
print(df)
df['c1'].dropna()

         c1        c2        c3        c4
0       NaN  1.627808 -0.760223 -1.337639
1       NaN  1.184806 -2.255430 -1.724712
2  1.272340       NaN  0.485531 -1.257624
3  0.124119  0.695832 -0.435844 -0.622166
4  1.200672  1.034337 -0.038051  0.285884


2    1.272340
3    0.124119
4    1.200672
Name: c1, dtype: float64

In [6]:
# Restrict to c1-c3 first, then drop the rows that have NaN in any of them.
df.loc[:, ['c1', 'c2', 'c3']].dropna()

Unnamed: 0,c1,c2,c3
3,0.124119,0.695832,-0.435844
4,1.200672,1.034337,-0.038051


In [7]:
# Among c1-c3, drop the columns that contain NaN (the default axis=0 drops rows).
df[['c1', 'c2', 'c3']].dropna(axis='columns')

Unnamed: 0,c3
0,-0.760223
1,-2.25543
2,0.485531
3,-0.435844
4,-0.038051


In [None]:
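# For rows 2 and 4 of df, restricted to columns c1, c2 and c3, drop the rows that contain NA.
# Only row 4 should remain (0.244007, 0.092419, -1.67884 in the original run; the exact values change with the random data).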

In [10]:
# Select rows 2 and 4 of columns c1-c3, then drop the rows that contain NaN.
df.loc[[2, 4], ['c1', 'c2', 'c3']].dropna(axis='index')

Unnamed: 0,c1,c2,c3
4,1.200672,1.034337,-0.038051


In [12]:
# interpolate(): fill missing values by interpolation (by default linearly, in proportion between the neighbouring known values)
# e.g. 1 nan nan 7 -> 1 3 5 7

In [5]:
from datetime import datetime

# Four month-first (MM/DD/YYYY) date strings in September 2020, unevenly spaced.
datestr = [f"09/{day}/2020" for day in ("09", "11", "12", "18")]
datestr

['09/09/2020', '09/11/2020', '09/12/2020', '09/18/2020']

In [6]:
# Convert the strings to a DatetimeIndex (pd.to_<type>(data) converts data to that type).
# The format is spelled out because strings such as "09/11/2020" are ambiguous
# between month-first and day-first; this pins the month-first reading.
dates = pd.to_datetime(datestr, format="%m/%d/%Y")
dates

DatetimeIndex(['2020-09-09', '2020-09-11', '2020-09-12', '2020-09-18'], dtype='datetime64[ns]', freq=None)

In [7]:
# Series indexed by the parsed dates; the two middle observations are missing.
ts = pd.Series(data=[1, np.nan, np.nan, 10], index=dates)
ts

2020-09-09     1.0
2020-09-11     NaN
2020-09-12     NaN
2020-09-18    10.0
dtype: float64

In [8]:
# 'linear' is the default method: it ignores the index and treats the points
# as equally spaced.
tsLinear = ts.interpolate(method='linear')
tsLinear

2020-09-09     1.0
2020-09-11     4.0
2020-09-12     7.0
2020-09-18    10.0
dtype: float64

In [9]:
ts.interpolate(method='values') # interpolate in proportion to the index values (here, the time elapsed between dates)

2020-09-09     1.0
2020-09-11     3.0
2020-09-12     4.0
2020-09-18    10.0
dtype: float64

In [10]:
# method='time' interpolates according to the time gaps in the DatetimeIndex;
# on this series it gives the same result as method='values'.
ts.interpolate(method='time')

2020-09-09     1.0
2020-09-11     3.0
2020-09-12     4.0
2020-09-18    10.0
dtype: float64

In [None]:
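# In practice, missing values are commonly imputed with models such as KNN or regression.
# Below: replacing missing values (and other values) with different values.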

In [11]:
# Small float Series whose last entry is missing; used by the replace() examples.
ser = pd.Series(
    data=[1, 2, 3, 4, np.nan],
)
ser

0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
dtype: float64

In [12]:
# Scalar form: swap every 2 for 10.
ser.replace(to_replace=2, value=10)

0     1.0
1    10.0
2     3.0
3     4.0
4     NaN
dtype: float64

In [13]:
# List form: each entry of to_replace is swapped for the value at the same position.
ser.replace(to_replace=[1, 2, 3, 4, np.nan],
            value=[10, 11, 12, 13, 15])

0    10.0
1    11.0
2    12.0
3    13.0
4    15.0
dtype: float64

In [14]:
# Dict form: keys are the values to look for, dict values are their replacements.
ser.replace(to_replace={1: 10, 2: 7, 3: 12, 4: 18, np.nan: 15})

0    10.0
1     7.0
2    12.0
3    18.0
4    15.0
dtype: float64

In [15]:
# Replacing values in a specific DataFrame column: build the sample frame first.
df = pd.DataFrame({
    'c1': ['a_old', 'b', 'c', 'd', 'e'],
    'c2': list(range(1, 6)),
    'c3': [6, 7, 8, 9, np.nan],
})
df

Unnamed: 0,c1,c2,c3
0,a_old,1,6.0
1,b,2,7.0
2,c,3,8.0
3,d,4,9.0
4,e,5,


In [16]:
# In column c1 only, swap 'a_old' for 'a_new'.
df.replace(to_replace={'c1': 'a_old'}, value={'c1': 'a_new'})

Unnamed: 0,c1,c2,c3
0,a_new,1,6.0
1,b,2,7.0
2,c,3,8.0
3,d,4,9.0
4,e,5,


In [17]:
# In column c3 only, swap NaN for 10.
df.replace(to_replace={'c3': np.nan}, value={'c3': 10})

Unnamed: 0,c1,c2,c3
0,a_old,1,6.0
1,b,2,7.0
2,c,3,8.0
3,d,4,9.0
4,e,5,10.0


In [None]:
# Check whether values are duplicated: duplicated()
# Handle duplicates (keep one, drop the rest): drop_duplicates()

In [22]:
# Sample frame whose key1/key2 columns contain repeated values.
df = pd.DataFrame({
    'key1': ['a', 'b', 'b', 'c', 'c'],
    'key2': ['v', 'w', 'w', 'x', 'y'],
    'col': [1, 2, 3, 4, 5],
})
df

Unnamed: 0,key1,key2,col
0,a,v,1
1,b,w,2
2,b,w,3
3,c,x,4
4,c,y,5


In [25]:
# A single column label and a one-element list name the same key column.
# Only the last expression of the cell is displayed.
df.duplicated('key1')
df.duplicated(['key1'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [26]:
# keep='first': the first occurrence of each key1 value is not flagged; later repeats are True.
df.duplicated(subset=['key1'], keep='first')

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [27]:
# keep='last': the last occurrence of each key1 value is not flagged; earlier repeats are True.
df.duplicated(subset=['key1'], keep='last')

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [28]:
# keep=False: every row whose key1 value occurs more than once is flagged True.
df.duplicated(subset=['key1'], keep=False)

0    False
1     True
2     True
3     True
4     True
dtype: bool

In [29]:
# Show the rows that repeat an earlier (key1, key2) combination.
df.loc[df.duplicated(subset=['key1', 'key2'])]

Unnamed: 0,key1,key2,col
2,b,w,3


In [34]:
# keep=False: drop every row whose key1 value is repeated, keeping none of them.
df.drop_duplicates(subset=['key1'], keep=False)

Unnamed: 0,key1,key2,col
0,a,v,1


In [None]:
# unique(): extract the distinct values >>> returned as an array
# value_counts(): count the number of rows per distinct value

In [35]:
# Sample frame for unique()/value_counts(): column b has one missing entry.
df = pd.DataFrame({
    'a': ['a1', 'a1', 'a2', 'a2', 'a3', 'a3'],
    'b': ['b1'] * 4 + ['b2', np.nan],
    'c': [1, 1, 3, 4, 4, 4],
})
df

Unnamed: 0,a,b,c
0,a1,b1,1
1,a1,b1,1
2,a2,b1,3
3,a2,b1,4
4,a3,b2,4
5,a3,,4


In [36]:
# Distinct values of column a, returned as a NumPy array.
df['a'].unique()

array(['a1', 'a2', 'a3'], dtype=object)

In [39]:
df['b'].value_counts() # NaN is ignored
df['b'].value_counts(dropna=True) # same as above: dropna=True is the default
df['b'].value_counts(dropna=False) # NaN gets its own count; only this last expression is displayed

b1     4
NaN    1
b2     1
Name: b, dtype: int64

In [40]:
df['c'].value_counts(normalize=True)
# relative frequency (share of rows) of each distinct value

4    0.500000
1    0.333333
3    0.166667
Name: c, dtype: float64

In [42]:
# Print the counts of column c under each ordering option in turn.
for sort_options in (
    {},                    # defaults
    {'ascending': False},  # same as the default: largest count first
    {'ascending': True},   # smallest count first
    {'sort': False},       # counts only, no ordering by count
):
    print(df['c'].value_counts(**sort_options))

4    3
1    2
3    1
Name: c, dtype: int64
4    3
1    2
3    1
Name: c, dtype: int64
3    1
1    2
4    3
Name: c, dtype: int64
1    2
3    1
4    3
Name: c, dtype: int64


In [43]:
df['b'].value_counts() # NaN excluded (default)
df['b'].value_counts(dropna=True) # NaN excluded (explicit form of the default)
df['b'].value_counts(dropna=False) # NaN counted as its own entry; only this last expression is displayed

b1     4
NaN    1
b2     1
Name: b, dtype: int64

In [44]:
df['c'].value_counts(bins=[1,2,3,4]) # bins >> interval edges used to group the values
# ( ) : open end of an interval, [ ] : closed end

(3.0, 4.0]      3
(0.999, 2.0]    2
(2.0, 3.0]      1
Name: c, dtype: int64

In [45]:
df['c'].value_counts(bins=[1,2,3,4],sort=False) # bins >> interval edges used to group the values
# sort=False: the result is not sorted by count

(0.999, 2.0]    2
(2.0, 3.0]      1
(3.0, 4.0]      3
Name: c, dtype: int64

In [46]:
# Display column c as-is.
df['c']

0    1
1    1
2    3
3    4
4    4
5    4
Name: c, dtype: int64

In [47]:
# Assign each value of c to one of the intervals (0, 1], (1, 2], ..., (4, 5]
# defined by consecutive bin edges.
out = pd.cut(df['c'], bins=list(range(6)))
out

0    (0, 1]
1    (0, 1]
2    (2, 3]
3    (3, 4]
4    (3, 4]
5    (3, 4]
Name: c, dtype: category
Categories (5, interval[int64]): [(0, 1] < (1, 2] < (2, 3] < (3, 4] < (4, 5]]

In [48]:
# Count the members of each interval with the Series method; the top-level
# pd.value_counts() helper is deprecated in recent pandas releases.
# Empty intervals are still listed with a count of 0 because `out` is categorical.
out.value_counts()

(3, 4]    3
(0, 1]    2
(2, 3]    1
(1, 2]    0
(4, 5]    0
Name: c, dtype: int64