In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Load the Titanic dataset
df_titanic = sns.load_dataset('titanic')
df_titanic.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


# ▶ NaN 파악하기

## 1. info()로 NaN 개수 파악하기

In [4]:
# Check for NaN: info() lists the non-null count of every column
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


## 2. value_counts()로 NaN 개수 파악
- NaN을 포함해서 개수 파악하기 (기본값 `dropna=True`는 NaN을 제외하고 셈)
```python
dropna=False
dropna=""
dropna=0
```

In [7]:
# dropna=False keeps NaN as its own entry in the tally
# (the default, dropna=True, leaves NaN out).
df_titanic['deck'].value_counts(dropna=False)  # count every value, NaN included

NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64

## 3. isnull()로 NaN 유무 파악
: column 단위로 계산  
모듈이 컬럼 단위로 계산하는 모듈이라면 axis=0은 column, axis=1은 row를 의미 


```python
isnull().all()   # 각각의 열의 데이터가 모두 null인지를 확인해서 True, False 확인
isnull().any()   # 열에서 하나라도 null이 있으면 True
```

In [12]:
# Element-wise mask for the first five rows: True where the value is missing
df_titanic.head().isnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


### 데이터프레임에서 하나라도 null이 있으면 True 반환 : ```isnull().any().any()```

In [15]:
# The first any() reduces each column to one bool; the second any() reduces those to a single bool
df_titanic.head().isnull().any().any()

True

In [16]:
# Small example frame with a single missing value (row 2, column 0)
d = pd.DataFrame([[1,2,3],[4,5,6],[None, 7,8]])

In [17]:
# Element-wise mask: True marks the missing cell
d.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,False
2,True,False,False


In [18]:
# Per column: True only when every value in that column is null
d.isnull().all()

0    False
1    False
2    False
dtype: bool

In [19]:
# Per column: True when at least one value in that column is null
d.isnull().any()

0     True
1    False
2    False
dtype: bool

In [20]:
# Single bool: True when the frame holds a null anywhere
d.isnull().any().any()

True

## 4. 각 열에서 NaN인 데이터의 개수 반환 : ```sum(), count()```
모듈이 컬럼 단위로 계산하는 모듈이라면 axis=0은 column, axis=1은 row를 의미
- ```count()``` : 각 열에서 NaN이 아닌 값의 개수를 카운팅. dropna 같은 옵션은 없음. **count() 자체는 NaN을 배제하고 카운팅**

In [21]:
# NaN count per column: summing the boolean mask counts the True entries
df_titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [22]:
# Explicit axis=0 gives the same result as the default: one total per column
df_titanic.isnull().sum(axis=0)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [23]:
# axis=1 sums across the columns, giving the NaN count of each row
df_titanic.isnull().sum(axis=1)

0      1
1      0
2      1
3      0
4      1
      ..
886    1
887    0
888    2
889    0
890    1
Length: 891, dtype: int64

In [25]:
# count() tallies the non-null values of each column
df_titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

pd.시리즈는 연산 가능.   
ex) 시리즈 + 10

In [26]:
len(df_titanic) - df_titanic.count()  # total rows minus non-null count = NaN count per column

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

## 5. NaN이 너무 많다면 : 그 열은 버리는 게 나음
titanic의 경우 deck 열은 891개 중 688개가 NaN이어서 모델 성능에 악영향을 줄 수 있으므로, 해당 컬럼은 삭제하는 게 나음

In [31]:
# Drop the mostly-empty deck column; the result is a new frame bound to df_drop
df_drop = df_titanic.drop(columns=["deck"])
df_drop.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


## 6. NaN이 몇 개 이상인 "열"을 삭제하고 싶을 때
```python
df.dropna(axis=?, thresh=n)
```
- axis : 0=row, 1=column
- thresh : non-null인 데이터가 n개 미만이라면 그 축을 삭제
    - thresh 기준 : NaN이 아닌 값이 최소 몇 개 이상 나와야 함. 그보다 적으면 drop

In [39]:
df_thresh = df_titanic.dropna(axis=1, thresh=500)  # drop columns that have fewer than 500 non-null values
df_thresh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB


## 7. NaN이 있는 "행"을 삭제하고 싶을 때

In [40]:
# Drop the rows whose age value is NaN
df_titanic.dropna(subset=['age'], axis=0, how='any')  # with a single subset column, how='all' selects the same rows

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## 8. NaN을 값으로 채워넣기
1. mean(axis 옵션) : 0=col, 1=row
2. fillna() : NaN replace value
3. ffill, bfill (forward fill: 앞의 값으로 채움, backward fill: 뒤의 값으로 채움)

In [45]:
# Compute the mean of the age column; it is used to fill the NaN values in age
mean_age = df_titanic['age'].mean()
mean_age

29.69911764705882

In [51]:
# fillna: replace each NaN in age with mean_age (the result is a Series, not a DataFrame)
df_fillna = df_titanic['age'].fillna(mean_age)
df_fillna.head(6)

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
Name: age, dtype: float64

In [52]:
# Forward fill: each NaN takes the most recent preceding non-null value.
# Series.ffill() is used instead of fillna(method="ffill"), which recent
# pandas releases deprecate; the filled result is the same.
df_titanic['embark_town'].ffill()

0      Southampton
1        Cherbourg
2      Southampton
3      Southampton
4      Southampton
          ...     
886    Southampton
887    Southampton
888    Southampton
889      Cherbourg
890     Queenstown
Name: embark_town, Length: 891, dtype: object

---
# ▶ 펭귄 데이터로 실습

In [53]:
# Load the penguins dataset and inspect the non-null count of each column
df_pg = sns.load_dataset("penguins")
df_pg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


## 1) 결측치 데이터 NaN 처리 
- 성별 11개의 데이터가 NaN. 성별은 유추해서 정하기가 어려움

In [57]:
# Count the missing values in each column
df_pg.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

### ▷ True, False로 이루어진 mask 생성

In [62]:
mask = df_pg.isnull().any(axis=1)  # axis=1 reduces across the columns, producing one bool per row
mask  # True when the row holds at least one missing value

0      False
1      False
2      False
3       True
4      False
       ...  
339     True
340    False
341    False
342    False
343    False
Length: 344, dtype: bool

In [65]:
# Boolean indexing: keep only the rows flagged True in mask
df_pg[mask]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
3,Adelie,Torgersen,,,,,
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,
10,Adelie,Torgersen,37.8,17.1,186.0,3300.0,
11,Adelie,Torgersen,37.8,17.3,180.0,3700.0,
47,Adelie,Dream,37.5,18.9,179.0,2975.0,
246,Gentoo,Biscoe,44.5,14.3,216.0,4100.0,
286,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,
324,Gentoo,Biscoe,47.3,13.8,216.0,4725.0,
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,


In [70]:
# Goal: drop rows 3 and 339 (the rows missing most values) plus the sex column.
# Note that dropna(how='any') removes every row containing any NaN, so rows
# that are only missing sex are dropped as well.
df_pg.dropna(axis=0, how='any')  # open question from the author: how to target specific index labels?

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [71]:
df_pg.drop(columns=["sex"])  # drop the sex column
# equivalent: df_pg.drop("sex", axis=1)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
3,Adelie,Torgersen,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0
...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0


In [92]:
# isnull() returns a frame with the same index as df_pg, so reading .index
# from it yields the full row index, not the labels of the NaN rows.
# Reading df_pg.index directly gives the same value without building the mask.
df_pg.index

RangeIndex(start=0, stop=344, step=1)

In [94]:
# Drop rows 3 and 339 by index label, and the sex column, in a single drop() call
pg_nan = df_pg.drop(index=[3, 339], columns=['sex'])
pg_nan

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0
...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0


In [124]:
# Same kind of drop without hard-coded labels: look up the index of the rows
# whose bill_length_mm is NaN and pass it to drop()
df_pg.drop(index=df_pg[df_pg["bill_length_mm"].isnull()].index, 
           columns=['sex']) 

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0
...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0


## 2) 중복 데이터 처리
- ```duplicated()``` : 중복 데이터 확인. 앞에 나온 행과 모든 값이 동일한 행이면 True 반환 (처음 등장한 행은 False)
    - 한 두건이라면 괜찮지만 중복되는 데이터가 많으면 편향이 일어날 수 있으므로 기본적으로 삭제하는 편.
- ```drop_duplicates()``` : 중복 데이터 삭제하기

In [97]:
# Flag rows that repeat an earlier row
pg_nan.duplicated()

0      False
1      False
2      False
4      False
5      False
       ...  
338    False
340    False
341    False
342    False
343    False
Length: 342, dtype: bool

In [105]:
# Find the largest body mass so a copy of that record can be added as a duplicate
max_weight = pg_nan["body_mass_g"].max()
max_weight

6300.0

In [101]:
pg_nan.tail(2)  # check the last index label, since a row will be appended after it

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0
343,Gentoo,Biscoe,49.9,16.1,213.0,5400.0


In [106]:
# Show the row(s) whose body mass equals the maximum
pg_nan[pg_nan['body_mass_g']==max_weight]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
237,Gentoo,Biscoe,49.2,15.2,221.0,6300.0


In [115]:
# Add a copy of the heaviest penguin's record under the new label 344 to create a duplicate row
pg_nan.loc[344] = ["Gentoo", "Biscoe", 49.2, 15.2, 221.0, 6300.0]

In [118]:
pg_nan.tail(3)  # confirm the appended row

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0
343,Gentoo,Biscoe,49.9,16.1,213.0,5400.0
344,Gentoo,Biscoe,49.2,15.2,221.0,6300.0


In [117]:
pg_nan.duplicated()  # True marks a row that repeats an earlier row

0      False
1      False
2      False
4      False
5      False
       ...  
340    False
341    False
342    False
343    False
344     True
Length: 343, dtype: bool

In [126]:
# Remove duplicate rows with drop_duplicates(); the result is reassigned to pg_nan
# (passing inplace=True is the alternative the original note mentions).
pg_nan = pg_nan.drop_duplicates()
pg_nan

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0
...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0


In [121]:
# Example frame in which the row [1, 2, 3] appears three times, not all adjacent
d = pd.DataFrame([[1,2,3],[1,2,3],[4,5,6],[1,2,3]])  

In [122]:
d.duplicated()  # a repeated row is flagged True even when its match is not adjacent

0    False
1     True
2    False
3     True
dtype: bool

## 3) 이상치 데이터 처리
- ```z-score``` : $z = (x - \mu) / \sigma$, 즉 ( 데이터 - 평균 ) / 표준편차 : 평균으로부터 얼마나 떨어져 있는지를 표준편차를 이용해 설명하는 척도
- ```np.mean()``` : 평균, ```np.std()``` : 표준편차

In [125]:
import numpy as np

In [127]:
# z-score가 -2 이하, 2 이상 되는 데이터 (DataFrame)을 반환하는 함수
def outlier(df, col, z_threshold):
    """Return the index labels of rows whose ``col`` value is a z-score outlier.

    The z-score is ``|value - mean| / std``, i.e. how many standard deviations
    a value lies from the column mean (a z-score of 1.67 means the value is
    1.67 standard deviations away). The mean and standard deviation come from
    ``np.mean`` and ``np.std`` applied to the column.

    Args:
        df: DataFrame that holds the column to examine.
        col: Label of the numeric column.
        z_threshold: Cut-off; a row is reported when its z-score is strictly
            greater than this value (e.g. 2 for "more than 2 std away").

    Returns:
        The index of the rows of ``df`` whose z-score exceeds ``z_threshold``.
    """
    values = df[col]
    center = np.mean(values)
    spread = np.std(values)
    # Absolute distance from the mean, expressed in standard deviations
    distance = abs(values - center) / spread
    is_far = distance > z_threshold
    return df.loc[is_far].index

In [129]:
def inlier(df, col, z_threshold):
    """Return the index labels of rows whose ``col`` value is within the z-score cut-off.

    Gives the labels to keep directly, so no separate drop step is needed.
    The z-score is ``|value - mean| / std``, with the mean and standard
    deviation taken from ``np.mean`` and ``np.std`` applied to the column.

    Args:
        df: DataFrame that holds the column to examine.
        col: Label of the numeric column.
        z_threshold: Cut-off; a row is kept when its z-score is less than or
            equal to this value.

    Returns:
        The index of the rows of ``df`` whose z-score is at most ``z_threshold``.
    """
    column = df[col]
    # Absolute distance from the mean, expressed in standard deviations
    z = abs(column - np.mean(column)) / np.std(column)
    keep = z <= z_threshold
    return df.loc[keep].index

In [128]:
# Show the rows whose body_mass_g z-score is greater than 2
pg_nan.loc[outlier(pg_nan, 'body_mass_g', 2)]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
233,Gentoo,Biscoe,48.4,14.6,213.0,5850.0
235,Gentoo,Biscoe,49.3,15.7,217.0,5850.0
237,Gentoo,Biscoe,49.2,15.2,221.0,6300.0
253,Gentoo,Biscoe,59.6,17.0,230.0,6050.0
297,Gentoo,Biscoe,51.1,16.3,220.0,6000.0
299,Gentoo,Biscoe,45.2,16.4,223.0,5950.0
331,Gentoo,Biscoe,49.8,15.9,229.0,5950.0
335,Gentoo,Biscoe,55.1,16.0,230.0,5850.0
337,Gentoo,Biscoe,48.8,16.2,222.0,6000.0
