# **데이터 전처리_이상치 처리**

## **이상치**
- 데이터의 표본 공간에서 발생하는 outlier, 데이터 범위를 벗어난 값
- 통계적으로는 Q1 - 1.5×IQR보다 작거나 Q3 + 1.5×IQR보다 큰 값을 이상치로 판단함
- IQR(사분위수 범위) : 제3사분위수(Q3) - 제1사분위수(Q1)

## **이상치 처리**
- 이상치 탐지 및 접근 : IQR에 대한 이해
- 이상치 행 제거, 열 제거
- 이상치 단순 대체
- 통계량을 활용한 이상치 대체

## **Data : CAR_CRASHES 내장 데이터**
### **데이터 설명**
- total: Number of drivers involved in fatal collisions per billion miles (5.900–23.900)
- speeding: Percentage Of Drivers Involved In Fatal Collisions Who Were Speeding (1.792–9.450)
- alcohol: Percentage Of Drivers Involved In Fatal Collisions Who Were Alcohol-Impaired (1.593–10.038)
- not_distracted: Percentage Of Drivers Involved In Fatal Collisions Who Were Not Distracted (1.760–23.661)
- no_previous: Percentage Of Drivers Involved In Fatal Collisions Who Had Not Been Involved In Any Previous Accidents (5.900–21.280)
- ins_premium: Car Insurance Premiums (641.960–1301.520)
- ins_losses: Losses incurred by insurance companies for collisions per insured driver (82.75–194.780)


In [1]:
!pip install seaborn



In [2]:
from seaborn import load_dataset
import numpy as np
import pandas as pd

In [3]:
# Load the 'car_crashes' example dataset through seaborn's load_dataset.
car_df = load_dataset('car_crashes')
# Preview the first rows of the frame.
car_df.head()

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA


In [4]:
# Print the column names, non-null counts and dtypes.
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   total           51 non-null     float64
 1   speeding        51 non-null     float64
 2   alcohol         51 non-null     float64
 3   not_distracted  51 non-null     float64
 4   no_previous     51 non-null     float64
 5   ins_premium     51 non-null     float64
 6   ins_losses      51 non-null     float64
 7   abbrev          51 non-null     object 
dtypes: float64(7), object(1)
memory usage: 3.3+ KB


## **분위수 구하기**

In [5]:
# First (25%) and third (75%) quartiles of the 'alcohol' column, taken in one call.
q1, q3 = np.quantile(car_df['alcohol'], [0.25, 0.75])
print(q1, q3)

3.894 5.603999999999999


In [6]:
# Compute the IQR (interquartile range): third quartile minus first quartile.
iqr = q3 - q1
print(iqr)

1.709999999999999


## **이상치 cut off 정의하기**

In [9]:
# Outlier cut-offs: 1.5 * IQR below the first quartile and above the third quartile.
btom_cut_off = q1 - 1.5 * iqr
top_cut_off = q3 + 1.5 * iqr
print("<소수점 처리 전>")
print("이상치 기준1 (하단) :", btom_cut_off, "\n이상치 기준2 (상단) :", top_cut_off)
print("---------------------------------------------------")
print("<소수점 처리 후>")
# Round to four decimals only when printing, so both cut-offs stay numeric
# and can still be used in comparisons afterwards.
print("이상치 기준1 (하단) :", f"{btom_cut_off:.4f}", "\n이상치 기준2 (상단) :", f"{top_cut_off:.4f}")

<소수점 처리 전>
이상치 기준1 (하단) : 1.3290000000000015 
이상치 기준2 (상단) : 8.168999999999997
---------------------------------------------------
<소수점 처리 후>
이상치 기준1 (하단) : 1.3290 
이상치 기준2 (상단) : 8.1690


In [10]:
# Boolean mask: True where 'alcohol' is at or above the upper cut-off, or at or below the lower one.
car_df['alcohol'].ge(8.1690) | car_df['alcohol'].le(1.3290)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26     True
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34     True
35    False
36    False
37    False
38    False
39    False
40     True
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
Name: alcohol, dtype: bool

In [12]:
# Look up the 'alcohol' value at row label 26 to see how extreme a flagged value is.
car_df.loc[26, 'alcohol']

9.416

## **이상치 조회**

In [13]:
# Rows whose 'alcohol' value is at or beyond either cut-off.
car_out = car_df.loc[car_df['alcohol'].ge(8.1690) | car_df['alcohol'].le(1.3290)]

In [14]:
# Rows strictly inside the cut-offs. The outlier filter uses >= / <=, so strict
# comparisons here keep a value that sits exactly on a cut-off out of this frame
# instead of placing it in both.
car_not_outlier = car_df[(car_df['alcohol'] < 8.1690) & (car_df['alcohol'] > 1.3290)]

In [15]:
# Show up to the first 20 non-outlier rows.
car_not_outlier.head(20)

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA
5,13.6,5.032,3.808,10.744,12.92,835.5,139.91,CO
6,10.8,4.968,3.888,9.396,8.856,1068.73,167.02,CT
7,16.2,6.156,4.86,14.094,16.038,1137.87,151.48,DE
8,5.9,2.006,1.593,5.9,5.9,1273.89,136.05,DC
9,17.9,3.759,5.191,16.468,16.826,1160.13,144.18,FL


In [16]:
# Display the rows selected as outliers.
car_out

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
26,21.4,8.346,9.416,17.976,18.19,816.21,85.15,MT
34,23.9,5.497,10.038,23.661,20.554,688.75,109.72,ND
40,23.9,9.082,9.799,22.944,19.359,858.97,116.29,SC


In [18]:
# Renumber the rows from 0 and discard the old labels. The assignment form is used
# so the filtered frame is not mutated in place.
car_out = car_out.reset_index(drop=True)
car_out

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,21.4,8.346,9.416,17.976,18.19,816.21,85.15,MT
1,23.9,5.497,10.038,23.661,20.554,688.75,109.72,ND
2,23.9,9.082,9.799,22.944,19.359,858.97,116.29,SC


In [20]:
# Renumber the rows from 0 and discard the old labels. The assignment form is used
# so the filtered frame is not mutated in place.
car_not_outlier = car_not_outlier.reset_index(drop=True)
car_not_outlier

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA
5,13.6,5.032,3.808,10.744,12.92,835.5,139.91,CO
6,10.8,4.968,3.888,9.396,8.856,1068.73,167.02,CT
7,16.2,6.156,4.86,14.094,16.038,1137.87,151.48,DE
8,5.9,2.006,1.593,5.9,5.9,1273.89,136.05,DC
9,17.9,3.759,5.191,16.468,16.826,1160.13,144.18,FL


## **이상치 인덱스를 추출해서 값을 대체하는 방법**

In [24]:
# Index labels of the rows whose 'alcohol' value is at or beyond the 1.5 * IQR cut-offs.
# The cut-offs are derived from q1, q3 and iqr instead of retyping rounded literals.
outlier_index = car_df[
    (car_df['alcohol'] >= q3 + 1.5 * iqr) | (car_df['alcohol'] <= q1 - 1.5 * iqr)
].index

In [25]:
# Display the index labels of the outlier rows.
outlier_index

Index([26, 34, 40], dtype='int64')

In [26]:
# Preview the frame with the outlier rows removed by label; the result is not
# assigned, so car_df itself keeps all rows.
car_df.drop(index=outlier_index).head(30)

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA
5,13.6,5.032,3.808,10.744,12.92,835.5,139.91,CO
6,10.8,4.968,3.888,9.396,8.856,1068.73,167.02,CT
7,16.2,6.156,4.86,14.094,16.038,1137.87,151.48,DE
8,5.9,2.006,1.593,5.9,5.9,1273.89,136.05,DC
9,17.9,3.759,5.191,16.468,16.826,1160.13,144.18,FL


In [27]:
# Simple replacement: overwrite the 'alcohol' value of every outlier row with the constant 100.
car_df.loc[outlier_index,'alcohol'] = 100
# Show up to the first 30 rows after the replacement.
car_df.head(30)

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA
5,13.6,5.032,3.808,10.744,12.92,835.5,139.91,CO
6,10.8,4.968,3.888,9.396,8.856,1068.73,167.02,CT
7,16.2,6.156,4.86,14.094,16.038,1137.87,151.48,DE
8,5.9,2.006,1.593,5.9,5.9,1273.89,136.05,DC
9,17.9,3.759,5.191,16.468,16.826,1160.13,144.18,FL


In [28]:
# Replace the outlier rows' 'alcohol' values with the column median, computed once and reused.
# NOTE(review): the median is taken over the column as it currently stands; if the
# constant-replacement cell has already run, the outlier rows hold 100 and are
# included in it. Confirm this is intended.
alcohol_median = np.median(car_df['alcohol'])
print(alcohol_median)
car_df.loc[outlier_index, 'alcohol'] = alcohol_median

4.554


In [29]:
# Display the frame after the outlier values have been replaced.
car_df

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ
3,22.4,4.032,5.824,21.056,21.28,827.34,142.39,AR
4,12.0,4.2,3.36,10.92,10.68,878.41,165.63,CA
5,13.6,5.032,3.808,10.744,12.92,835.5,139.91,CO
6,10.8,4.968,3.888,9.396,8.856,1068.73,167.02,CT
7,16.2,6.156,4.86,14.094,16.038,1137.87,151.48,DE
8,5.9,2.006,1.593,5.9,5.9,1273.89,136.05,DC
9,17.9,3.759,5.191,16.468,16.826,1160.13,144.18,FL
