In [1]:
# 필요한 라이브러리 import
import warnings
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.core.interactiveshell import InteractiveShell

# 경고 메시지 무시
warnings.filterwarnings('ignore')

# 모든 출력 표시
InteractiveShell.ast_node_interactivity = "all"

# 마이너스 기호 깨짐 방지
plt.rcParams['axes.unicode_minus'] = False

# 한글 폰트 설정 (Mac)
mpl.rcParams['font.family'] = "AppleGothic" # 윈도우는 'Malgun Gothic' 사용

In [2]:
mpg = sns.load_dataset("mpg")
mpg

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [3]:
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [4]:
mpg[mpg.isna().any(axis=1)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
32,25.0,4,98.0,,2046,19.0,71,usa,ford pinto
126,21.0,6,200.0,,2875,17.0,74,usa,ford maverick
330,40.9,4,85.0,,1835,17.3,80,europe,renault lecar deluxe
336,23.6,4,140.0,,2905,14.3,80,usa,ford mustang cobra
354,34.5,4,100.0,,2320,15.8,81,europe,renault 18i
374,23.0,4,151.0,,3035,20.5,82,usa,amc concord dl


In [5]:
mpg["horsepower"] = mpg.groupby("cylinders")["horsepower"].transform(
    lambda x: x.fillna(x.median())
)

mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


### 변수 유형 변환 : 변수 유형은 왜 고민해야 할까요?

In [6]:
mpg["origin"] = mpg["origin"].astype("category")
mpg["name"]   = mpg["name"].astype("category")

mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float64 
 1   cylinders     398 non-null    int64   
 2   displacement  398 non-null    float64 
 3   horsepower    398 non-null    float64 
 4   weight        398 non-null    int64   
 5   acceleration  398 non-null    float64 
 6   model_year    398 non-null    int64   
 7   origin        398 non-null    category
 8   name          398 non-null    category
dtypes: category(2), float64(4), int64(3)
memory usage: 33.7 KB


### 범주형 데이터 범주 보기

In [7]:
mpg["origin"].value_counts()
mpg["name"].value_counts(normalize=True)

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

name
ford pinto              0.015075
ford maverick           0.012563
amc matador             0.012563
toyota corolla          0.012563
chevrolet impala        0.010050
                          ...   
dodge st. regis         0.002513
fiat 124 sport coupe    0.002513
fiat 124 tc             0.002513
fiat 124b               0.002513
vw rabbit custom        0.002513
Name: proportion, Length: 305, dtype: float64

### 데이터 전처리

#### 행추출

In [8]:
# 4기통만 추출
mpg[mpg["cylinders"] == 4]

# 4기통과 6기통 추출
mpg[(mpg["cylinders"] == 4) | (mpg["cylinders"] == 6)]

# isin 함수 사용
mpg[mpg["cylinders"].isin([4, 6])]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
14,24.0,4,113.0,95.0,2372,15.0,70,japan,toyota corona mark ii
18,27.0,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
19,26.0,4,97.0,46.0,1835,20.5,70,europe,volkswagen 1131 deluxe sedan
20,25.0,4,110.0,87.0,2672,17.5,70,europe,peugeot 504
21,24.0,4,107.0,90.0,2430,14.5,70,europe,audi 100 ls
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
14,24.0,4,113.0,95.0,2372,15.0,70,japan,toyota corona mark ii
15,22.0,6,198.0,95.0,2833,15.5,70,usa,plymouth duster
16,18.0,6,199.0,97.0,2774,15.5,70,usa,amc hornet
17,21.0,6,200.0,85.0,2587,16.0,70,usa,ford maverick
18,27.0,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
14,24.0,4,113.0,95.0,2372,15.0,70,japan,toyota corona mark ii
15,22.0,6,198.0,95.0,2833,15.5,70,usa,plymouth duster
16,18.0,6,199.0,97.0,2774,15.5,70,usa,amc hornet
17,21.0,6,200.0,85.0,2587,16.0,70,usa,ford maverick
18,27.0,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [9]:
# 연비 평균보다 높은 행 추출
mpg["mpg"].mean()
mpg[mpg["mpg"] > mpg["mpg"].mean()]

# 그룹별 연비 평균보다 높은 행 추출
mpg.groupby("cylinders")["mpg"].mean()
mpg[mpg["mpg"] > mpg.groupby("cylinders")["mpg"].transform("mean")]

np.float64(23.514572864321607)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
14,24.0,4,113.0,95.0,2372,15.0,70,japan,toyota corona mark ii
18,27.0,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
19,26.0,4,97.0,46.0,1835,20.5,70,europe,volkswagen 1131 deluxe sedan
20,25.0,4,110.0,87.0,2672,17.5,70,europe,peugeot 504
21,24.0,4,107.0,90.0,2430,14.5,70,europe,audi 100 ls
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


cylinders
3    20.550000
4    29.286765
5    27.366667
6    19.985714
8    14.963107
Name: mpg, dtype: float64

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
390,32.0,4,144.0,96.0,2665,13.9,82,japan,toyota celica gt
391,36.0,4,135.0,84.0,2370,13.0,82,usa,dodge charger 2.2
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage


### groupby 예제

In [10]:
# 특정 컬럼만
mpg.groupby("cylinders")["mpg"].mean()

# 여러 컬럼
mpg.groupby("cylinders")[["mpg", "horsepower"]].mean()

# 여러 통계량
mpg.groupby("cylinders")["mpg"].agg(["mean", "median", "std"])

cylinders
3    20.550000
4    29.286765
5    27.366667
6    19.985714
8    14.963107
Name: mpg, dtype: float64

Unnamed: 0_level_0,mpg,horsepower
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1
3,20.55,99.25
4,29.286765,78.27451
5,27.366667,82.333333
6,19.985714,101.488095
8,14.963107,158.300971


Unnamed: 0_level_0,mean,median,std
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,20.55,20.25,2.564501
4,29.286765,28.25,5.710156
5,27.366667,25.4,8.228204
6,19.985714,19.0,3.807322
8,14.963107,14.0,2.836284


### 열추출

In [11]:
# 변수명 사용
mpg[["mpg", "cylinders"]]

# 수치형 변수만 선택
mpg.select_dtypes(include = 'number')
mpg.select_dtypes(include = 'int64')

# 정규 표현식 사용
mpg.filter(regex = "^o")    # "o"로 시작하는 변수
mpg.filter(regex = "t$")    # "t"로 끝나는 변수
mpg.filter(regex = "_")     # "_"가 포함된 변수

Unnamed: 0,mpg,cylinders
0,18.0,8
1,15.0,8
2,18.0,8
3,16.0,8
4,17.0,8
...,...,...
393,27.0,4
394,44.0,4
395,32.0,4
396,28.0,4


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70
...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82
394,44.0,4,97.0,52.0,2130,24.6,82
395,32.0,4,135.0,84.0,2295,11.6,82
396,28.0,4,120.0,79.0,2625,18.6,82


Unnamed: 0,cylinders,weight,model_year
0,8,3504,70
1,8,3693,70
2,8,3436,70
3,8,3433,70
4,8,3449,70
...,...,...,...
393,4,2790,82
394,4,2130,82
395,4,2295,82
396,4,2625,82


Unnamed: 0,origin
0,usa
1,usa
2,usa
3,usa
4,usa
...,...
393,usa
394,europe
395,usa
396,usa


Unnamed: 0,displacement,weight
0,307.0,3504
1,350.0,3693
2,318.0,3436
3,304.0,3433
4,302.0,3449
...,...,...
393,140.0,2790
394,97.0,2130
395,135.0,2295
396,120.0,2625


Unnamed: 0,model_year
0,70
1,70
2,70
3,70
4,70
...,...
393,82
394,82
395,82
396,82


### 정렬

In [12]:
mpg.sort_values("cylinders", ascending = False)
mpg.sort_values(["cylinders", "mpg"], ascending = [True, False])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
86,14.0,8,304.0,150.0,3672,11.5,73,usa,amc matador
230,15.5,8,350.0,170.0,4165,11.4,77,usa,chevrolet monte carlo landau
231,15.5,8,400.0,190.0,4325,12.2,77,usa,chrysler cordoba
232,16.0,8,351.0,149.0,4335,14.5,77,usa,ford thunderbird
...,...,...,...,...,...,...,...,...,...
81,28.0,4,97.0,92.0,2288,17.0,72,japan,datsun 510 (sw)
243,21.5,3,80.0,110.0,2720,13.5,77,japan,mazda rx-4
334,23.7,3,70.0,100.0,2420,12.5,80,japan,mazda rx-7 gs
71,19.0,3,70.0,97.0,2330,13.5,72,japan,mazda rx2 coupe


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
334,23.7,3,70.0,100.0,2420,12.5,80,japan,mazda rx-7 gs
243,21.5,3,80.0,110.0,2720,13.5,77,japan,mazda rx-4
71,19.0,3,70.0,97.0,2330,13.5,72,japan,mazda rx2 coupe
111,18.0,3,70.0,90.0,2124,13.5,73,japan,maxda rx3
322,46.6,4,86.0,65.0,2110,17.9,80,japan,mazda glc
...,...,...,...,...,...,...,...,...,...
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala
124,11.0,8,350.0,180.0,3664,11.0,73,usa,oldsmobile omega
25,10.0,8,360.0,215.0,4615,14.0,70,usa,ford f250
26,10.0,8,307.0,200.0,4376,15.0,70,usa,chevy c20


### 변수 생성

In [13]:
mpg["kml"] = (mpg["mpg"] * 0.425).round(2)
mpg

mpg.sort_values(["cylinders", "kml"], ascending=[True, False])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kml
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,7.65
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,6.38
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,7.65
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,6.80
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,7.22
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl,11.48
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup,18.70
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage,13.60
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger,11.90


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kml
334,23.7,3,70.0,100.0,2420,12.5,80,japan,mazda rx-7 gs,10.07
243,21.5,3,80.0,110.0,2720,13.5,77,japan,mazda rx-4,9.14
71,19.0,3,70.0,97.0,2330,13.5,72,japan,mazda rx2 coupe,8.07
111,18.0,3,70.0,90.0,2124,13.5,73,japan,maxda rx3,7.65
322,46.6,4,86.0,65.0,2110,17.9,80,japan,mazda glc,19.80
...,...,...,...,...,...,...,...,...,...,...
103,11.0,8,400.0,150.0,4997,14.0,73,usa,chevrolet impala,4.68
124,11.0,8,350.0,180.0,3664,11.0,73,usa,oldsmobile omega,4.68
25,10.0,8,360.0,215.0,4615,14.0,70,usa,ford f250,4.25
26,10.0,8,307.0,200.0,4376,15.0,70,usa,chevy c20,4.25


### 그룹별 요약

In [14]:
# 단변수 요약
mpg.groupby("cylinders")["kml"].mean().reset_index()

# 2개 변수 선택 요약
mpg.groupby(["cylinders", "origin"])["kml"].mean().reset_index()

# 수치형 데이터 모두 요약
num_var = mpg.select_dtypes(include = "number")
num_var.groupby('cylinders').mean()

# 여러 통계량 한번에
mpg.groupby('cylinders')['mpg'].agg(['count', 'mean', 'median', 'min', 'max', 'std'])

Unnamed: 0,cylinders,kml
0,3,8.7325
1,4,12.446716
2,5,11.633333
3,6,8.492976
4,8,6.358932


Unnamed: 0,cylinders,origin,kml
0,3,europe,
1,3,japan,8.7325
2,3,usa,
3,4,europe,12.074127
4,4,japan,13.428116
5,4,usa,11.832222
6,5,europe,11.633333
7,5,japan,
8,5,usa,
9,6,europe,8.54


Unnamed: 0_level_0,mpg,displacement,horsepower,weight,acceleration,model_year,kml
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,20.55,72.5,99.25,2398.5,13.25,75.5,8.7325
4,29.286765,109.796569,78.27451,2308.127451,16.601471,77.073529,12.446716
5,27.366667,145.0,82.333333,3103.333333,18.633333,79.0,11.633333
6,19.985714,218.142857,101.488095,3198.22619,16.263095,75.928571,8.492976
8,14.963107,345.009709,158.300971,4114.718447,12.95534,73.902913,6.358932


Unnamed: 0_level_0,count,mean,median,min,max,std
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,4,20.55,20.25,18.0,23.7,2.564501
4,204,29.286765,28.25,18.0,46.6,5.710156
5,3,27.366667,25.4,20.3,36.4,8.228204
6,84,19.985714,19.0,15.0,38.0,3.807322
8,103,14.963107,14.0,9.0,26.6,2.836284


In [15]:
# 사용자 정의 함수
mpg.groupby('cylinders')['mpg'].agg(['mean', ('range', lambda x: x.max() - x.min())])

# 백분위수
mpg.groupby('cylinders')['mpg'].agg(['mean', ('Q1', lambda x: x.quantile(0.25)), ('Q3', lambda x: x.quantile(0.75))])

Unnamed: 0_level_0,mean,range
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1
3,20.55,5.7
4,29.286765,28.6
5,27.366667,16.1
6,19.985714,23.0
8,14.963107,17.6


Unnamed: 0_level_0,mean,Q1,Q3
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,20.55,18.75,22.05
4,29.286765,25.0,33.0
5,27.366667,22.85,30.9
6,19.985714,18.0,21.0
8,14.963107,13.0,16.0


### 정규화는 지금 해도 되지만 피처 엔지니어링할 때 해도 됩니다.

In [16]:

# 그룹별 정규화 (z-score)
mpg['mpg_zscore'] = mpg.groupby('cylinders')['mpg'].transform(lambda x: (x - x.mean()) / x.std())
mpg

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kml,mpg_zscore
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,7.65,1.070730
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,6.38,0.013008
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,7.65,1.070730
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,6.80,0.365582
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,7.22,0.718156
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl,11.48,-0.400473
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup,18.70,2.576678
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage,13.60,0.475160
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger,11.90,-0.225347


#### join, concat

In [17]:
# join tables
a = {
  "이름" : ["철수", "영희"],
  "나이" : [3, 5] }

b = pd.DataFrame.from_dict(a)

a
b


{'이름': ['철수', '영희'], '나이': [3, 5]}

Unnamed: 0,이름,나이
0,철수,3
1,영희,5


In [18]:
type(b)

pandas.core.frame.DataFrame

In [19]:
c = pd.DataFrame({
  "이름" : ["철수", "길동"],
  "주소" : ["분당", "일산"]})
c

Unnamed: 0,이름,주소
0,철수,분당
1,길동,일산


In [20]:
  
pd.merge(b, c, how = "left", on = "이름")


Unnamed: 0,이름,나이,주소
0,철수,3,분당
1,영희,5,


In [21]:
pd.merge(b, c, how = "right")


Unnamed: 0,이름,나이,주소
0,철수,3.0,분당
1,길동,,일산


In [22]:
pd.merge(b, c, how = "outer")


Unnamed: 0,이름,나이,주소
0,길동,,일산
1,영희,5.0,
2,철수,3.0,분당


In [23]:
# concatenate
pd.concat([b, c], axis = 1)

Unnamed: 0,이름,나이,이름.1,주소
0,철수,3,철수,분당
1,영희,5,길동,일산
