## 라이브러리 로드

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Show the installed pandas and seaborn versions, one per line.
print(pd.__version__, sns.__version__, sep="\n")

1.2.4
0.11.1


In [None]:
# seaborn changed a lot in version 0.11.0, so please use that version or newer.
# %pip install --upgrade "seaborn>=0.11.0"

## 데이터셋 불러오기
<img src="https://pandas.pydata.org/docs/_images/02_io_readwrite.svg">

In [4]:
# Load the "mpg" example dataset through seaborn and keep it as `df`.
# Tip: toggle a line comment in the editor with Cmd+/.
df = sns.load_dataset(name="mpg")

# (number of rows, number of columns)
df.shape

(398, 9)

In [5]:
# Show only the row index (the row labels).

df.index

RangeIndex(start=0, stop=398, step=1)

In [6]:
# Show only the column labels.
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

In [7]:
# Show only the underlying values as a NumPy array (no index or column labels).
# The columns have mixed types, so the resulting array has dtype=object.
df.values

array([[18.0, 8, 307.0, ..., 70, 'usa', 'chevrolet chevelle malibu'],
       [15.0, 8, 350.0, ..., 70, 'usa', 'buick skylark 320'],
       [18.0, 8, 318.0, ..., 70, 'usa', 'plymouth satellite'],
       ...,
       [32.0, 4, 135.0, ..., 82, 'usa', 'dodge rampage'],
       [28.0, 4, 120.0, ..., 82, 'usa', 'ford ranger'],
       [31.0, 4, 119.0, ..., 82, 'usa', 'chevy s-10']], dtype=object)

In [16]:
# Show only the data type of each column.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
name             object
dtype: object

## 데이터셋 일부만 가져오기

In [10]:
# Preview the first rows with head(); without an argument it shows 5 rows.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [14]:
# Preview the last rows with tail(); here only the final 2 rows.
df.tail(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


In [13]:
# Preview 3 randomly chosen rows with sample().
# NOTE(review): no random_state is passed, so the selected rows can change on
# every run; pass e.g. random_state=42 if the output must be reproducible.
df.sample(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
392,27.0,4,151.0,90.0,2950,17.3,82,usa,chevrolet camaro
92,13.0,8,351.0,158.0,4363,13.0,73,usa,ford ltd
176,19.0,6,232.0,90.0,3211,17.0,75,usa,amc pacer


## 요약하기

In [15]:
# Print a summary with info(): index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


## 결측치 확인

In [17]:
# 결측치 수 확인
df.isnull()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
393,False,False,False,False,False,False,False,False,False
394,False,False,False,False,False,False,False,False,False
395,False,False,False,False,False,False,False,False,False
396,False,False,False,False,False,False,False,False,False


In [18]:
# Percentage of missing values per column: the mean of the boolean
# missing-value mask is the missing fraction, scaled here to percent.
100 * df.isna().mean()

mpg             0.000000
cylinders       0.000000
displacement    0.000000
horsepower      1.507538
weight          0.000000
acceleration    0.000000
model_year      0.000000
origin          0.000000
name            0.000000
dtype: float64

## 기술통계

In [20]:
# Show descriptive statistics of the numeric columns with describe().
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [22]:
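# count: number of non-missing values
# mean: arithmetic mean
# std: standard deviation
# min: minimum value
# 25%: value at the 25% position (first quartile)
# 50%: median
# 75%: value at the 75% position (third quartile)
# max: maximum value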

In [23]:
# Descriptive statistics for the object (string) columns instead of the numeric
# ones: count, number of unique values, most frequent value and its frequency.
df.describe(include="object")

Unnamed: 0,origin,name
count,398,398
unique,3,305
top,usa,ford pinto
freq,249,6


## Series

In [24]:
# Selecting one column with a single label returns a Series.
df["name"]

0      chevrolet chevelle malibu
1              buick skylark 320
2             plymouth satellite
3                  amc rebel sst
4                    ford torino
                 ...            
393              ford mustang gl
394                    vw pickup
395                dodge rampage
396                  ford ranger
397                   chevy s-10
Name: name, Length: 398, dtype: object

In [25]:
# Confirm the type: a single-label selection is a pandas Series.
type(df["name"])

pandas.core.series.Series

In [28]:
# Passing a list of labels (double brackets) returns a DataFrame,
# even when the list holds only one column.
type(df[["name"]])

pandas.core.frame.DataFrame

## DataFrame

In [26]:
# Selecting several columns with a list of labels returns a DataFrame.
df[["mpg","name"]]

Unnamed: 0,mpg,name
0,18.0,chevrolet chevelle malibu
1,15.0,buick skylark 320
2,18.0,plymouth satellite
3,16.0,amc rebel sst
4,17.0,ford torino
...,...,...
393,27.0,ford mustang gl
394,44.0,vw pickup
395,32.0,dodge rampage
396,28.0,ford ranger


In [27]:
# Confirm the type: a list-of-labels selection is a pandas DataFrame.
type(df[["mpg","name"]])

pandas.core.frame.DataFrame

## loc

In [30]:
# Fetch a single row by its index label with loc; the row comes back as a Series.
df.loc[0]

mpg                                  18.0
cylinders                               8
displacement                        307.0
horsepower                          130.0
weight                               3504
acceleration                         12.0
model_year                             70
origin                                usa
name            chevrolet chevelle malibu
Name: 0, dtype: object

In [32]:
# Fetch two rows with loc by passing a list of index labels; the result is a DataFrame.
df.loc[[0,1]]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320


In [33]:
# Fetch a single value with loc[row label, column label].
df.loc[0,"name"]

'chevrolet chevelle malibu'

In [35]:
# Fetch several rows and columns at once with
# loc[list of row labels, list of column labels].
df.loc[[0,1,2],["mpg","name"]]

Unnamed: 0,mpg,name
0,18.0,chevrolet chevelle malibu
1,15.0,buick skylark 320
2,18.0,plymouth satellite


In [37]:
# loc[row label, column label]        -> selects by label
# iloc[row position, column position] -> selects by integer position
# Take the first three rows and the first three columns by position.
df.iloc[0:3, 0:3]

Unnamed: 0,mpg,cylinders,displacement
0,18.0,8,307.0
1,15.0,8,350.0
2,18.0,8,318.0


In [38]:
# Slice rows and columns by label with loc.
# Unlike ordinary Python slices, a loc label slice includes its end label,
# which is why the "weight" column is part of the result.
df.loc[:10,"mpg":"weight"]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight
0,18.0,8,307.0,130.0,3504
1,15.0,8,350.0,165.0,3693
2,18.0,8,318.0,150.0,3436
3,16.0,8,304.0,150.0,3433
4,17.0,8,302.0,140.0,3449
5,15.0,8,429.0,198.0,4341
6,14.0,8,454.0,220.0,4354
7,14.0,8,440.0,215.0,4312
8,14.0,8,455.0,225.0,4425
9,15.0,8,390.0,190.0,3850
