# Ch-08. 판다스 자료형
## 08-1. 자료형 다루기

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

In [3]:
# Move into the "data" sub-directory of the current working directory.
# os.path.join is used instead of the '\data' literal: '\d' is an invalid
# escape sequence in a normal string and a backslash separator only works
# on Windows.
path = os.path.join(os.getcwd(), 'data')
os.chdir(path)
os.getcwd()

'C:\\Users\\James\\Documents\\GitHub\\doit_pandas\\data'

### - (p.175) 자료형 변환: astype()

In [5]:
# Load seaborn's 'tips' dataset; later cells take copies of tips_raw
# rather than modifying it.
tips_raw = sns.load_dataset('tips')

In [32]:
# Work on a copy so the conversions below leave tips_raw unchanged.
tips = tips_raw.copy()
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [33]:
# (number of rows, number of columns)
tips.shape

(244, 7)

In [34]:
# Per-column non-null counts and dtypes, plus total memory usage.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [35]:
# The same summary for a single column (a Series).
tips['sex'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 244 entries, 0 to 243
Series name: sex
Non-Null Count  Dtype   
--------------  -----   
244 non-null    category
dtypes: category(1)
memory usage: 496.0 bytes


In [39]:
# dtype is the type of the stored data values themselves
# (e.g.) int, str (object), category, etc.
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [40]:
# On a single column this returns that column's dtype.
tips['sex'].dtypes

CategoricalDtype(categories=['Male', 'Female'], ordered=False)

In [41]:
# type() is the type of the container (format) that holds the data
# (e.g.) DataFrame, Series, list, dict, etc.
type(tips)

pandas.core.frame.DataFrame

In [42]:
# Selecting one column yields a Series.
type(tips['sex'])

pandas.core.series.Series

#### * to str(object)

In [43]:
# Convert the category column to strings and store it as a new column.
tips['sex_str'] = tips['sex'].astype(str)

In [44]:
# The new sex_str column is reported with dtype object.
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [46]:
# dtype('O') is the object dtype.
tips['sex_str'].dtypes

dtype('O')

#### * to float

In [47]:
# First turn the float column into strings so there is something to convert back.
tips['total_bill'] = tips['total_bill'].astype(str)

In [47]:
# total_bill is now object.
tips.dtypes

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [50]:
# Convert the string column back to float.
tips['total_bill'] = tips['total_bill'].astype(float)

In [50]:
# total_bill is float64 again.
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

### - (p.177) 잘못 입력한 문자열 처리 : to_numeric()

#### (1) float 열에 문자 데이터 잘못 입력

In [54]:
# Fresh copy of the raw data into which bad values are injected below.
tips_sub_miss = tips_raw.copy()

In [56]:
# Simulate bad input: overwrite four numeric cells (rows 1, 3, 5, 7) with
# the string 'missing'.
# The column is cast to object first because writing a string into a
# float64 column relies on implicit upcasting, which recent pandas versions
# warn about and newer ones reject. The resulting column is the same:
# dtype object holding floats plus the 'missing' strings.
tips_sub_miss['total_bill'] = tips_sub_miss['total_bill'].astype(object)
tips_sub_miss.loc[[1, 3, 5, 7], 'total_bill'] = 'missing'

In [58]:
# Show the first ten rows, which include the four overwritten cells.
tips_sub_miss.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,missing,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,missing,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,missing,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,missing,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


#### (2) 열의 자료형이 object 로 바뀐 것 확인

In [60]:
# total_bill is now object because the column mixes floats and strings.
tips_sub_miss.dtypes

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

#### (3) astype() 메서드 사용하면 오류 발생

In [None]:
# Error: the string 'missing' cannot be converted to float
tips_sub_miss['total_bill'].astype(float)

#### (4) pd.to_numeric() 함수도 그냥 사용하면 오류 발생

In [None]:
# Error: without an errors argument, to_numeric also fails on 'missing'
pd.to_numeric(tips_sub_miss['total_bill'])

#### (5) errors 인자 설정

##### * (default) raise: 숫자로 변환할 수 없는 값이 있으면 오류 발생

In [None]:
# Error (the default value of the errors argument is 'raise')
pd.to_numeric(tips_sub_miss['total_bill'])

##### * ignore: 아무 작업도 하지 않음

In [64]:
# errors='ignore' returns the input unchanged when it cannot be converted.
# NOTE(review): errors='ignore' is deprecated since pandas 2.2 (the docs
# recommend catching the exception explicitly) — confirm against the
# installed pandas version before reusing this pattern.
tips_sub_miss['total_miss'] = pd.to_numeric(tips_sub_miss['total_bill'], errors = 'ignore')

In [66]:
# No error was raised, but the dtype did not change either
tips_sub_miss.dtypes

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
total_miss      object
dtype: object

##### * coerce: 숫자로 변환할 수 없는 값을 누락값으로 지정

In [68]:
# errors='coerce' replaces values that cannot be parsed with NaN instead of raising.
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors = 'coerce')

In [70]:
# The dtype changed from object to float
tips_sub_miss.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
total_miss      object
dtype: object

In [72]:
# The 'missing' values have been replaced with NaN
tips_sub_miss.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_miss
0,16.99,1.01,Female,No,Sun,Dinner,2,16.99
1,,1.66,Male,No,Sun,Dinner,3,missing
2,21.01,3.5,Male,No,Sun,Dinner,3,21.01
3,,3.31,Male,No,Sun,Dinner,2,missing
4,24.59,3.61,Female,No,Sun,Dinner,4,24.59
5,,4.71,Male,No,Sun,Dinner,4,missing
6,8.77,2.0,Male,No,Sun,Dinner,2,8.77
7,,3.12,Male,No,Sun,Dinner,4,missing
8,15.04,1.96,Male,No,Sun,Dinner,2,15.04
9,14.78,3.23,Male,No,Sun,Dinner,2,14.78


In [77]:
# Count missing values per column; total_bill holds the four coerced entries.
tips_sub_miss.isnull().sum()

total_bill    4
tip           0
sex           0
smoker        0
day           0
time          0
size          0
total_miss    0
dtype: int64

#### (6) downcast 인자: 정수/실수 -> 더 작은 형태로

In [82]:
# downcast options: 'integer' (alias 'signed'), 'unsigned', 'float'

In [80]:
# Downcast float64 -> float32,
# which halves the memory used by this column
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], 
                                            errors = 'coerce', 
                                            downcast = 'float')

In [79]:
# total_bill is now float32.
tips_sub_miss.dtypes

total_bill     float32
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
total_miss      object
dtype: object

## 08-2. 카테고리 자료형

### - (p.182) 문자열 vs. 카테고리

In [93]:
# Category dtype
# (1) Advantage: more efficient in both memory footprint and speed
# (2) Typical use: columns made up of the same strings repeated many times

In [91]:
# Start again from an unmodified copy of the raw data.
tips = tips_raw.copy()

In [94]:
# Baseline: sex is stored as category.
tips.info()
# memory usage: 7.4 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [95]:
# Convert category -> string
tips['sex'] = tips['sex'].astype('str')

In [96]:
# With sex stored as object, the reported memory usage goes up.
tips.info()
# memory usage: 8.9+ KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    object  
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(3), float64(2), int64(1), object(1)
memory usage: 8.9+ KB


In [97]:
# Convert the string column back to category.
tips['sex'] = tips['sex'].astype('category')

In [99]:
# Back to category: memory usage returns to the baseline figure.
tips.info()
# memory usage: 7.4 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


## - End.