# 8-1 자료형 다루기

In [1]:
import pandas as pd
import seaborn as sns

tips = sns.load_dataset('tips')

In [2]:
#여러가지 자료형을 문자열로 변환하기
#판다스에서 문자열은 object
tips['sex_str'] = tips['sex'].astype(str)
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [3]:
#자료형 변환된 데이터를 다시 원래대로 만들기
tips['total_bill'] = tips['total_bill'].astype(str)
tips.dtypes

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [4]:
tips['total_bill'] = tips['total_bill'].astype(float)
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [8]:
# 잘못 입력한 데이터 처리하기
tips_sub_miss = tips.head(10)
tips_sub_miss.loc[[1,3,5,7], 'total_bill'] = 'missing'
tips_sub_miss

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_str
0,16.99,1.01,Female,No,Sun,Dinner,2,Female
1,missing,1.66,Male,No,Sun,Dinner,3,Male
2,21.01,3.5,Male,No,Sun,Dinner,3,Male
3,missing,3.31,Male,No,Sun,Dinner,2,Male
4,24.59,3.61,Female,No,Sun,Dinner,4,Female
5,missing,4.71,Male,No,Sun,Dinner,4,Male
6,8.77,2.0,Male,No,Sun,Dinner,2,Male
7,missing,3.12,Male,No,Sun,Dinner,4,Male
8,15.04,1.96,Male,No,Sun,Dinner,2,Male
9,14.78,3.23,Male,No,Sun,Dinner,2,Male


In [9]:
tips_sub_miss.dtypes    

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [10]:
#문자열을 실수로 변환하면 오류가 발생 
tips_sub_miss['total_bill'].astype(float)

ValueError: could not convert string to float: 'missing'

In [11]:
# 문자열을 실수로 바꿀때는 to_numeric 메서드를 사용 
# 인자 errors='ignore' 는 오류를 무시한다는 것
tips_sub_miss['total_bill']=pd.to_numeric(tips_sub_miss['total_bill'], errors='ignore')
tips_sub_miss.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [12]:
# 문자열을 실수로 바꿀때는 to_numeric 메서드를 사용 
# 인자 errors='coerce' 는 missing 이 누락값으로 변경
tips_sub_miss['total_bill']=pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')
tips_sub_miss.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

In [13]:
# 문자열을 실수로 바꿀때는 to_numeric 메서드를 사용 
# 인자 downcast='float' 는 정수, 실수와 같은 자료형을 더 작은 형태로 만드는 것으로 
#더 많은 실수의 범위를 표현 할 수 있는 것 만큼 저장공간도 2배로 든다. 
tips_sub_miss['total_bill']=pd.to_numeric(tips_sub_miss['total_bill'], downcast='float')
tips_sub_miss.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


total_bill     float32
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

# 8-2 카테고리 자료형

In [14]:
#카테고리 자료형의 장점과 특징
#용량과 속도면에서 매우 효율적이다. 
#주로 동일한 문자열이 반복되어 데이터를 구성하는 경우에 사용합니다. 
tips['sex']=tips['sex'].astype('str')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null object
smoker        244 non-null category
day           244 non-null category
time          244 non-null category
size          244 non-null int64
sex_str       244 non-null object
dtypes: category(3), float64(2), int64(1), object(2)
memory usage: 10.7+ KB


In [15]:
#카테고리형으로 바꾸기
tips['sex']=tips['sex'].astype('category')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null category
time          244 non-null category
size          244 non-null int64
sex_str       244 non-null object
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 9.1+ KB
