## 1.2 データの種類とpandasのデータ型

In [1]:
import pandas as pd
import numpy as np 

# Row count used to size the generated numeric columns; the hand-written
# label and date columns below also contain five entries each.
N = 5

# Sample frame mixing several kinds of input so the dtype pandas picks for
# each column can be compared: Python ints and floats, NumPy arrays of two
# float widths, and plain strings holding labels and ISO-style dates.
df = pd.DataFrame({
    'list_int': [*range(N)],
    'list_float': [0.1 * i for i in range(N)],
    'np_float16': np.linspace(0.0, 1.0, N, dtype=np.float16),
    'np_float32': np.linspace(0.0, 1.0, N, dtype=np.float32),
    'category_animal': ('cat', 'dog', 'dog', 'cat', 'cat'),
    'category_size': ['MIDDLE', 'LARGE', 'SMALL', 'EXTRA-SMALL', 'MIDDLE'],
    'date': [
        '1981-03-05', '1993-04-10', '2005-07-15', '2017-10-20', '2029-12-25',
    ],
})

df

Unnamed: 0,list_int,list_float,np_float16,np_float32,category_animal,category_size,date
0,0,0.0,0.0,0.0,cat,MIDDLE,1981-03-05
1,1,0.1,0.25,0.25,dog,LARGE,1993-04-10
2,2,0.2,0.5,0.5,dog,SMALL,2005-07-15
3,3,0.3,0.75,0.75,cat,EXTRA-SMALL,2017-10-20
4,4,0.4,1.0,1.0,cat,MIDDLE,2029-12-25


In [2]:
# Display the dtype of every column in the frame.
df.dtypes

list_int             int64
list_float         float64
np_float16         float16
np_float32         float32
category_animal     object
category_size       object
date                object
dtype: object

In [3]:
# Cast two numeric columns to other widths. The result is bound to a separate
# name, df_change, rather than replacing df.
df_change = df.astype({'list_int': 'int32', 'np_float16': 'float64'})

df_change.dtypes

list_int             int32
list_float         float64
np_float16         float64
np_float32         float32
category_animal     object
category_size       object
date                object
dtype: object

In [4]:
# Convert both label columns of df_change to the 'string' dtype; each listed
# column name is mapped to the same target dtype.
df_change = df_change.astype(
    dict.fromkeys(['category_animal', 'category_size'], 'string')
)

df_change.dtypes

list_int             int32
list_float         float64
np_float16         float64
np_float32         float32
category_animal     string
category_size       string
date                object
dtype: object

In [5]:
# Convert both label columns of df to the 'category' dtype; each listed
# column name is mapped to the same target dtype.
df = df.astype(
    dict.fromkeys(['category_animal', 'category_size'], 'category')
)

df.dtypes

list_int              int64
list_float          float64
np_float16          float16
np_float32          float32
category_animal    category
category_size      category
date                 object
dtype: object

In [6]:
# Set the category list for the animal labels explicitly and declare it
# unordered (no ranking between 'cat' and 'dog').
df['category_animal'] = (
    df['category_animal']
    .cat
    .set_categories(new_categories=['cat', 'dog'], ordered=False)
)

# Display the underlying values together with their category list.
df['category_animal'].values

['cat', 'dog', 'dog', 'cat', 'cat']
Categories (2, object): ['cat', 'dog']

In [7]:
# Show which class backs the values of the categorical column.
type(df['category_animal'].values)

pandas.core.arrays.categorical.Categorical

In [8]:
# Set the size categories from smallest to largest and declare them ordered.
# 'EXTRA-LARGE' is part of the category list even though no row in the
# sample data uses it.
df['category_size'] = (
    df['category_size']
    .cat
    .set_categories(
        new_categories=['EXTRA-SMALL', 'SMALL', 'MIDDLE', 'LARGE', 'EXTRA-LARGE'],
        ordered=True,
    )
)

# Display the underlying values together with their ordered category list.
df['category_size'].values

['MIDDLE', 'LARGE', 'SMALL', 'EXTRA-SMALL', 'MIDDLE']
Categories (5, object): ['EXTRA-SMALL' < 'SMALL' < 'MIDDLE' < 'LARGE' < 'EXTRA-LARGE']

In [9]:
# Integer codes behind the categorical values: each code is the position of
# the row's label within the category list.
df['category_size'].values.codes

array([2, 3, 1, 0, 2], dtype=int8)

In [10]:
# Parse the date strings into datetime values, replacing the 'date' column.
df = df.assign(date=pd.to_datetime(df['date']))

df.dtypes

list_int                    int64
list_float                float64
np_float16                float16
np_float32                float32
category_animal          category
category_size            category
date               datetime64[ns]
dtype: object

In [11]:
# Category lists, declared up front and passed to pd.Categorical below.
categories_animal = ['cat', 'dog']
categories_size = ['EXTRA-SMALL', 'SMALL', 'MIDDLE', 'LARGE', 'EXTRA-LARGE']

# Build the frame with categorical and datetime columns from the start,
# instead of creating plain columns first and converting them afterwards.
df = pd.DataFrame({
    # Unordered labels: no ranking between the animals.
    'category_animal': pd.Categorical(
        ('cat', 'dog', 'dog', 'cat', 'cat'),
        categories=categories_animal,
        ordered=False,
    ),
    # Ordered labels: ranking follows the order of categories_size.
    'category_size': pd.Categorical(
        ['MIDDLE', 'LARGE', 'SMALL', 'EXTRA-SMALL', 'MIDDLE'],
        categories=categories_size,
        ordered=True,
    ),
    'date': pd.to_datetime([
        '1981-03-05', '1993-04-10', '2005-07-15', '2017-10-20', '2029-12-25',
    ]),
})

df.dtypes

category_animal          category
category_size            category
date               datetime64[ns]
dtype: object