## 6.15. 카테고리(범주형) 데이터 다루기

In [1]:
import pandas as pd
import numpy as np

# Load the activity log from CSV and preview the first rows
# (the preview below shows timestamp, X, Y and Z columns).
df = pd.read_csv('OTg6QzA_activities.csv')
df.head()

Unnamed: 0,timestamp,X,Y,Z
0,2022-01-01 00:01:40.363,33,76,56
1,2022-01-01 00:01:42.961,34,87,56
2,2022-01-01 00:01:45.562,25,89,47
3,2022-01-01 00:01:48.163,11,70,50
4,2022-01-01 00:02:08.864,33,72,58


In [2]:
# Assign a quadrant label, treating (x, y) = (0, 200) as the origin
def make_quadrant(df):
  """Return the quadrant label ("1st".."4th" or "others") for one record.

  `df` is a single row-like mapping exposing 'X' and 'Y' (the notebook feeds
  it DataFrame rows through `apply(..., axis=1)`).  Points lying on a
  dividing line belong to the non-negative side (X >= 0, Y >= 200).
  A value for which neither comparison holds (e.g. NaN) yields "others".
  """
  # Decide the left/right half first; this fixes the two candidate labels.
  x = df['X']
  if x >= 0:
    upper, lower = "1st", "4th"
  elif x < 0:
    upper, lower = "2nd", "3rd"
  else:
    return "others"

  # 'Y' is only read once X has been classified, as in a short-circuit `and`.
  y = df['Y']
  if y >= 200:
    return upper
  if y < 200:
    return lower
  return "others"

# Label every row: axis=1 makes apply() call make_quadrant once per row,
# and the returned strings are stored in a new 'quadrant' column.
df['quadrant'] = df.apply(make_quadrant, axis=1)
df

Unnamed: 0,timestamp,X,Y,Z,quadrant
0,2022-01-01 00:01:40.363,33,76,56,4th
1,2022-01-01 00:01:42.961,34,87,56,4th
2,2022-01-01 00:01:45.562,25,89,47,4th
3,2022-01-01 00:01:48.163,11,70,50,4th
4,2022-01-01 00:02:08.864,33,72,58,4th
...,...,...,...,...,...
66276,2022-01-31 23:28:00.481,-91,143,136,3rd
66277,2022-01-31 23:28:41.683,-89,145,138,3rd
66278,2022-01-31 23:28:49.383,-93,138,137,3rd
66279,2022-01-31 23:32:36.426,-93,131,137,3rd


In [7]:
# Keep a second copy of the labels converted to pandas' 'category' dtype
# so it can be compared with the plain string column below.
df['quadrant_cat'] = df['quadrant'].astype('category')


In [8]:
df.dtypes


timestamp         object
X                  int64
Y                  int64
Z                  int64
quadrant          object
quadrant_cat    category
dtype: object

In [10]:
# NOTE: the result equals 8 bytes per row (66281 rows * 8 = 530248), i.e. for
# this object-dtype column nbytes appears to count only the per-row object
# references, not the string contents themselves -- presumably
# memory_usage(deep=True) is needed for the full footprint; verify.
df['quadrant'].nbytes

530248

In [11]:
# The categorical column stores one int8 code per row plus the category index:
# 66281 * 1 byte + 4 categories * 8 bytes = 66313.
df['quadrant_cat'].nbytes

66313

In [12]:
df['quadrant_cat'].nbytes/df['quadrant'].nbytes

0.12506034911965722

In [13]:
# Integer code stored for each row; a code is the position of the row's label
# in .cat.categories (e.g. code 3 corresponds to '4th' in the output below).
df['quadrant_cat'].cat.codes

0        3
1        3
2        3
3        3
4        3
        ..
66276    2
66277    2
66278    2
66279    2
66280    2
Length: 66281, dtype: int8

In [14]:
df['quadrant_cat'].cat.categories

Index(['1st', '2nd', '3rd', '4th'], dtype='object')

In [15]:
df['quadrant_cat'].cat.codes.unique()

array([3, 2, 0, 1], dtype=int8)

In [16]:
from enum import Enum
class Quadrant(Enum):
  """Quadrant labels that can be ordered by their numeric value.

  All four rich comparisons are provided, so `<`, `<=`, `>` and `>=` behave
  consistently between members.  Comparing with anything that is not a
  `Quadrant` returns NotImplemented, which lets Python raise TypeError
  instead of silently ordering unrelated objects.
  """
  Q1st = 1
  Q2dn = 2  # original (misspelled) name; kept canonical because existing code refers to it
  Q2nd = 2  # correctly spelled alias of Q2dn; aliases do not show up when iterating
  Q3rd = 3
  Q4th = 4
  Others = 5

  def __lt__(self, other):
    """Return True when this member's value is smaller than `other`'s."""
    if self.__class__ is other.__class__:
      return self.value < other.value
    return NotImplemented

  def __le__(self, other):
    """Return True when this member's value is smaller than or equal to `other`'s."""
    if self.__class__ is other.__class__:
      return self.value <= other.value
    return NotImplemented

  def __gt__(self, other):
    """Return True when this member's value is larger than `other`'s."""
    if self.__class__ is other.__class__:
      return self.value > other.value
    return NotImplemented

  def __ge__(self, other):
    """Return True when this member's value is larger than or equal to `other`'s."""
    if self.__class__ is other.__class__:
      return self.value >= other.value
    return NotImplemented

# Show every member as "name : value", one per line
for q in Quadrant:
  print(q.name, str(q.value), sep=" : ")

Q1st : 1
Q2dn : 2
Q3rd : 3
Q4th : 4
Others : 5


In [21]:
# Assign a quadrant, treating (x, y) = (0, 200) as the origin
def make_quadrant_with_enum(df):
  """Return the `Quadrant` member for one record.

  `df` is a single row-like mapping exposing 'X' and 'Y'.  Points lying on a
  dividing line belong to the non-negative side (X >= 0, Y >= 200).
  A value for which neither comparison holds (e.g. NaN) yields
  `Quadrant.Others`.
  """
  # Decide the left/right half first; this fixes the two candidate members.
  x = df['X']
  if x >= 0:
    upper, lower = Quadrant.Q1st, Quadrant.Q4th
  elif x < 0:
    upper, lower = Quadrant.Q2dn, Quadrant.Q3rd
  else:
    return Quadrant.Others

  # 'Y' is only read once X has been classified, as in a short-circuit `and`.
  y = df['Y']
  if y >= 200:
    return upper
  if y < 200:
    return lower
  return Quadrant.Others

# Label every row with a Quadrant member (axis=1 passes one row per call)
# and store the results in a new 'quadrant_enum' column.
df['quadrant_enum'] = df.apply(make_quadrant_with_enum, axis=1)
df

Unnamed: 0,timestamp,X,Y,Z,quadrant,quadrant_cat,quadrant_enum
0,2022-01-01 00:01:40.363,33,76,56,4th,4th,Quadrant.Q4th
1,2022-01-01 00:01:42.961,34,87,56,4th,4th,Quadrant.Q4th
2,2022-01-01 00:01:45.562,25,89,47,4th,4th,Quadrant.Q4th
3,2022-01-01 00:01:48.163,11,70,50,4th,4th,Quadrant.Q4th
4,2022-01-01 00:02:08.864,33,72,58,4th,4th,Quadrant.Q4th
...,...,...,...,...,...,...,...
66276,2022-01-31 23:28:00.481,-91,143,136,3rd,3rd,Quadrant.Q3rd
66277,2022-01-31 23:28:41.683,-89,145,138,3rd,3rd,Quadrant.Q3rd
66278,2022-01-31 23:28:49.383,-93,138,137,3rd,3rd,Quadrant.Q3rd
66279,2022-01-31 23:32:36.426,-93,131,137,3rd,3rd,Quadrant.Q3rd


In [22]:
df.dtypes

timestamp          object
X                   int64
Y                   int64
Z                   int64
quadrant           object
quadrant_cat     category
quadrant_enum      object
dtype: object

In [23]:
# The enum column has dtype object (see df.dtypes above), and it reports the
# same nbytes as the plain string column: 8 bytes per row.
df['quadrant_enum'].nbytes

530248

In [24]:
df['quadrant_enum'].iloc[0].name

'Q4th'

In [25]:
df['quadrant_enum'].iloc[0].value

4

In [26]:
df.dtypes

timestamp          object
X                   int64
Y                   int64
Z                   int64
quadrant           object
quadrant_cat     category
quadrant_enum      object
dtype: object

In [27]:
%%timeit
# Baseline timing: group by the plain string labels with count/mean/median.
df.groupby('quadrant').agg(['count', 'mean', 'median'])



44.3 ms ± 3.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)




In [28]:
%%timeit
# Same aggregation, grouping by the categorical column instead.
df.groupby('quadrant_cat').agg(['count', 'mean', 'median'])




47.7 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)




In [29]:
%%timeit
# Same aggregation, grouping by the column of Quadrant enum members.
df.groupby('quadrant_enum').agg(['count', 'mean', 'median'])



62.5 ms ± 1.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
