# Pandas 离散化 和 元面划分

In [292]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [264]:
# 准备 年龄数据
ages = np.random.randint(18,100,20)

# 元面区间
bins = [18,25,35,60,80,100]

In [234]:
ages

array([54, 95, 38, 75, 23, 22, 99, 76, 33, 89, 22, 58, 37, 71, 51, 25, 84,
       91, 85, 32])

In [149]:
# 获得元面划分后的对象
cats = pd.cut(ages,bins)

In [150]:
cats

[(80, 100], (60, 80], (35, 60], (18, 25], (18, 25], ..., (18, 25], (60, 80], (35, 60], (80, 100], (80, 100]]
Length: 20
Categories (5, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 80] < (80, 100]]

In [151]:
# 数据的标签
cats.codes

array([4, 3, 2, 0, 0, 3, 2, 2, 3, 2, 2, 3, 4, 3, 4, 0, 3, 2, 4, 4],
      dtype=int8)

In [152]:
# 区间的信息
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 80], (80, 100]],
              closed='right',
              dtype='interval[int64]')

In [164]:
# 每个区间的数据量
pd.value_counts(cats)

(60, 80]     6
(35, 60]     6
(80, 100]    5
(18, 25]     3
(25, 35]     0
dtype: int64

In [153]:
x = pd.value_counts(cats)

In [163]:
type(cats)

pandas.core.arrays.categorical.Categorical

In [165]:
# 自定义区间名称
cats2 = pd.cut(ages,bins,labels=['young0','young1','middle','eldery0','eldery1'])

In [167]:
cats2.codes

array([4, 3, 2, 0, 0, 3, 2, 2, 3, 2, 2, 3, 4, 3, 4, 0, 3, 2, 4, 4],
      dtype=int8)

In [169]:
cats2.categories

Index(['young0', 'young1', 'middle', 'eldery0', 'eldery1'], dtype='object')

In [170]:
pd.value_counts(cats2)

eldery0    6
middle     6
eldery1    5
young0     3
young1     0
dtype: int64

In [247]:
# 指定 区间份数的 元面划分
cats3 = pd.cut(ages,4,precision=1)

In [248]:
pd.value_counts(cats3)

(21.9, 41.2]    8
(79.8, 99.0]    6
(60.5, 79.8]    3
(41.2, 60.5]    3
dtype: int64

In [269]:
# qcut()
cats4 = pd.qcut(ages,4)

In [270]:
pd.value_counts(cats4)

(20.999, 53.0]    6
(90.5, 99.0]      5
(79.5, 90.5]      5
(53.0, 79.5]      4
dtype: int64

In [271]:
cats4.codes

array([1, 2, 0, 2, 2, 0, 2, 0, 0, 2, 1, 1, 1, 0, 3, 0, 3, 3, 3, 3],
      dtype=int8)

In [291]:
cats4.categories

IntervalIndex([(20.999, 53.0], (53.0, 79.5], (79.5, 90.5], (90.5, 99.0]],
              closed='right',
              dtype='interval[float64]')

In [288]:
pd.value_counts(cats4)

(20.999, 53.0]    6
(90.5, 99.0]      5
(79.5, 90.5]      5
(53.0, 79.5]      4
dtype: int64

In [355]:
# 计算指标/哑变量
df = pd.DataFrame({'key':['b','b','a','c','a','d'],'data1':range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,d,5


In [356]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c,d
0,0,1,0,0
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0
4,1,0,0,0
5,0,0,0,1


In [357]:
# 分箱后 做独热编码 (直接将 分箱结果传入 dummies)
pd.get_dummies(cats)

Unnamed: 0,"(18, 25]","(25, 35]","(35, 60]","(60, 80]","(80, 100]"
0,0,0,0,0,1
1,0,0,0,1,0
2,0,0,1,0,0
3,1,0,0,0,0
4,1,0,0,0,0
5,0,0,0,1,0
6,0,0,1,0,0
7,0,0,1,0,0
8,0,0,0,1,0
9,0,0,1,0,0
