In [1]:
import pandas as pd
from sklearn.cluster import  KMeans
from sklearn import preprocessing


# Load the raw records from a tab-separated text file (read_table's default
# separator). Column names are supplied via `names`, so the first line of the
# file is read as data rather than as a header.
df = pd.read_table('data7.txt', names=['id', 'amount', 'income', 'datetime', 'age'])
print(df.head(5))

# This import was indented in the original cell, which raises
# "IndentationError: unexpected indent" at top level. It is kept (dedented)
# in case other cells rely on it; none of the cells shown here use `sys`.
import sys


      id  amount  income             datetime    age
0  15093    1390   10.40  2017-04-30 19:24:13   0-10
1  15062    4024    4.68  2017-04-27 22:44:59  70-80
2  15028    6359    3.84  2017-04-27 10:07:55  40-50
3  15012    7759    3.70  2017-04-04 07:28:18  30-40
4  15021     331    4.25  2017-04-08 11:14:00  70-80


In [2]:
# Discretize the timestamp column: parse it, then keep only the day of week
# as an integer (Monday=0 ... Sunday=6).
# The whole column is parsed in one vectorized call and the weekday is read
# through the .dt accessor, instead of converting element by element.
# NOTE: 'datetime' is overwritten in place, so this cell is not idempotent:
# re-running it without reloading the data would re-parse the weekday
# integers as timestamps and produce wrong values.
df['datetime'] = pd.to_datetime(df['datetime']).dt.weekday
print(df.head())

      id  amount  income  datetime    age
0  15093    1390   10.40         6   0-10
1  15062    4024    4.68         3  70-80
2  15028    6359    3.84         3  40-50
3  15012    7759    3.70         1  30-40
4  15021     331    4.25         5  70-80


In [3]:
# Discretize a continuous column using hand-picked bin edges.
bins = [0, 200, 1000, 5000, 10000]
# right=True is pd.cut's default and is spelled out here because it decides
# which edge is inclusive: the bins are right-closed, i.e. (0, 200],
# (200, 1000], (1000, 5000], (5000, 10000]. Amounts outside (0, 10000]
# (including exactly 0) get NaN.
df['amount1'] = pd.cut(df['amount'], bins=bins, right=True)
print(df.head())


      id  amount  income  datetime    age        amount1
0  15093    1390   10.40         6   0-10   (1000, 5000]
1  15062    4024    4.68         3  70-80   (1000, 5000]
2  15028    6359    3.84         3  40-50  (5000, 10000]
3  15012    7759    3.70         1  30-40  (5000, 10000]
4  15021     331    4.25         5  70-80    (200, 1000]


In [7]:
# Discretize a continuous column by clustering: every amount is replaced by
# the id of the KMeans cluster it is assigned to.
data = df['amount']
# fit_predict expects a 2-D (n_samples, n_features) array; -1 lets numpy
# infer the number of rows, giving a single-feature column.
data_reshape = data.values.reshape(-1, 1)
# Four clusters with a fixed seed for reproducibility. n_init is pinned to 10
# (the historical default) because newer scikit-learn releases changed the
# implicit default to 'auto' and warn when it is left unspecified; pinning it
# keeps the result independent of the installed version.
model_kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
# Fit the model and get one cluster id per row.
keames_result = model_kmeans.fit_predict(data_reshape)
# Attach the cluster ids to the frame. The ids are arbitrary labels, not
# ordered by amount (see the printed sample: 4024 -> 2 but 6359 -> 1), so
# they must be treated as nominal categories.
df['amount2'] = keames_result
print(df.head())


      id  amount  income  datetime    age        amount1  amount2
0  15093    1390   10.40         6   0-10   (1000, 5000]        0
1  15062    4024    4.68         3  70-80   (1000, 5000]        2
2  15028    6359    3.84         3  40-50  (5000, 10000]        1
3  15012    7759    3.70         1  30-40  (5000, 10000]        1
4  15021     331    4.25         5  70-80    (200, 1000]        0


In [8]:
# Discretize a continuous column by quantiles: four equal-frequency bins
# (quartiles), labelled from the lowest amounts ('bad') to the highest
# ('awesome').
df['amount3'] = pd.qcut(df['amount'], 4, labels=['bad', 'medium', 'good', 'awesome'])
# Drop the raw column now that all three discretized versions exist. The axis
# is passed by keyword: the positional form drop('amount', 1) was deprecated
# and raises TypeError on pandas >= 2.0.
# NOTE: this cell is not idempotent; a second run fails because 'amount' is
# already gone.
df = df.drop('amount', axis=1)
print(df.head())


      id  income  datetime    age        amount1  amount2  amount3
0  15093   10.40         6   0-10   (1000, 5000]        0      bad
1  15062    4.68         3  70-80   (1000, 5000]        2     good
2  15028    3.84         3  40-50  (5000, 10000]        1  awesome
3  15012    3.70         1  30-40  (5000, 10000]        1  awesome
4  15021    4.25         5  70-80    (200, 1000]        0      bad


In [9]:
# Binarize a continuous column: 1.0 where income is strictly above the column
# mean, 0.0 otherwise.
binarizer_scaler = preprocessing.Binarizer(threshold=df['income'].mean())
# The transformer needs 2-D input, hence the double brackets; the result has
# shape (n_rows, 1). ravel() flattens it to 1-D so it can be assigned back as
# a column. This replaces the original in-place ndarray.resize(), which
# raises ValueError when the array is referenced elsewhere and would silently
# truncate or zero-pad on a size mismatch.
income_tmp = binarizer_scaler.fit_transform(df[['income']]).ravel()
# NOTE: 'income' is overwritten in place; re-running this cell would
# binarize the already-binarized 0/1 values against their own mean.
df['income'] = income_tmp
print(df.head())

      id  income  datetime    age        amount1  amount2  amount3
0  15093     1.0         6   0-10   (1000, 5000]        0      bad
1  15062     1.0         3  70-80   (1000, 5000]        2     good
2  15028     0.0         3  40-50  (5000, 10000]        1  awesome
3  15012     0.0         1  30-40  (5000, 10000]        1  awesome
4  15021     1.0         5  70-80    (200, 1000]        0      bad
