In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [4]:
#针对时间数据的离散化
df = pd.read_table('data7.txt',names=['id','amount','income','datetime','age'])
print(df.head(5))

      id  amount  income             datetime    age
0  15093    1390   10.40  2017-04-30 19:24:13   0-10
1  15062    4024    4.68  2017-04-27 22:44:59  70-80
2  15028    6359    3.84  2017-04-27 10:07:55  40-50
3  15012    7759    3.70  2017-04-04 07:28:18  30-40
4  15021     331    4.25  2017-04-08 11:14:00  70-80


In [6]:
for i, single_data in enumerate(df['datetime']):
    single_date_tmp = pd.to_datetime(single_data)
    df['datetime'][i] = single_date_tmp.weekday()
print(df.head(5))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


      id  amount  income datetime    age
0  15093    1390   10.40        3   0-10
1  15062    4024    4.68        3  70-80
2  15028    6359    3.84        3  40-50
3  15012    7759    3.70        3  30-40
4  15021     331    4.25        3  70-80


In [7]:
#针对多指数据的离散化
map_df = pd.DataFrame([['0-10', '0-40'], ['10-20', '0-40'], ['20-30', '0-40'], ['30-40', '0-40'], ['40-50', '40-80'],
                       ['50-60', '40-80'], ['60-70', '40-80'], ['70-80', '40-80'], ['80-90', '>80'], ['>90', '>80']],
                      columns=['age', 'age2']) 
df_tmp = df.merge(map_df, left_on='age', right_on='age', how='inner')
df = df_tmp.drop('age',1)
print(df.head(5))

      id  amount  income datetime  age2
0  15093    1390   10.40        3  0-40
1  15064    7952    4.40        3  0-40
2  15080     503    5.72        3  0-40
3  15068    1668    3.19        3  0-40
4  15019    6710    3.20        3  0-40


In [5]:
#针对连续数据的离散化
#方法1：自定义分箱区间实现离散化
bins = [0,200,1000,5000,10000]
df['amount1'] = pd.cut(df['amount'],bins)
print(df.head(5))

      id  amount  income datetime  age2        amount1
0  15093    1390   10.40        6  0-40   (1000, 5000]
1  15064    7952    4.40        0  0-40  (5000, 10000]
2  15080     503    5.72        5  0-40    (200, 1000]
3  15068    1668    3.19        5  0-40   (1000, 5000]
4  15019    6710    3.20        0  0-40  (5000, 10000]


In [6]:
#方法2：使用聚类法实现离散
data = df['amount']
data_reshape = data.values.reshape(data.shape[0],1)
model_kmeans = KMeans(n_clusters=4, random_state=0)
kmeans_result = model_kmeans.fit_predict(data_reshape)
df['amount2'] = kmeans_result
print(df.head(3))

      id  amount  income datetime  age2        amount1  amount2
0  15093    1390   10.40        6  0-40   (1000, 5000]        2
1  15064    7952    4.40        0  0-40  (5000, 10000]        1
2  15080     503    5.72        5  0-40    (200, 1000]        2


In [7]:
#方法3：使用4分位数实现离散化
df['amount3'] = pd.qcut(df['amount'], 4, labels=['bad', 'medium', 'good', 'awesome'])  # 按四分位数进行分隔
df = df.drop('amount', 1)  # 丢弃名为amount的列
print (df.head(5))

      id  income datetime  age2        amount1  amount2  amount3
0  15093   10.40        6  0-40   (1000, 5000]        2      bad
1  15064    4.40        0  0-40  (5000, 10000]        1  awesome
2  15080    5.72        5  0-40    (200, 1000]        2      bad
3  15068    3.19        5  0-40   (1000, 5000]        2      bad
4  15019    3.20        0  0-40  (5000, 10000]        1  awesome


In [8]:
# 针对连续数据的二值化
data_shap = df['income']
data_shap_new = data_shap.values.reshape(-1, 1)
binarizer_scaler = preprocessing.Binarizer(threshold=data_shap_new.mean())  # 建立Binarizer模型对象
income_tmp = binarizer_scaler.fit_transform(data_shap_new)  # Binarizer标准化转换
income_tmp.resize(data_shap_new.shape)  # 转换数据形状
df['income'] = income_tmp  # Binarizer标准化转换
print (df.head(5))  # 打印输出前5条数据

      id  income datetime  age2        amount1  amount2  amount3
0  15093     1.0        6  0-40   (1000, 5000]        2      bad
1  15064     1.0        0  0-40  (5000, 10000]        1  awesome
2  15080     1.0        5  0-40    (200, 1000]        2      bad
3  15068     0.0        5  0-40   (1000, 5000]        2      bad
4  15019     0.0        0  0-40  (5000, 10000]        1  awesome
