# 用决策树对优惠券使用进行预测
为了贴近实际生活和应用，本课程作业以实际数据集的处理为主。提供用户在2016年1月1日至2016年6月30日之间真实线上线下消费行为，预测用户在2016年7月领取优惠券后15天以内的是否使用。 注意： 为了保护用户和商家的隐私，所有数据均作匿名处理，同时采用了有偏采样和必要过滤。我们本次练习使用的数据集是ccf_offline_stage1_train.csv，数据及特征描述如下所示：

User_id：用户ID Merchant_id：商户ID Coupon_id：优惠券ID；null表示无优惠券消费，此时Discount_rate和Date_received字段无意义 Discount_rate：优惠率；x在[0,1]范围内时，代表折扣率；x:y表示满x减y。单位是元 Distance：user经常活动的地点离该merchant的最近门店距离是x*500米（如果是连锁店，则取最近的一家门店），x在[0,10]区间；null表示无此信息，0表示低于500米，10表示大于5公里； Date_received：领取优惠券日期 Date：消费日期，（Date - Date_received <= 15) 表示领取优惠券且在15天内使用，即正样本，y = 1；(Date - Date_received > 15)表示领取优惠券未在15天内使用，即负样本，y = 0

# 导入读取数据的工具包

In [1]:
import pandas as pd
import numpy as np

# 读取数据集
数据放在/data/course_data/AI路径下,我们先读取数据

In [2]:
train_data = pd.read_csv('/data/course_data/AI/ccf_offline_stage1_train.csv')

# 查看数据情况
读取数据后，一般首先看一下数据的基本情况，这样我们能够对数据有一个整体的了解

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
 #   Column         Dtype  
---  ------         -----  
 0   User_id        int64  
 1   Merchant_id    int64  
 2   Coupon_id      float64
 3   Discount_rate  object 
 4   Distance       float64
 5   Date_received  float64
 6   Date           float64
dtypes: float64(4), int64(2), object(1)
memory usage: 93.7+ MB


# 查看前5条样本

In [4]:
train_data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,
2,1439408,2632,8591.0,20:1,0.0,20160217.0,
3,1439408,2632,1078.0,20:1,0.0,20160319.0,
4,1439408,2632,8591.0,20:1,0.0,20160613.0,


In [5]:
print(train_data.shape)

(1754884, 7)


In [6]:
train_data.isnull().sum()

User_id               0
Merchant_id           0
Coupon_id        701602
Discount_rate    701602
Distance         106003
Date_received    701602
Date             977900
dtype: int64

# 问题一：丢弃带有缺失值的数据

In [7]:
# train_data.dropna(subset=['Coupon_id'], inplace=True)
# train_data['Date'].fillna('20991231', inplace=True)
# train_data['Distance'].fillna(train_data['Distance'].mode()[0], inplace=True)

train_data.dropna(how='any', inplace=True)

In [8]:
print(train_data.shape)# 打印删除缺失数据后的数据大小

(67165, 7)


### Discount_rate是object类型的，object在pandas中代表字符串，字符串类型不能输入模型中，所以需要改为数值类型

In [9]:
# 查看Discount_rate特征的唯一值
# [0,1] 表示折扣率
# x:y 表示满 x 减 y
print('Discount_rate 类型：\n',train_data['Discount_rate'].unique())

Discount_rate 类型：
 ['20:1' '20:5' '30:5' '50:10' '10:5' '50:20' '100:10' '30:10' '50:5'
 '30:1' '100:30' '0.8' '200:30' '100:20' '10:1' '200:20' '0.95' '5:1'
 '100:5' '100:50' '50:1' '20:10' '150:10' '0.9' '200:50' '150:20' '150:50'
 '200:5' '300:30' '100:1' '200:10' '150:30' '0.85' '0.6' '0.5' '300:20'
 '200:100' '300:50' '150:5' '300:10' '0.75' '0.7' '30:20' '50:30']


### 打折类型
### x:y 表示满 x 减 y，          将 x:y 类型的字符串设为1
### [0,1] 表示折扣率，           将 [0,1] 类型的字符串设为 0

In [10]:
# 定义转换Discount_rate特征的函数
def getDiscountType(row):
    if ':' in row:
        return 1
    else:
        return 0

In [11]:
def getDiscout(row):
    if ':' in row:
        x1, x2 = row.split(':')
        return float(x2) / float(x1)
    else:
        return row

In [12]:
def getDiscountCond(row):
    if ':' in row:
        x1, x2 = row.split(':')
        return float(x1)
    else:
        return 0

In [13]:
def getDiscountLimit(row):
    if ':' in row:
        x1, x2 = row.split(':')
        return float(x2)
    else:
        return 99999

# 问题二：将 Discount_rate 转化为数值特征

In [14]:
train_data['Discount_type'] = train_data['Discount_rate'].apply(getDiscountType)
train_data['DiscountCond'] = train_data['Discount_rate'].apply(getDiscountCond)
train_data['DiscountLimit'] = train_data['Discount_rate'].apply(getDiscountLimit)
train_data['Discount_rate'] = train_data['Discount_rate'].apply(getDiscout)

In [15]:
train_data.info()# 打印将Discount_rate特征转换为数值型特征后，数据的基本信息

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67165 entries, 6 to 1754880
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   User_id        67165 non-null  int64  
 1   Merchant_id    67165 non-null  int64  
 2   Coupon_id      67165 non-null  float64
 3   Discount_rate  67165 non-null  object 
 4   Distance       67165 non-null  float64
 5   Date_received  67165 non-null  float64
 6   Date           67165 non-null  float64
 7   Discount_type  67165 non-null  int64  
 8   DiscountCond   67165 non-null  float64
 9   DiscountLimit  67165 non-null  float64
dtypes: float64(6), int64(3), object(1)
memory usage: 5.6+ MB


# 导入模型，划分数据集的包

In [16]:
# 导入DecisionTreeClassifier模型
from sklearn.tree import DecisionTreeClassifier
# 导入 train_test_split，用于划分数据集和测试集
from sklearn.model_selection import train_test_split
# 导入 accuracy_score 评价指标
from sklearn.metrics import accuracy_score, roc_auc_score

# 问题三：为数据集添加一个label列

### 标注标签Label，标注哪些样本是正样本 y=1，哪些是负样本 y = 0
### 预测目标：用户在领取优惠券之后 15 之内的消费情况
### (Date - Date_received <= 15) 表示领取优惠券且在15天内使用，即正样本，y = 1
### (Date - Date_received > 15)   表示领取优惠券未在15天内使用，即负样本，y = 0

In [17]:
# 定义样本标签函数
def label(row):
    
    td = pd.to_datetime(row['Date'], format='%Y%m%d') - pd.to_datetime(row['Date_received'], format='%Y%m%d')
    
    if td <= pd.Timedelta(15, 'D'): # 如果消费日期和领取优惠券的时间间隔小于等于15，则标签设置为1
        return 1
    else:
        return 0

train_data['label'] = train_data.apply(label, axis=1)

In [18]:
# Date_received	Date
train_data['Date_received_year'] = pd.to_datetime(train_data['Date_received'], format='%Y%m%d').dt.year
train_data['Date_received_month'] = pd.to_datetime(train_data['Date_received'], format='%Y%m%d').dt.month
train_data['Date_received_day'] = pd.to_datetime(train_data['Date_received'], format='%Y%m%d').dt.day
train_data['Date_received_weekday'] = pd.to_datetime(train_data['Date_received'], format='%Y%m%d').dt.weekday
train_data['Date_received_dayofyear'] = pd.to_datetime(train_data['Date_received'], format='%Y%m%d').dt.dayofyear

In [19]:
train_data[['Date_received', 'Date_received_year', 'Date_received_month', 'Date_received_day', 'Date_received_weekday']]

Unnamed: 0,Date_received,Date_received_year,Date_received_month,Date_received_day,Date_received_weekday
6,20160516.0,2016,5,16,0
33,20160515.0,2016,5,15,6
38,20160321.0,2016,3,21,0
69,20160523.0,2016,5,23,0
75,20160127.0,2016,1,27,2
...,...,...,...,...,...
1754833,20160129.0,2016,1,29,4
1754873,20160321.0,2016,3,21,0
1754877,20160504.0,2016,5,4,2
1754878,20160321.0,2016,3,21,0


知识点4：pandas关于时间的教程https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html

In [20]:
# 统计正负样本的分布
print(train_data['label'].value_counts())

1    57060
0    10105
Name: label, dtype: int64


In [21]:
train_data

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,Discount_type,DiscountCond,DiscountLimit,label,Date_received_year,Date_received_month,Date_received_day,Date_received_weekday,Date_received_dayofyear
6,1439408,2632,8591.0,0.05,0.0,20160516.0,20160613.0,1,20.0,1.0,0,2016,5,16,0,137
33,1113008,1361,11166.0,0.05,0.0,20160515.0,20160521.0,1,20.0,1.0,1,2016,5,15,6,136
38,2881376,8390,7531.0,0.25,0.0,20160321.0,20160329.0,1,20.0,5.0,1,2016,3,21,0,81
69,114747,6901,2366.0,0.166667,0.0,20160523.0,20160605.0,1,30.0,5.0,1,2016,5,23,0,144
75,114747,5341,111.0,0.166667,0.0,20160127.0,20160221.0,1,30.0,5.0,0,2016,1,27,2,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1754833,1437872,7706,416.0,0.1,4.0,20160129.0,20160202.0,1,100.0,10.0,1,2016,1,29,4,29
1754873,212662,2934,5686.0,0.166667,2.0,20160321.0,20160330.0,1,30.0,5.0,1,2016,3,21,0,81
1754877,212662,3021,3739.0,0.0333333,6.0,20160504.0,20160508.0,1,30.0,1.0,1,2016,5,4,2,125
1754878,212662,2934,5686.0,0.166667,2.0,20160321.0,20160322.0,1,30.0,5.0,1,2016,3,21,0,81


Tips:当正负样本比例差距过大时（比如正负样本比例为1:10时），此时的数据集称为非平衡数据集，非平衡数据集会影响我们的模型，
因此需要对数据集进行处理，方法包括：正采样、负采样、数据合成等，具体方法可以参考这篇文章：
https://blog.csdn.net/a8039974/article/details/83684841

# 问题四：对数据集进行划分：80%训练集 、20%测试集

In [22]:
# 划分训练集和验证集，比例分别为0.8、0.2
X_train, X_test, y_train, y_test = train_test_split(train_data[['Merchant_id', #'Coupon_id',	
                                                                'Discount_rate','DiscountCond', 'DiscountLimit',
                                                                'Distance',	#'Date_received','Date',
                                                                'Discount_type',
                                                                'Date_received', 'Date_received_year', 
                                                                'Date_received_month', 'Date_received_day', 
                                                                'Date_received_weekday']], 
                                                    train_data["label"], stratify=train_data["label"],
                                                    test_size=0.2,
                                                    random_state=3)

知识点5：在sklearn中划分数据集有多种方法，比如当采用k折交叉验证时使用KFold函数；当采用留一法时使用LeaveOneOut函数；
更多的数据集划分方法可以参考这篇文章：https://www.cnblogs.com/cmybky/p/11772655.html                      
知识点6:在模型中，我们使用了"User_id"等6个特征，我们使用的6个特征是否对预测我们的label都有意义呢？我们是否能够构造更多有效的特征来纳入模型，以提高我们模型的预测的效果呢？希望同学们能多多思考、多多探索！

In [23]:
# 查验训练样本的数量和类别分布
y_train.value_counts()

1    45648
0     8084
Name: label, dtype: int64

In [24]:
# 查验测试样本的数量和类别分布
y_test.value_counts()

1    11412
0     2021
Name: label, dtype: int64

# 初始化分类决策树模型， 树的深度为5层

In [74]:
model = DecisionTreeClassifier(random_state=1, max_depth=45,class_weight={0:6, 1:1},
                               min_samples_split=3)

sklearn中决策树的使用可以参考sklearn官方文档里面对于决策树的介绍：https://scikit-learn.org/stable/modules/tree.html

# 模型训练

In [75]:
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight={0: 6, 1: 1},
                       criterion='gini', max_depth=45, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=3, min_weight_fraction_leaf=0.0,
                       presort='deprecated', random_state=1, splitter='best')

# 问题五：模型预测

In [76]:
y_pred = model.predict(X_test)

# 模型评估

In [77]:
accuracy_score(y_test, y_pred)

0.7378843147472642

In [78]:
roc_auc_score(y_test, y_pred)

0.7013881409587692

# 将模型选择特征的标准改为entropy 

In [70]:
model = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=45,class_weight={0:6, 1:1},
                               min_samples_split=3)

# 模型训练

In [71]:
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight={0: 6, 1: 1},
                       criterion='entropy', max_depth=45, max_features=None,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=3, min_weight_fraction_leaf=0.0,
                       presort='deprecated', random_state=1, splitter='best')

# 问题五：模型预测

In [72]:
y_pred = model.predict(X_test)

# 评估

In [73]:
accuracy_score(y_test, y_pred)

0.7366932181939998

In [53]:
roc_auc_score(y_test, y_pred)

0.7005118703664103