In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False

In [2]:
data=pd.read_csv('O2O数据.csv',
           parse_dates=['Date_received','Date'])

In [3]:
# 先假定所有用户都是普通用户
## 没有领优惠券，过来消费： 普通用户（-1）
data['label']=-1
## 领了优惠券，15天以内过来消费：正（1）
true_index=(data['Date']-data['Date_received']).dt.days<=15
data.loc[true_index,'label']=1
## 领了优惠券，领了优惠券没消费：负（0）
False_index2=(data['Date_received'].notnull()&data['Date'].isnull())
## 领了优惠券，过了15天后来消费：负（0）
False_index1=(data['Date']-data['Date_received']).dt.days>15
data.loc[(False_index1|False_index2),'label']=0

In [4]:
# 用户距离存在空值
data['Distance'].isnull().sum()
# 对空值进行填充
#例1：直接对控制进行赋值
data.loc[data['Distance'].isnull(),'Distance']=data['Distance'].mean()
#例2：利用函数填充一个数
data['Distance'].fillna(value=data['Distance'].mean())
#例3：利用相邻数据进行填充
data['Distance'].fillna(method='ffill',  # bfill
                       inplace=False)

0          0.0
1          1.0
2          0.0
3          0.0
4          0.0
          ... 
1048570    1.0
1048571    1.0
1048572    1.0
1048573    1.0
1048574    1.0
Name: Distance, Length: 1048575, dtype: float64

In [5]:
# 两种优惠券变换
def fun(i):
    if ':' in str(i):
        list1=str(i).split(':')
        return 1-int(list1[1])/int(list1[0])
    elif '.' in str(i):
        return float(i)
    else:
        return 1
data['Discount_rate_new']=data['Discount_rate'].apply(fun)

In [6]:
coupon_popu=data[['Coupon_id','label']].groupby('Coupon_id').agg(lambda x: sum(x==1)/len(x))
coupon_popu.columns=['coupon_popu']
# 把求的结果拼回去
data=pd.merge(data,coupon_popu,
             left_on='Coupon_id',right_index=True,
             how='left')

In [7]:
user_get=data[['User_id','Coupon_id']].groupby('User_id').agg(lambda x: sum(x.notnull()))
user_get.columns=['user_get']
data=pd.merge(data,user_get,
             left_on='User_id',right_index=True,
             how='left')

### 作业1：统计各用户 label 为1的数量

In [8]:
user_label1=data[['User_id','label']].groupby('User_id').agg(lambda x: sum(x==1))
user_label1.columns=['user_label1']
data=pd.merge(data,user_label1,
             left_on='User_id',right_index=True,
             how='left')
user_label1

Unnamed: 0_level_0,user_label1
User_id,Unnamed: 1_level_1
4,0
35,0
64,0
144,0
147,0
...,...
7360928,1
7360952,0
7360961,0
7360966,0


In [9]:
merchant_popu=data[['Merchant_id','Coupon_id','label']].groupby('Merchant_id').agg({
                                            'label':lambda x:sum(x==1),
                                            'Coupon_id':lambda x: sum(x.notnull())})
merchant_popu['merchant_popu']=merchant_popu['label']/merchant_popu['Coupon_id']
data=pd.merge(data,merchant_popu[['merchant_popu']],
             left_on='Merchant_id',right_index=True,
             how='left')

In [10]:
UM_used=data[['User_id','Merchant_id','label']].groupby(['User_id','Merchant_id']).agg(
                                                            lambda x: sum(x==1))
UM_used.columns=['UM_used']
data=pd.merge(data,UM_used,
             left_on=['User_id','Merchant_id'],
            right_index=True,
             how='left')

### 作业2：用户在商家领取的优惠券数

In [11]:
data

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu,user_get,user_label1,merchant_popu,UM_used
0,1439408,2632,,,0.0,NaT,2016-02-17,-1,1.000000,,5,0,0.000000,0
1,1439408,4663,11002.0,150:20:00,1.0,2016-05-28,NaT,0,0.866667,0.001727,5,0,0.004378,0
2,1439408,2632,8591.0,20:01,0.0,2016-02-17,NaT,0,0.950000,0.000000,5,0,0.000000,0
3,1439408,2632,1078.0,20:01,0.0,2016-03-19,NaT,0,0.950000,0.000000,5,0,0.000000,0
4,1439408,2632,8591.0,20:01,0.0,2016-06-13,NaT,0,0.950000,0.000000,5,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,5186127,4043,3996.0,30:05:00,1.0,2016-02-03,2016-02-10,1,0.833333,0.222857,2,1,0.045272,1
1048571,5186127,4043,3996.0,30:05:00,1.0,2016-02-12,NaT,0,0.833333,0.222857,2,1,0.045272,1
1048572,5186127,4043,,,1.0,NaT,2016-05-30,-1,1.000000,,2,1,0.045272,1
1048573,5186127,4043,,,1.0,NaT,2016-05-01,-1,1.000000,,2,1,0.045272,1


In [12]:
user_M_get=data[['User_id','Merchant_id','label']].groupby(['User_id','Merchant_id']).count()
user_M_get

Unnamed: 0_level_0,Unnamed: 1_level_0,label
User_id,Merchant_id,Unnamed: 2_level_1
4,1433,1
4,1469,1
35,3381,4
64,2146,1
144,1553,1
...,...,...
7360928,3532,7
7360952,3403,1
7360961,3621,1
7360966,3381,1


In [13]:
user_M_get.columns=['user_M_get']
data=pd.merge(data,user_M_get,
             left_on=['User_id','Merchant_id'],right_index=True,
             how='left')

### 作业3：用户在商家消费的次数

In [14]:
data['labelX']=0
## 普通消费日期
False_index2X=(data['Date'].notnull()&data['Coupon_id'].isnull())
## 用优惠券消费日期
False_index1X=(data['Date'].notnull()&data['Coupon_id'].notnull())
data.loc[(False_index1X|False_index2X),'labelX']=1

In [15]:
user_M_C=data[['User_id','Merchant_id','labelX']].groupby(['User_id','Merchant_id']).agg(lambda x: sum(x==1))
user_M_C

Unnamed: 0_level_0,Unnamed: 1_level_0,labelX
User_id,Merchant_id,Unnamed: 2_level_1
4,1433,0
4,1469,0
35,3381,0
64,2146,0
144,1553,0
...,...,...
7360928,3532,6
7360952,3403,0
7360961,3621,0
7360966,3381,0


In [16]:
user_M_C.columns=['user_M_C']
data=pd.merge(data,user_M_C,
             left_on=['User_id','Merchant_id'],right_index=True,
             how='left')

In [17]:
data

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu,user_get,user_label1,merchant_popu,UM_used,user_M_get,labelX,user_M_C
0,1439408,2632,,,0.0,NaT,2016-02-17,-1,1.000000,,5,0,0.000000,0,6,1,3
1,1439408,4663,11002.0,150:20:00,1.0,2016-05-28,NaT,0,0.866667,0.001727,5,0,0.004378,0,1,0,0
2,1439408,2632,8591.0,20:01,0.0,2016-02-17,NaT,0,0.950000,0.000000,5,0,0.000000,0,6,0,3
3,1439408,2632,1078.0,20:01,0.0,2016-03-19,NaT,0,0.950000,0.000000,5,0,0.000000,0,6,0,3
4,1439408,2632,8591.0,20:01,0.0,2016-06-13,NaT,0,0.950000,0.000000,5,0,0.000000,0,6,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,5186127,4043,3996.0,30:05:00,1.0,2016-02-03,2016-02-10,1,0.833333,0.222857,2,1,0.045272,1,6,1,5
1048571,5186127,4043,3996.0,30:05:00,1.0,2016-02-12,NaT,0,0.833333,0.222857,2,1,0.045272,1,6,0,5
1048572,5186127,4043,,,1.0,NaT,2016-05-30,-1,1.000000,,2,1,0.045272,1,6,1,5
1048573,5186127,4043,,,1.0,NaT,2016-05-01,-1,1.000000,,2,1,0.045272,1,6,1,5


In [18]:
data['label'].value_counts()
n=10
data_T=data[data['label']==1].sample(n)

### 抽样负样本

In [41]:
data_F=data[data['label']==0].sample(n)
data_F

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu,user_get,user_label1,merchant_popu,UM_used,user_M_get,labelX,user_M_C
788480,2941104,450,8555.0,30:05:00,4.0,2016-01-27,NaT,0,0.833333,0.004062,4,0,0.016409,0,1,0,0
638326,7200632,5717,8192.0,20:05,10.0,2016-05-28,NaT,0,0.75,0.012409,1,0,0.013521,0,1,0,0
511073,6989107,2709,2840.0,100:10:00,6.0,2016-01-28,NaT,0,0.9,0.001546,5,0,0.003169,0,1,0,0
143500,895251,760,2418.0,30:05:00,2.0,2016-06-05,NaT,0,0.833333,0.038473,3,0,0.056495,0,1,0,0
992420,4880542,2934,5686.0,30:05:00,3.0,2016-03-23,NaT,0,0.833333,0.157312,3,0,0.157312,0,4,0,3
638351,3078588,3381,1807.0,300:30:00,2.0,2016-01-26,NaT,0,0.9,0.004864,3,0,0.011912,0,2,0,1
38263,2213792,6485,9407.0,100:10:00,1.0,2016-02-03,NaT,0,0.9,0.00931,5,0,0.233439,0,2,0,1
922860,7081654,913,9759.0,20:01,0.0,2016-05-20,NaT,0,0.95,0.257143,2,0,0.259259,0,4,0,2
1035038,7232962,5341,111.0,30:05:00,0.0,2016-02-04,NaT,0,0.833333,0.056352,1,0,0.115744,0,1,0,0
927951,3547501,2146,11173.0,100:10:00,3.0,2016-01-29,NaT,0,0.9,0.002488,2,0,0.003152,0,1,0,0


### 合并抽样的正负样本

In [42]:
data_clear=pd.concat([data_T,data_F])
data_clear

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu,user_get,user_label1,merchant_popu,UM_used,user_M_get,labelX,user_M_C
775445,811687,2934,5686.0,30:05:00,8.0,2016-03-22,2016-03-27,1,0.833333,0.157312,2,1,0.157312,1,2,1,2
510831,4646191,4579,1708.0,20:05,1.0,2016-03-22,2016-03-24,1,0.75,0.271845,7,3,0.094308,3,41,1,41
793160,3852984,7555,9871.0,30:05:00,8.0,2016-04-06,2016-04-09,1,0.833333,0.019321,3,1,0.044739,1,3,1,3
967979,4694002,1631,13663.0,20:05,2.0,2016-03-27,2016-04-02,1,0.75,0.218182,1,1,0.218182,1,2,1,2
666165,1843189,5265,6117.0,100:20:00,2.362526,2016-05-07,2016-05-07,1,0.8,0.5,1,1,0.117647,1,1,1,1
335392,6672026,1169,4627.0,50:20:00,0.0,2016-02-16,2016-02-16,1,0.6,0.382239,1,1,0.061086,1,1,1,1
1011933,4429090,5341,111.0,30:05:00,0.0,2016-01-31,2016-02-04,1,0.833333,0.056352,8,1,0.115744,1,3,1,1
294371,4666565,3403,11214.0,20:05,0.0,2016-06-01,2016-06-06,1,0.75,0.3,4,2,0.372385,2,14,1,14
906843,2597665,6885,3105.0,10:01,1.0,2016-01-10,2016-01-22,1,0.9,0.222222,6,2,0.164835,2,6,1,3
187626,2670230,8181,1560.0,20:05,0.0,2016-03-26,2016-04-02,1,0.75,0.317308,5,1,0.068085,1,5,1,2


### 预测

In [43]:
data_clear.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label', 'Discount_rate_new', 'coupon_popu',
       'user_get', 'user_label1', 'merchant_popu', 'UM_used', 'user_M_get',
       'labelX', 'user_M_C'],
      dtype='object')

In [44]:
X = data_clear[['Distance','Discount_rate_new','coupon_popu','user_get','user_label1','merchant_popu','UM_used','user_M_get','user_M_C']]
Y = data_clear['label']

# 数据标准化

In [45]:
from sklearn.preprocessing import Normalizer
X = Normalizer().fit_transform(X)

In [46]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,)

In [47]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train,Y_train)

DecisionTreeClassifier()

In [48]:
from sklearn.metrics import roc_auc_score,confusion_matrix,classification_report
pre = model.predict_proba(X_test)
auc = roc_auc_score(Y_test,pre[:,1])
print(auc)
pre = model.predict(X_test)
con_mat = confusion_matrix(Y_test,pre)
print(con_mat)
print(classification_report(Y_test,pre))

1.0
[[1 0]
 [0 3]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         3

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

