In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Figure text settings: put SimHei first in the sans-serif font list
# (presumably so Chinese labels render - confirm the font is installed) and
# turn off the Unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.sans-serif':['SimHei'],
    'axes.unicode_minus':False,
})

# 读数据

In [2]:
# Read the offline coupon records; the two date columns are parsed into
# datetime64 so they can be subtracted from each other later on.
date_columns=['Date_received','Date']
data=pd.read_csv('ccf_offline_stage1_train_test.csv',parse_dates=date_columns)
# Column dtypes and non-null counts, to see which fields have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7142 entries, 0 to 7141
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   User_id        7142 non-null   int64         
 1   Merchant_id    7142 non-null   int64         
 2   Coupon_id      4315 non-null   float64       
 3   Discount_rate  4315 non-null   object        
 4   Distance       6742 non-null   float64       
 5   Date_received  4315 non-null   datetime64[ns]
 6   Date           3144 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(2), object(1)
memory usage: 390.7+ KB


In [3]:
# Show the first five rows of the loaded frame.
data.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,NaT,2016-02-17
1,1439408,4663,11002.0,150:20:00,1.0,2016-05-28,NaT
2,1439408,2632,8591.0,20:01,0.0,2016-02-17,NaT
3,1439408,2632,1078.0,20:01,0.0,2016-03-19,NaT
4,1439408,2632,8591.0,20:01,0.0,2016-06-13,NaT


# 加标签

In [4]:
# Label every row with one of three classes:
#   -1  ordinary purchase: no coupon was received (the default)
#    1  positive: coupon received and used within 15 days
#    0  negative: coupon received but never used, or used after 15 days
# Days between receiving the coupon and purchasing; NaN when either date is
# missing, and NaN compares False in both tests below.
days_gap=(data['Date']-data['Date_received']).dt.days
true_index=days_gap<=15
False_index1=days_gap>15
# Coupon received but no purchase date recorded.
False_index2=(data['Date'].isnull()&data['Date_received'].notnull())

# Start from the default class, then overwrite the two coupon classes. The
# three masks never overlap, so the order of the assignments is irrelevant.
data['label']=-1
data.loc[true_index,'label']=1
data.loc[(False_index1|False_index2),'label']=0

data['label'].value_counts()

 0    4052
-1    2827
 1     263
Name: label, dtype: int64

# 构造特征

## 店铺距离

In [5]:
# Distance contains missing values; count them.
data['Distance'].isnull().sum()
# Three ways of filling missing values are shown below. Only example 1 writes
# back into `data`; examples 2 and 3 return new Series that are not assigned.
# Example 1: assign the column mean directly to the missing entries.
data.loc[data['Distance'].isnull(),'Distance']=data['Distance'].mean()
# Example 2: fill with a single value through fillna.
data['Distance'].fillna(value=data['Distance'].mean())
# Example 3: fill from the neighbouring row. ffill() is used instead of
# fillna(method='ffill'), which recent pandas versions deprecate; bfill() is
# the backward-looking counterpart. As the last expression, this Series is
# what the cell displays.
data['Distance'].ffill()

0       0.0
1       1.0
2       0.0
3       0.0
4       0.0
       ... 
7137    4.0
7138    4.0
7139    4.0
7140    4.0
7141    4.0
Name: Distance, Length: 7142, dtype: float64

## 折扣力度

In [6]:
# data['Discount_rate'].value_counts()

# Tell apart the two coupon formats and the orders without any discount.
def fun(i):
    """Convert one raw Discount_rate entry into a numeric rate.

    Three input shapes are recognised from the text form of ``i``:

    * contains ``':'`` - the first two colon-separated fields ``a`` and ``b``
      are read as integers and ``1 - b/a`` is returned (presumably a
      "spend a, save b" coupon; any further fields are ignored).
    * otherwise contains ``'.'`` - ``float(i)`` is returned unchanged.
    * anything else (for example a missing value) - ``1`` is returned,
      i.e. no discount.
    """
    text=str(i)
    if ':' in text:
        threshold,reduction=text.split(':')[:2]
        return 1-int(reduction)/int(threshold)
    if '.' in text:
        return float(i)
    return 1
# Convert every raw Discount_rate entry with fun() and store the numeric
# result in a new column, then display that column.
data['Discount_rate_new']=data['Discount_rate'].map(fun)
data['Discount_rate_new']

0       1.000000
1       0.866667
2       0.950000
3       0.950000
4       0.950000
          ...   
7137    1.000000
7138    1.000000
7139    1.000000
7140    1.000000
7141    1.000000
Name: Discount_rate_new, Length: 7142, dtype: float64

## 优惠券的流行程度
+ 优惠券被用掉的数/优惠券发出去的数

In [7]:
# Coupon popularity = coupons used / coupons issued, per Coupon_id.
# Within each coupon group the numerator is the number of rows with label==1
# and the denominator is the group size, so the ratio is the mean of the
# (label==1) mask. Rows whose Coupon_id is missing form no group.
coupon_popu=data[['Coupon_id','label']].groupby('Coupon_id').agg(lambda s:(s==1).mean())
coupon_popu.columns=['coupon_popu']
# Join the per-coupon rate back onto every row. A 'coupon_popu' column left
# over from an earlier run of this cell is dropped first; otherwise running
# the cell again would yield coupon_popu_x / coupon_popu_y instead.
data=pd.merge(data.drop(columns=['coupon_popu'],errors='ignore'),coupon_popu,
             left_on='Coupon_id',right_index=True,
             how='left')
data

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu
0,1439408,2632,,,0.0,NaT,2016-02-17,-1,1.000000,
1,1439408,4663,11002.0,150:20:00,1.0,2016-05-28,NaT,0,0.866667,0.0
2,1439408,2632,8591.0,20:01,0.0,2016-02-17,NaT,0,0.950000,0.0
3,1439408,2632,1078.0,20:01,0.0,2016-03-19,NaT,0,0.950000,0.0
4,1439408,2632,8591.0,20:01,0.0,2016-06-13,NaT,0,0.950000,0.0
...,...,...,...,...,...,...,...,...,...,...
7137,5515992,5393,,,4.0,NaT,2016-05-09,-1,1.000000,
7138,5515992,5393,,,4.0,NaT,2016-06-23,-1,1.000000,
7139,5515992,5393,,,4.0,NaT,2016-06-29,-1,1.000000,
7140,5515992,5393,,,4.0,NaT,2016-04-11,-1,1.000000,


## 用户相关特征

### 用户领取的优惠券数量 
+ 用户对应优惠券不为空的数量

In [8]:
# Number of coupons each user received = rows per User_id whose Coupon_id is
# not null (count() skips missing values).
user_get=data[['User_id','Coupon_id']].groupby('User_id').count()
user_get.columns=['user_get']
# Join the per-user count back onto every row. A 'user_get' column left over
# from an earlier run of this cell is dropped first; otherwise running the
# cell again would yield user_get_x / user_get_y instead.
data=pd.merge(data.drop(columns=['user_get'],errors='ignore'),user_get,
             left_on='User_id',right_index=True,
             how='left')
data

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu,user_get
0,1439408,2632,,,0.0,NaT,2016-02-17,-1,1.000000,,5
1,1439408,4663,11002.0,150:20:00,1.0,2016-05-28,NaT,0,0.866667,0.0,5
2,1439408,2632,8591.0,20:01,0.0,2016-02-17,NaT,0,0.950000,0.0,5
3,1439408,2632,1078.0,20:01,0.0,2016-03-19,NaT,0,0.950000,0.0,5
4,1439408,2632,8591.0,20:01,0.0,2016-06-13,NaT,0,0.950000,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...
7137,5515992,5393,,,4.0,NaT,2016-05-09,-1,1.000000,,0
7138,5515992,5393,,,4.0,NaT,2016-06-23,-1,1.000000,,0
7139,5515992,5393,,,4.0,NaT,2016-06-29,-1,1.000000,,0
7140,5515992,5393,,,4.0,NaT,2016-04-11,-1,1.000000,,0


### 用户消费过的优惠券数量
+ 统计各用户 label 为1的数量
+ 作业1

## 商家优惠券的流行程度
+ 商家发出去且被用掉的优惠券数/商家发出去的优惠券数

In [9]:
# Merchant coupon popularity = coupons used / coupons issued, per Merchant_id.
merchat_popu=data[['Merchant_id','Coupon_id','label']].groupby('Merchant_id').agg({
                                            # rows with label==1
                                            'label':lambda s:(s==1).sum(),
                                            # rows with a non-null Coupon_id
                                            'Coupon_id':'count'})
# A merchant with no issued coupons gives 0/0, which pandas stores as NaN.
merchat_popu['merchat_popu']=merchat_popu['label']/merchat_popu['Coupon_id']
# Join only the ratio back onto every row. A 'merchat_popu' column left over
# from an earlier run of this cell is dropped first; otherwise running the
# cell again would yield merchat_popu_x / merchat_popu_y instead.
data=pd.merge(data.drop(columns=['merchat_popu'],errors='ignore'),merchat_popu[['merchat_popu']],
             left_on='Merchant_id',right_index=True,
             how='left')
data

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu,user_get,merchat_popu
0,1439408,2632,,,0.0,NaT,2016-02-17,-1,1.000000,,5,0.0
1,1439408,4663,11002.0,150:20:00,1.0,2016-05-28,NaT,0,0.866667,0.0,5,0.0
2,1439408,2632,8591.0,20:01,0.0,2016-02-17,NaT,0,0.950000,0.0,5,0.0
3,1439408,2632,1078.0,20:01,0.0,2016-03-19,NaT,0,0.950000,0.0,5,0.0
4,1439408,2632,8591.0,20:01,0.0,2016-06-13,NaT,0,0.950000,0.0,5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7137,5515992,5393,,,4.0,NaT,2016-05-09,-1,1.000000,,0,
7138,5515992,5393,,,4.0,NaT,2016-06-23,-1,1.000000,,0,
7139,5515992,5393,,,4.0,NaT,2016-06-29,-1,1.000000,,0,
7140,5515992,5393,,,4.0,NaT,2016-04-11,-1,1.000000,,0,


## 交互关系

### 用户在商家使用优惠券的次数

In [15]:
# Times each user used a coupon at each merchant = rows with label==1 per
# (User_id, Merchant_id) pair.
UM_used=data[['User_id','Merchant_id','label']].groupby(['User_id','Merchant_id']).agg(
                                                            lambda s:(s==1).sum())
UM_used.columns=['UM_used']
# Join the pair-level count back onto every row; the two key columns are
# matched against the two levels of the grouped index. A 'UM_used' column left
# over from an earlier run of this cell is dropped first; otherwise running
# the cell again would yield UM_used_x / UM_used_y instead.
data=pd.merge(data.drop(columns=['UM_used'],errors='ignore'),UM_used,
             left_on=['User_id','Merchant_id'],
            right_index=True,
             how='left')
data

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu,user_get,merchat_popu,UM_used
0,1439408,2632,,,0.0,NaT,2016-02-17,-1,1.000000,,5,0.0,0
1,1439408,4663,11002.0,150:20:00,1.0,2016-05-28,NaT,0,0.866667,0.0,5,0.0,0
2,1439408,2632,8591.0,20:01,0.0,2016-02-17,NaT,0,0.950000,0.0,5,0.0,0
3,1439408,2632,1078.0,20:01,0.0,2016-03-19,NaT,0,0.950000,0.0,5,0.0,0
4,1439408,2632,8591.0,20:01,0.0,2016-06-13,NaT,0,0.950000,0.0,5,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7137,5515992,5393,,,4.0,NaT,2016-05-09,-1,1.000000,,0,,0
7138,5515992,5393,,,4.0,NaT,2016-06-23,-1,1.000000,,0,,0
7139,5515992,5393,,,4.0,NaT,2016-06-29,-1,1.000000,,0,,0
7140,5515992,5393,,,4.0,NaT,2016-04-11,-1,1.000000,,0,,0


### 用户在商家领取的优惠券数
+ 作业2

### 用户在商家消费的次数
+ 作业3


# 样本数均衡

In [21]:
# Count the rows in each label class to see how unbalanced the classes are.
data['label'].value_counts()

 0    4052
-1    2827
 1     263
Name: label, dtype: int64

## 提取正样本均衡

In [23]:
# Number of positive rows to draw.
n=10
# Draw n rows with label==1. random_state is fixed so that a fresh
# "Restart & Run All" picks the same rows every time.
data_T=data[data['label']==1].sample(n,random_state=42)
# Display the drawn rows.
data_T

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,Discount_rate_new,coupon_popu,user_get,merchat_popu,UM_used
1898,5931720,1379,1579.0,30:05:00,0.0,2016-03-25,2016-03-26,1,0.833333,0.214286,3,0.1875,1
1587,2793314,904,7534.0,20:05,0.0,2016-03-24,2016-04-01,1,0.75,1.0,15,1.0,2
4342,473197,6485,2079.0,30:05:00,0.0,2016-05-21,2016-05-25,1,0.833333,0.266667,2,0.3,1
5450,4592340,760,2418.0,30:05:00,2.0,2016-06-15,2016-06-18,1,0.833333,0.03937,2,0.060914,1
561,3045728,3293,11364.0,30:05:00,0.0,2016-02-28,2016-03-05,1,0.833333,0.5,3,0.5,2
3113,1758899,7555,5582.0,30:05:00,0.0,2016-01-30,2016-02-07,1,0.833333,0.583333,13,0.085938,5
4408,392173,6901,2366.0,30:05:00,2.0,2016-05-03,2016-05-15,1,0.833333,0.065217,2,0.054545,1
4266,5195472,450,13295.0,20:05,0.0,2016-05-13,2016-05-17,1,0.75,0.5,7,0.026923,4
3385,3028838,2436,3992.0,30:05:00,0.0,2016-05-15,2016-05-19,1,0.833333,0.041667,3,0.034483,1
3101,2337638,7838,12429.0,0.95,1.0,2016-04-11,2016-04-11,1,0.95,0.5,11,0.2,1


## 抽样负样本

## 合并抽样的正负样本

# 建模预测

## 提取特征
+ X(9个)和Y

## 数据标准化

## 调用模型等……