In [1]:
# import libraries necessary for this project 倒库
import os, sys, pickle
 
import numpy as np
import pandas as pd
 
from datetime import date
 
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
 
# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
#打开数据集
dfoff = pd.read_csv('ccf_offline_stage1_train.csv',keep_default_na=False)

dftest = pd.read_csv('ccf_offline_stage1_test_revised.csv',keep_default_na=False)
 
dfoff.head(5)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0,,20160217.0
1,1439408,4663,11002.0,150:20,1,20160528.0,
2,1439408,2632,8591.0,20:1,0,20160217.0,
3,1439408,2632,1078.0,20:1,0,20160319.0,
4,1439408,2632,8591.0,20:1,0,20160613.0,


In [3]:
#简单的统计 用户使用优惠券的情况
print('有优惠卷，购买商品：%d' % dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] != 'null')].shape[0])
print('有优惠卷，未购商品：%d' % dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] == 'null')].shape[0])
print('无优惠卷，购买商品：%d' % dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] != 'null')].shape[0])
print('无优惠卷，未购商品：%d' % dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] == 'null')].shape[0])

有优惠卷，购买商品：75382
有优惠卷，未购商品：977900
无优惠卷，购买商品：701602
无优惠卷，未购商品：0


In [4]:
dfoff.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0,,20160217.0
1,1439408,4663,11002.0,150:20,1,20160528.0,
2,1439408,2632,8591.0,20:1,0,20160217.0,
3,1439408,2632,1078.0,20:1,0,20160319.0,
4,1439408,2632,8591.0,20:1,0,20160613.0,


In [5]:
print('Discount_rate 类型：\n',dfoff['Discount_rate'].unique()) #输出各式各样的打折形式，用unique可以消除重复的条目

Discount_rate 类型：
 ['null' '150:20' '20:1' '200:20' '30:5' '50:10' '10:5' '100:10' '200:30'
 '20:5' '30:10' '50:5' '150:10' '100:30' '200:50' '100:50' '300:30' '50:20'
 '0.9' '10:1' '30:1' '0.95' '100:5' '5:1' '100:20' '0.8' '50:1' '200:10'
 '300:20' '100:1' '150:30' '300:50' '20:10' '0.85' '0.6' '150:50' '0.75'
 '0.5' '200:5' '0.7' '30:20' '300:10' '0.2' '50:30' '200:100' '150:5']


In [6]:
# Convert Discount_rate and Distance
def getDiscountType(row):#定义打折类型
    if row == 'null':
        return 'null'
    elif ':' in row:#满减
        return 1
    else:
        return 0#折扣率

def convertRate(row): #统一为折扣率
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

In [7]:
def getDiscountMan(row): #返回满多少才可以减的满
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row): #返回满多少才可以减的减多少
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

In [8]:
def processData(df):
    
    # convert discount_rate
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    
    print(df['discount_rate'].unique())
    
    return df

In [9]:
dfoff = processData(dfoff)#打印出打折率
dftest = processData(dftest)

[ 1.          0.86666667  0.95        0.9         0.83333333  0.8         0.5
  0.85        0.75        0.66666667  0.93333333  0.7         0.6
  0.96666667  0.98        0.99        0.975       0.33333333  0.2         0.4       ]
[ 0.83333333  0.9         0.96666667  0.8         0.95        0.75        0.98
  0.5         0.86666667  0.6         0.66666667  0.7         0.85
  0.33333333  0.94        0.93333333  0.975       0.99      ]


In [10]:
dfoff.head(5)#表中增加了4个特征

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,discount_jian
0,1439408,2632,,,0,,20160217.0,,1.0,0,0
1,1439408,4663,11002.0,150:20,1,20160528.0,,1.0,0.866667,150,20
2,1439408,2632,8591.0,20:1,0,20160217.0,,1.0,0.95,20,1
3,1439408,2632,1078.0,20:1,0,20160319.0,,1.0,0.95,20,1
4,1439408,2632,8591.0,20:1,0,20160613.0,,1.0,0.95,20,1


In [11]:
# convert distance
dfoff['distance'] = dfoff['Distance'].replace('null', -1).astype(int)#-1代替null
print(dfoff['distance'].unique())
dftest['distance'] = dftest['Distance'].replace('null', -1).astype(int)
print(dftest['distance'].unique())

[ 0  1 -1  2 10  4  7  9  3  5  6  8]
[ 1 -1  5  2  0 10  3  6  7  4  9  8]


In [12]:
dfoff.head(5)#可以看出增加了distance特征

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,discount_jian,distance
0,1439408,2632,,,0,,20160217.0,,1.0,0,0,0
1,1439408,4663,11002.0,150:20,1,20160528.0,,1.0,0.866667,150,20,1
2,1439408,2632,8591.0,20:1,0,20160217.0,,1.0,0.95,20,1,0
3,1439408,2632,1078.0,20:1,0,20160319.0,,1.0,0.95,20,1,0
4,1439408,2632,8591.0,20:1,0,20160613.0,,1.0,0.95,20,1,0


In [13]:
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[date_received != 'null'])

date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[date_buy != 'null'])

print('优惠卷收到日期从',date_received[0],'到',date_received[-1])
print('消费日期从',date_buy[0],'到',date_buy[-1])

优惠卷收到日期从 20160101 到 20160615
消费日期从 20160101 到 20160630


In [14]:
def getWeekday(row):
    if row == 'null':
        return row
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1

In [15]:
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

# weekday_type :  周六和周日为1，其他为0
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6,7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6,7] else 0)

In [16]:
# change weekday to one-hot encoding 
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
#print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

In [17]:
dfoff.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,distance,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,,,0,,20160217.0,,1.0,0,...,0,,0,0,0,0,0,0,0,0
1,1439408,4663,11002.0,150:20,1,20160528.0,,1.0,0.866667,150,...,1,6.0,1,0,0,0,0,0,1,0
2,1439408,2632,8591.0,20:1,0,20160217.0,,1.0,0.95,20,...,0,3.0,0,0,0,1,0,0,0,0
3,1439408,2632,1078.0,20:1,0,20160319.0,,1.0,0.95,20,...,0,6.0,1,0,0,0,0,0,1,0
4,1439408,2632,8591.0,20:1,0,20160613.0,,1.0,0.95,20,...,0,1.0,0,1,0,0,0,0,0,0


In [18]:
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print('共有特征：',len(original_feature),'个')
print(original_feature)

共有特征： 14 个
['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [19]:
def label(row):
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] != 'null':
        td = pd.to_datetime(row['Date'], format='%Y%m%d') - pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'): # 规定时间内
            return 1
    return 0

dfoff['label'] = dfoff.apply(label, axis=1)#运行时间比较久

In [20]:
print(dfoff['label'].value_counts())

 0    988887
-1    701602
 1     64395
Name: label, dtype: int64


In [21]:
dfoff.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,label
0,1439408,2632,,,0,,20160217.0,,1.0,0,...,,0,0,0,0,0,0,0,0,-1
1,1439408,4663,11002.0,150:20,1,20160528.0,,1.0,0.866667,150,...,6.0,1,0,0,0,0,0,1,0,0
2,1439408,2632,8591.0,20:1,0,20160217.0,,1.0,0.95,20,...,3.0,0,0,0,1,0,0,0,0,0
3,1439408,2632,1078.0,20:1,0,20160319.0,,1.0,0.95,20,...,6.0,1,0,0,0,0,0,1,0,0
4,1439408,2632,8591.0,20:1,0,20160613.0,,1.0,0.95,20,...,1.0,0,1,0,0,0,0,0,0,0


In [None]:
#建立线性模型

In [22]:
# data split
df = dfoff[dfoff['label'] != -1].copy()  # 有优惠券（1为正样本，0为负样本）
train = df[(df['Date_received'] < '20160516')].copy()  #选择部分数据
valid = df[(df['Date_received'] >= '20160516') & (df['Date_received'] <= '20160615')].copy()
print('Train Set: \n', train['label'].value_counts())
print('Valid Set: \n', valid['label'].value_counts())

Train Set: 
 0    759172
1     41524
Name: label, dtype: int64
Valid Set: 
 0    229715
1     22871
Name: label, dtype: int64


In [23]:
def check_model(data, predictors):#最关键的 模型建立
    
    classifier = lambda: SGDClassifier(#sklearn 机器学习库里面自带的SGDC分类器-->随机梯度
        loss='log',  # loss function: logistic regression 逻辑回归
        penalty='elasticnet', # L1 & L2 弹性方式、两者结合
        fit_intercept=True,  # 是否存在截距，默认存在
        #max_iter=100,  最大的迭代次数 
        shuffle=True,  # Whether or not the training data should be shuffled after each epoch  每次迭代后是否随机打乱
        n_jobs=1, # The number of processors to use 
        class_weight=None) # Weights associated with classes. If not given, all classes are supposed to have weight one.
                           # 同样的权重
        # 管道机制使得参数集在新数据集（比如测试集）上的重复使用，管道机制实现了对全部步骤的流式化封装和管理。
    model = Pipeline(steps=[
        ('ss', StandardScaler()), # transformer
        ('en', classifier())  # estimator
    ])
 
    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]  # 交叉验证时需要
    }
 
    # StratifiedKFold用法类似Kfold，但是他是分层采样，确保训练集，测试集中各类别样本的比例与原始数据集中相同。
    folder = StratifiedKFold(n_splits=3, shuffle=True)
        # Exhaustive search over specified parameter values for an estimator.
    grid_search = GridSearchCV(  # 网格搜索的方式交叉验证，选择最好的超参数
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1,  # -1 means using all processors
        verbose=1)
    grid_search = grid_search.fit(data[predictors],  # 这个地方是要两层中括号
                                  data['label'])
    
    return grid_search

In [31]:
predictors = original_feature#训练模型
model = check_model(train, predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  2.0min finished


In [32]:
#验证

In [46]:
# valid predict 输出概率
y_valid_pred = model.predict_proba(valid[predictors])
# y_train_pred = model.predict_proba(train[predictors])
valid1 = valid.copy()
# train1 = train.copy()
# valid1.head(5)
valid1['pred_prob'] = y_valid_pred[:, 1]  # n*2 矩阵，代表判断为0和1的概率， 此处代表是正样本的概率
# train1['pred_prob'] = y_train_pred[:, 1]
valid1.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,label,pred_prob
1,1439408,4663,11002,150:20,1,20160528,,1,0.866667,150,...,1,0,0,0,0,0,1,0,0,0.019514
4,1439408,2632,8591,20:1,0,20160613,,1,0.95,20,...,0,1,0,0,0,0,0,0,0,0.097862
6,1439408,2632,8591,20:1,0,20160516,20160613.0,1,0.95,20,...,0,1,0,0,0,0,0,0,0,0.097862
9,2029232,450,1532,30:5,0,20160530,,1,0.833333,30,...,0,1,0,0,0,0,0,0,0,0.093885
10,2029232,6459,12737,20:1,0,20160519,,1,0.95,20,...,0,0,0,0,1,0,0,0,0,0.135123


In [None]:
# avgAUC calculation
vg = valid1.groupby(['Coupon_id'])
aucs = []
for i in vg:
    tmpdf = i[1] 
    if len(tmpdf['label'].unique()) != 2: #label只有一类 则跳过
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))

In [None]:
#测试样本的预测

In [40]:
# test prediction for submission
y_test_pred = model.predict_proba(dftest[predictors])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['Probability'] = y_test_pred[:,1]
dftest1.to_csv('submit1.csv', index=False, header=False)
dftest1.head(5)

Unnamed: 0,User_id,Coupon_id,Date_received,Probability
0,4129537,9983,20160712,0.10553
1,6949378,3429,20160706,0.1509
2,2166529,6928,20160727,0.005434
3,2166529,1808,20160727,0.018255
4,6172162,6500,20160708,0.06568


In [None]:
#保存模型 下次再使用方便

In [47]:
if not os.path.isfile('1_model.pkl'):
    with open('1_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('1_model.pkl', 'rb') as f:
        model = pickle.load(f)

In [None]:
#关于优化 特征工程（商户的信息）2算法（svm）3（模型集成bagging）4参考第一名