In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "."

In [2]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]

In [3]:
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
#dfoff.head(20)


(1160742, 7)
(306313, 6)


In [37]:
#dftest.head(20)


In [4]:
dfoff.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,


In [10]:
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [9]:
## test
t = dfoff.apply(label, axis=1)

type(t)

In [20]:
### test
def zx(x):
    print(type(x),x)
zdf = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
zdf.apply(zx,axis = 1)



<class 'pandas.core.series.Series'> A    4
B    9
Name: 0, dtype: int64
<class 'pandas.core.series.Series'> A    4
B    9
Name: 1, dtype: int64
<class 'pandas.core.series.Series'> A    4
B    9
Name: 2, dtype: int64


0    None
1    None
2    None
dtype: object

In [12]:
#debug
#dfoff

In [19]:
#test
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)


In [20]:
#

In [21]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [22]:
dfoff.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0


In [25]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [29]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

In [77]:
dftest.Distance.value_counts()

0.0     135755
1.0      48858
99.0     36177
10.0     22765
2.0      20236
3.0      12870
4.0       9003
5.0       6376
6.0       4905
7.0       3755
8.0       3007
9.0       2606
Name: Distance, dtype: int64

In [30]:
dfoff.to_csv("doff_feature.csv")
dftest.to_csv("dtest_feature.csv")

In [31]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [32]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [34]:
train.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'discount_rate', 'discount_man',
       'discount_jian', 'discount_type', 'is_train'],
      dtype='object')

In [48]:
def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss = 'modified_huber',
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=4,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

In [49]:
predictors = original_feature
print(predictors)

model = check_model(train, predictors)

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   54.3s finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [50]:
def check_result(model,predictors,valid):
    y_valid_pred = model.predict_proba(valid[predictors])
    valid1 = valid.copy()
    valid1['pred_prob'] = y_valid_pred[:, 1]
    from sklearn.metrics import roc_auc_score, accuracy_score
    auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
    acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
    print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

In [51]:
check_result(model, predictors, valid)

Validation AUC: 0.711, Accuracy: 0.952


  Xt = transform.transform(Xt)


In [52]:
def gen_report(dftest,model,predictors,report_csv):
    targetset = dftest.copy()
    print(targetset.shape)
    targetset = targetset[~targetset.Coupon_id.isna()]
    targetset.reset_index(drop=True, inplace=True)
    testset = targetset[predictors].copy()

    y_test_pred = model.predict_proba(testset[predictors])
    test1 = testset.copy()
    test1['pred_prob'] = y_test_pred[:, 1]
    print(test1.shape)
    output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
    print(output.shape)

    output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
    output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
    output.reset_index(drop=True, inplace=True)
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
    out = output.groupby("uid", as_index=False).mean()
    out = out[["uid", "pred_prob"]]
    out.columns = ["uid", "label"]
# out.to_csv("baseline_example.csv", header=["uid", "label"], index=False) # submission format
    out.head()
    
    out.to_csv(report_csv, header=["uid", "label"], index=False) # submission format


In [54]:
gen_report(dftest,model,predictors,"svm.csv")

(306313, 19)
(306313, 15)
(306313, 4)


  Xt = transform.transform(Xt)


In [73]:
from sklearn.ensemble import RandomForestClassifier
def check_model_forest(data, predictors):
    clf = RandomForestClassifier(n_estimators=50, max_depth=10,
                             random_state=0)
    rt = clf.fit(data[predictors], 
                                  data['label'])
    return rt

In [74]:
predictors = original_feature
print(predictors)

model2 = check_model_forest(train, predictors)

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [75]:
check_result(model2, predictors, valid)

Validation AUC: 0.790, Accuracy: 0.952


In [76]:
gen_report(dftest,model2,predictors,"rt.csv")

(306313, 19)
(306313, 15)
(306313, 4)


In [79]:
def feature_check(f,train,valid):
    predictors = f
    print(predictors)

    model2 = check_model_forest(train, predictors)
    check_result(model2, predictors, valid)


In [87]:
f= ['discount_rate', 'discount_type', 'discount_man', 'discount_jian','Distance', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']
feature_check(f, train, valid)

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']
Validation AUC: 0.790, Accuracy: 0.952


In [88]:
train_dist_ = pd.get_dummies(train.Distance)

In [97]:
train_dist = train.copy()
dist_index = list(map(lambda x:"dis_"+x, train_dist_.columns.astype(str).tolist()))

In [100]:
train_dist[dist_index] = train_dist_
train_dist.describe()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Date,label,weekday,weekday_type,weekday_1,...,dis_2.0,dis_3.0,dis_4.0,dis_5.0,dis_6.0,dis_7.0,dis_8.0,dis_9.0,dis_10.0,dis_99.0
count,667753.0,667753.0,667753.0,667753.0,667753.0,38337.0,667753.0,667753.0,667753.0,667753.0,...,667753.0,667753.0,667753.0,667753.0,667753.0,667753.0,667753.0,667753.0,667753.0,667753.0
mean,3689158.0,4123.30348,7116.815391,12.629943,20160200.0,20160270.0,0.048629,4.291174,0.0,0.163909,...,0.081051,0.056229,0.042269,0.033071,0.026598,0.021586,0.018113,0.015345,0.206522,0.092727
std,2125857.0,2310.651767,4274.320549,27.871774,96.05601,95.69394,0.215091,2.12133,0.0,0.370194,...,0.272914,0.230363,0.201202,0.178821,0.160906,0.145327,0.13336,0.122923,0.40481,0.29005
min,4.0,8.0,4.0,0.0,20160100.0,20160100.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1841429.0,2146.0,3200.0,0.0,20160130.0,20160210.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3696404.0,3532.0,7665.0,3.0,20160130.0,20160230.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5533935.0,6424.0,10823.0,10.0,20160220.0,20160330.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7361032.0,8850.0,14045.0,99.0,20160420.0,20160630.0,1.0,7.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [105]:
dftest.Distance.value_counts()

0.0     135755
1.0      48858
99.0     36177
10.0     22765
2.0      20236
3.0      12870
4.0       9003
5.0       6376
6.0       4905
7.0       3755
8.0       3007
9.0       2606
Name: Distance, dtype: int64

In [106]:
valid_dist_ = pd.get_dummies(valid.Distance)
valid_dist = valid.copy()
valid_index = list(map(lambda x:"dis_"+x, valid_dist_.columns.astype(str).tolist()))
valid_dist[valid_index] = valid_dist_

In [114]:
f= ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']
c= f+ valid_index
c

['discount_rate',
 'discount_type',
 'discount_man',
 'discount_jian',
 'weekday_type',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6',
 'weekday_7',
 'dis_0.0',
 'dis_1.0',
 'dis_2.0',
 'dis_3.0',
 'dis_4.0',
 'dis_5.0',
 'dis_6.0',
 'dis_7.0',
 'dis_8.0',
 'dis_9.0',
 'dis_10.0',
 'dis_99.0']

In [115]:
feature_check(c, train_dist, valid_dist)

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7', 'dis_0.0', 'dis_1.0', 'dis_2.0', 'dis_3.0', 'dis_4.0', 'dis_5.0', 'dis_6.0', 'dis_7.0', 'dis_8.0', 'dis_9.0', 'dis_10.0', 'dis_99.0']
Validation AUC: 0.789, Accuracy: 0.952


In [120]:
dftest_dist = dftest.copy()
test_df_dist = pd.get_dummies(dftest_dist.Distance)
dftest_dist[valid_index] = test_df_dist


In [121]:
model2 = check_model_forest(train_dist, c)
check_result(model2, c, valid_dist)

gen_report(dftest_dist,model2,c,"rt_dist.csv")

Validation AUC: 0.789, Accuracy: 0.952
(306313, 31)
(306313, 25)
(306313, 4)


In [None]:
dftest.Distance.value_counts()

In [None]:
from sklearn.svm import SVC
def check_svc(data, predictors):
    clf = SVC()
    clf = SVC(probability=True)
    rt = clf.fit(data[predictors], 
                                  data['label'])
    return rt
mc = check_svc(train_dist, c)




In [125]:
check_result(mc, c, valid_dist)

AttributeError: predict_proba is not available when  probability=False