In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier               
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from mlxtend.regressor import StackingRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
DATA_ROOT = "ml100marathon-02-01/"
dftrain = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
dftotal = pd.concat([dftrain, dftest])
print(dftrain.shape)
print(dftest.shape)
print(dftotal.shape)
# print(dfoff.head(20))

(1160742, 7)
(306313, 6)
(1467055, 7)


In [5]:
def CheckMissingVals(data):
    for col in data.columns:
        if np.sum(data[col].isnull()) != 0:
            print(f' Missing values in {col} : {np.sum(data[col].isnull())}')

print("Count of missing data in training dataset: ")
CheckMissingVals(dftrain)
print('\n')
print("Count of missing data in testing dataset: ")
CheckMissingVals(dftest)
print('\n')
print("Count of missing data in total dataset: ")
CheckMissingVals(dftotal)

Count of missing data in training dataset: 
 Missing values in Coupon_id : 413773
 Missing values in Discount_rate : 413773


 Missing values in Distance : 69826
 Missing values in Date_received : 413773
 Missing values in Date : 704033


Count of missing data in testing dataset: 
 Missing values in Distance : 36177


Count of missing data in total dataset: 
 Missing values in Coupon_id : 413773


 Missing values in Date : 1010346
 Missing values in Date_received : 413773


 Missing values in Discount_rate : 413773
 Missing values in Distance : 106003


In [6]:
DistanceFilling_UM = dftotal.groupby(['User_id','Merchant_id'])['Distance'].mean().reset_index()
DistanceFilling_UM.columns = ['User_id','Merchant_id','DistanceFilling_UM']
DistanceFilling_U = dftotal.groupby(['User_id'])['Distance'].mean().reset_index()
DistanceFilling_U.columns = ['User_id','DistanceFilling_U']
DistanceFilling_M = dftotal.groupby(['Merchant_id'])['Distance'].mean().reset_index()
DistanceFilling_M.columns = ['Merchant_id','DistanceFilling_M']

dftotal = pd.merge(dftotal,DistanceFilling_UM,on = ['User_id','Merchant_id'], how = 'left')
dftotal = pd.merge(dftotal,DistanceFilling_U,on = ['User_id'], how = 'left')
dftotal = pd.merge(dftotal,DistanceFilling_M,on = ['Merchant_id'], how = 'left')
print(dftotal.head(20))

    Coupon_id        Date  Date_received Discount_rate  Distance  Merchant_id  \
0         NaN  20160217.0            NaN           NaN       0.0         2632   
1      8591.0         NaN     20160217.0          20:1       0.0         2632   
2      1078.0         NaN     20160319.0          20:1       0.0         2632   
3      7610.0         NaN     20160429.0        200:20       0.0         3381   
4     11951.0         NaN     20160129.0        200:20       1.0         3381   
5      9776.0         NaN     20160129.0          10:5       2.0         3381   
6     12034.0         NaN     20160207.0        100:10       NaN         2099   
7      5054.0         NaN     20160421.0        200:30      10.0         1569   
8      7802.0         NaN     20160130.0        200:20      10.0         4833   
9      7610.0         NaN     20160412.0        200:20       2.0         3381   
10        NaN  20160327.0            NaN           NaN       0.0         8390   
11     7531.0         NaN   

In [7]:
def DistanceMissingFill(data):
    if np.isnan(data['Distance']):
        if not np.isnan(data['DistanceFilling_UM']):
            return int(data['DistanceFilling_UM'])
        elif not np.isnan(data['DistanceFilling_U']):
            return int(data['DistanceFilling_U'])
        elif not  np.isnan(data['DistanceFilling_M']):
            return int(data['DistanceFilling_M'])
    return data['Distance']

dftotal['Distance'] = dftotal.apply(DistanceMissingFill, axis = 1)

In [9]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dftotal["label"] = dftotal.apply(label, axis=1)
dftotal["label"].value_counts()

 0    1016978
-1     413773
 1      36304
Name: label, dtype: int64

In [13]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

# dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
# dftest['weekday'] = dftest['Date_received'].apply(getWeekday)
# 
# # weekday_type (weekend = 1)
# dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
# dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

dftotal['weekday'] = dftotal['Date_received'].apply(getWeekday)
dftotal['weekday_type'] = dftotal['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 )

In [14]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

# tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))5
# tmpdf.columns = weekdaycols
# dfoff[weekdaycols] = tmpdf
# 
# tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
# tmpdf.columns = weekdaycols
# dftest[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftotal['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftotal[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [15]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

# dfoff = processData(dfoff)
# dftest = processData(dftest)
dftotal = processData(dftotal)

In [22]:
# dftotal = dftotal.drop(['DistanceFilling_UM','DistanceFilling_U','DistanceFilling_M'], axis = 1)
dftrain = dftotal[:len(dftrain)]
dftest = dftotal[len(dftrain):].drop(['Date'], axis = 1)

print("Count of missing data in training dataset: ")
CheckMissingVals(dftrain)
print('\n')
print("Count of missing data in testing dataset: ")
CheckMissingVals(dftest)

print('\n')
print("Shape of training set -", dftrain.shape)
print("Shape of testing set -", dftest.shape)
# print(dftrain.shape)
# print(dftest.shape)

Count of missing data in training dataset: 
 Missing values in Coupon_id : 413773
 Missing values in Date : 704033
 Missing values in Date_received : 413773


 Missing values in Discount_rate : 413773
 Missing values in weekday : 413773


 Missing values in discount_rate : 413773


Count of missing data in testing dataset: 


Shape of training set - (1160742, 21)
Shape of testing set - (306313, 20)


In [24]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dftrain[dftrain['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

In [27]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)
print(dftest.head())

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']
         Coupon_id  Date_received Discount_rate  Distance  Merchant_id  \
1160742    11002.0     20160528.0        150:20       1.0         4663   
1160743     8591.0     20160613.0          20:1       0.0         2632   
1160744     8591.0     20160516.0          20:1       0.0         2632   
1160745     1532.0     20160530.0          30:5       0.0          450   
1160746    12737.0     20160519.0          20:1       0.0         6459   

         User_id  label  weekday  weekday_type  weekday_1  weekday_2  \
1160742  1439408      0      6.0             0          0          0   
1160743  1439408      0      1.0             0          1          0   
1160744  1439408      0      1.0             0          1          0   
1160745  2029232      0      1.0             0          1         

In [30]:
# predictors = original_feature
# print(predictors)
# 
# def check_model(data, predictors):
#     
#     # classifier = lambda: SGDClassifier(
#     #     loss='log', 
#     #     penalty='elasticnet', 
#     #     fit_intercept=True, 
#     #     max_iter=100, 
#     #     shuffle=True, 
#     #     n_jobs=1,
#     #     class_weight=None)
#     
#     classifier = lambda: RandomForestClassifier(
#         n_jobs=1,
#         class_weight=None)
# 
#     model = Pipeline(steps=[
#         ('ss', StandardScaler()),
#         ('en', classifier())
#     ])
# 
#     # parameters = [{
#     #     'en__alpha': [ 0.001, 0.01, 0.1],
#     #     'en__l1_ratio': [ 0.001, 0.01, 0.1]
#     # },
#     # {
#     #     'bootstrap': [False],
#     #     'en__alpha': [ 0.001, 0.01, 0.1],
#     #     'en__l1_ratio': [ 0.001, 0.01, 0.1]
#     # }]
#     
#     parameters = [
#     {
#         'bootstrap': [False]
#     }]
# 
#     folder = StratifiedKFold(n_splits=3, shuffle=True)
#     
#     grid_search = GridSearchCV(
#         model, 
#         parameters, 
#         cv=folder, 
#         n_jobs=-1, 
#         verbose=1)
#     grid_search = grid_search.fit(data[predictors], 
#                                   data['label'])
#     
#     return grid_search

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [31]:
# model = check_model(train, predictors)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


ValueError: Invalid parameter bootstrap for estimator Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('en', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
          ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [33]:
# # RandomForestClassifier
# predictors = original_feature
# print(predictors)
# param_grid = [
#         {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
#         {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
# ]
# forest_reg = RandomForestClassifier()
# model = GridSearchCV(forest_reg, param_grid, cv=5,
#                             scoring='neg_mean_squared_error',
#                             return_train_score=True)
# model.fit(train[predictors], train['label'])

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [45]:
# XGBClassifier
predictors = original_feature
print(predictors)
param_grid = [
        {'n_estimators': [3, 10, 30], 
         'max_features': [8, 10, 12, 14], 
         'learning_rate': [0.01, 0.02, 0.03], 
         'max_depth': [4, 5, 6]}
]
xgb_model = XGBClassifier()
model = GridSearchCV(xgb_model, 
                     param_grid, 
                     cv=5,
                     scoring='roc_auc',
                     return_train_score=True)
model.fit(train[predictors], train['label'])

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [36]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

In [37]:
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.771, Accuracy: 0.951


In [39]:
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 20)


(306313, 15)


In [40]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [42]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("homework/midterm_submission.csv", header=["uid", "label"], index=False) # submission format
print(out.head())

                     uid     label
0  1000020_2705_20160519  0.216529
1  1000020_8192_20160513  0.204398
2  1000065_1455_20160527  0.098958
3  1000085_8067_20160513  0.136849
4  1000086_2418_20160613  0.157008
