In [5]:
import os
import numpy as np
import pandas as pd
import datetime
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_path = r"C:\Users\user\Documents\GitHub\2nd-ML100Days\homework\Day_051_HW.ipynb\ml100marathon-02-01"

## 在字串前加上 r或 R表示該字串是非轉義的原始字串。
## 在字串前加上 u或 U表示該字串是unicode字串。

In [6]:
dfoff = pd.read_csv(os.path.join(DATA_path,'train_offline.csv'))
#dfoff = dfoff.iloc[:5000]
dftest = pd.read_csv(os.path.join(DATA_path,'test_offline.csv'))
#dftest = dftest.iloc[:5000]
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(5)

(1160742, 7)
(306313, 9)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,


In [7]:
DistanceFilling_UM = dfoff.groupby(['User_id','Merchant_id'])['Distance'].mean().reset_index()
DistanceFilling_UM.columns = ['User_id','Merchant_id','DistanceFilling_UM']
DistanceFilling_U = dfoff.groupby(['User_id'])['Distance'].mean().reset_index()
DistanceFilling_U.columns = ['User_id','DistanceFilling_U']
DistanceFilling_M = dfoff.groupby(['Merchant_id'])['Distance'].mean().reset_index()
DistanceFilling_M.columns = ['Merchant_id','DistanceFilling_M']

DistanceFilling_UM = dftest.groupby(['User_id','Merchant_id'])['Distance'].mean().reset_index()
DistanceFilling_UM.columns = ['User_id','Merchant_id','DistanceFilling_UM']
DistanceFilling_U = dftest.groupby(['User_id'])['Distance'].mean().reset_index()
DistanceFilling_U.columns = ['User_id','DistanceFilling_U']
DistanceFilling_M = dftest.groupby(['Merchant_id'])['Distance'].mean().reset_index()
DistanceFilling_M.columns = ['Merchant_id','DistanceFilling_M']

dfoff = pd.merge(dfoff,DistanceFilling_UM,on = ['User_id','Merchant_id'], how = 'left')
dfoff = pd.merge(dfoff,DistanceFilling_U,on = ['User_id'], how = 'left')
dfoff = pd.merge(dfoff,DistanceFilling_M,on = ['Merchant_id'], how = 'left')


dftest = pd.merge(dftest,DistanceFilling_UM,on = ['User_id','Merchant_id'], how = 'left')
dftest = pd.merge(dftest,DistanceFilling_U,on = ['User_id'], how = 'left')
dftest = pd.merge(dftest,DistanceFilling_M,on = ['Merchant_id'], how = 'left')


In [8]:
def DistanceMissingFill(data):
    if np.isnan(data['Distance']):
        if not np.isnan(data['DistanceFilling_UM']):
            return int(data['DistanceFilling_UM'])
        elif not np.isnan(data['DistanceFilling_U']):
            return int(data['DistanceFilling_U'])
        elif not  np.isnan(data['DistanceFilling_M']):
            return int(data['DistanceFilling_M'])
    return data['Distance']

dfoff['Distance'] = dfoff.apply(DistanceMissingFill, axis = 1)
dftest['Distance'] = dftest.apply(DistanceMissingFill, axis = 1)

In [9]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return 0
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

0    1124438
1      36304
Name: label, dtype: int64

In [36]:
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)
# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [38]:
# 確認消費時間是否為 4 個假期
def check_new_year(x):
    start = datetime.date(2016, 2, 3)
    end = datetime.date(2016, 2, 13)
    if (np.isnan(x)) or (x==-1):
        return 0
    else:
        if start <= pd.to_datetime(x, format = "%Y%m%d") <= end:
            return 1
        else:
            return 0
def check_child(x):
    start = datetime.date(2016, 4, 2)
    end = datetime.date(2016, 4, 5)
    if (np.isnan(x)) or (x==-1):
        return 0
    else:
        if start <= pd.to_datetime(x, format = "%Y%m%d") <= end:
            return 1
        else:
            return 0
def check_mother(x):
    start = datetime.date(2016, 5, 1)
    end = datetime.date(2016, 5, 8)
    if (np.isnan(x)) or (x==-1):
        return 0
    else:
        if start <= pd.to_datetime(x, format = "%Y%m%d") <= end:
            return 1
        else:
            return 0       

dfoff['new_year'] = dfoff['Date_received'].apply(check_new_year)
dfoff['child'] = dfoff['Date_received'].apply(check_child)
dfoff['mother'] = dfoff['Date_received'].apply(check_mother)



TypeError: Cannot compare type 'Timestamp' with type 'date'

In [12]:
dftest['new_year'] = dftest['Date_received'].apply(check_new_year)
dftest['child'] = dftest['Date_received'].apply(check_child)
dftest['mother'] = dftest['Date_received'].apply(check_mother)

TypeError: Cannot compare type 'Timestamp' with type 'date'

In [13]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if np.isnan(float(row.replace(':',''))):
        return 0
    else:
        return 1

def convertRate(row):
    """Convert discount to rate"""
    if np.isnan(float(row.replace(':',''))):
        return 1
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0
def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0
    
def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    
            
            
    
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

In [14]:
cnn_feature = ['discount_rate',
                    'discount_type',
                    'discount_jian',
                    'discount_man',
                    'Distance', 
                    'weekday_type',
                    'new_year',
                    'dragon',
                    'mother',
                    'child',]

In [15]:
training_X = dfoff[cnn_feature].values
training_Y = dfoff['label'].values

KeyError: "['child', 'mother', 'dragon', 'new_year'] not in index"

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
from keras import metrics

model = Sequential()
model.add(Dense(128, input_dim=10, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer = 'adam',
              metrics = [metrics.categorical_accuracy])

es = EarlyStopping(monitor='val_loss', patience=5)

model.fit(training_X, to_categorical(training_Y),
          epochs=100,
          batch_size=1024,
          shuffle=True,
          validation_split=0.5,
          callbacks=[es])

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


NameError: name 'training_X' is not defined

In [17]:

test_X = dftest[cnn_feature].values
results = model.predict(test_X)


KeyError: "['child', 'mother', 'dragon', 'new_year'] not in index"

In [18]:
targetset = dftest.copy()
testset = targetset[cnn_feature].copy()
test1 = testset.copy()

test1['pred_prob'] = results[:, 1]

KeyError: "['child', 'mother', 'dragon', 'new_year'] not in index"

In [19]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

NameError: name 'test1' is not defined

In [20]:
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("cnn_adam_2.csv", header=["uid", "label"], index=False) # submission format
out.head()

NameError: name 'output' is not defined

In [21]:

def tuneParamsRandom(classifier, params, train_x, train_y, cv = 5):
    rs = RandomizedSearchCV(classifier, params, n_iter = 20, scoring = 'roc_auc', n_jobs = -1, verbose = 0, cv=cv)
    rs.fit(train_x, train_y)
    
    return rs.best_params_, abs(rs.best_score_)

In [22]:
etParams = {'n_estimators':np.arange(100,1100,50), 'max_depth':np.arange(3,11,2),
            'min_samples_leaf':np.arange(1,6,2) }

rfParams = {'n_estimators':np.arange(100,1100,50), 'max_depth':np.arange(3,11,2),
            'min_samples_split':np.arange(2,20,2), 'min_samples_leaf':np.arange(1,6,2) }

abParams = {'n_estimators':np.arange(100,1100,50),'learning_rate':np.arange(0.01,0.2,0.05)}


lgbmcParams = {'n_estimators' : np.arange(400,1200,100), 'learning_rate' : np.arange(0.01,0.1,0.02),
               'num_leaves' : np.arange(2,48,4), 'max_depth' : np.arange(3,10,2),
               'subsample' : np.arange(0.3,0.8,0.1) }

gbcParams = {'n_estimators' : np.arange(400,1200,100) , 'learning_rate' : np.arange(0.01,0.1,0.02) 
             , 'min_samples_split' : np.arange(2,30,5), 'min_samples_leaf' : np.arange(2,32,4),
              'max_depth' : np.arange(3,8,2), 'subsample' : np.arange(0.3,0.8,0.1)}

xgbParams = {'max_depth':np.arange(3,8,1),'learning_rate':np.arange(0.01,0.1,0.02)
             ,'n_estimator': np.arange(1000,3000,100),'gamma':np.arange(0.01,0.1,0.02)}

lrParams = {'C':np.arange(0.01,1,0.05), 'max_iter' : np.arange(100,500,100)}

In [23]:
train_label = dfoff.label
train_x = dfoff[cnn_feature]

train_label.index = train_x.index
tuneSet = pd.concat([train_x,train_label], axis = 1)
trainSet = tuneSet.sample(frac=0.5)
trainSet.shape

KeyError: "['child', 'mother', 'dragon', 'new_year'] not in index"

In [24]:
x_train, x_test, y_train, y_test = train_test_split(trainSet[trainSet.columns[trainSet.columns != 'label']],trainSet['label'], test_size = 0.3, random_state = 1234)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

NameError: name 'trainSet' is not defined

In [25]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from mlxtend.regressor import StackingRegressor
import sklearn.metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV


ModuleNotFoundError: No module named 'xgboost'

In [None]:
gbc = GradientBoostingClassifier(max_features = 'sqrt')
gbc_best_Params,gbc_best_score = tuneParamsRandom(gbc,gbcParams,x_train,y_train)
print("GradientBoostingClassifier:",gbc_best_Params,gbc_best_score)


gbc = GradientBoostingClassifier(**gbc_best_Params,max_features = 'sqrt')
gbc.fit(train_x, train_label)
gbc_pred = gbc.predict_proba(test_X)[:,1]

importances = pd.DataFrame(gbc.feature_importances_, columns = ['importances'], index = train_x.columns)
importances.importances.sort_values(ascending = False)

In [26]:
lgbmc_best_Params, lgbmc_best_score = tuneParamsRandom(LGBMClassifier(),lgbmcParams,x_train,y_train)
print("LGBMClassifier:",lgbmc_best_Params,lgbmc_best_score)


lgbmc = LGBMClassifier(**lgbmc_best_Params)
lgbmc.fit(train_x,train_label)
lgbmc_pred = lgbmc.predict_proba(test_X)[:,1]

NameError: name 'LGBMClassifier' is not defined

In [27]:
xgb_best_Params,xgb_best_score = tuneParamsRandom(XGBClassifier(),xgbParams, x_train, y_train)
print("XGBClassifier:",xgb_best_Params,xgb_best_score)

xgb = XGBClassifier(**xgb_best_Params)
xgb.fit(train_x,train_label)
xgb_pred = xgb.predict_proba(test_X)[:,1]

NameError: name 'XGBClassifier' is not defined

In [28]:
test_X = dftest[dfoff[cnn_feature].columns]


KeyError: "['child', 'mother', 'dragon', 'new_year'] not in index"

In [29]:
xgb = XGBClassifier(**xgb_best_Params)
xgb.fit(train_x,train_label)
xgb_pred = xgb.predict_proba(test_X)[:,1]

NameError: name 'XGBClassifier' is not defined

In [30]:
blending_pred = gbc_pred * 0.34 + lgbmc_pred * 0.33 + xgb_pred * 0.33

NameError: name 'gbc_pred' is not defined

In [31]:
from mlxtend.classifier import StackingClassifier
gbc = GradientBoostingClassifier(**gbc_best_Params)
lgbmc = LGBMClassifier(**lgbmc_best_Params)
xgb = XGBClassifier(**xgb_best_Params)

#gbc = GradientBoostingClassifier()
#lgbmc = LGBMClassifier()
#xgb = XGBClassifier()

meta_estimator = LGBMClassifier()
stacking = StackingClassifier(classifiers = [gbc,lgbmc, xgb], meta_classifier = meta_estimator)
stacking.fit(x_train,y_train)
stacking_pred = stacking.predict_proba(test_X)[:,1]

NameError: name 'gbc_best_Params' is not defined

In [32]:
test1['pred_prob'] = blending_pred

NameError: name 'blending_pred' is not defined

In [33]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

NameError: name 'test1' is not defined

In [34]:
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("blending_pred.csv", header=["uid", "label"], index=False) # submission format
out.head()

NameError: name 'output' is not defined