In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from xgboost import plot_importance
from sklearn.preprocessing import MinMaxScaler
# Install a blanket "ignore" filter so no warnings are shown by later cells.
# NOTE(review): this also hides deprecation / data-quality warnings emitted by
# the libraries imported above; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [10]:
# Read the five CSV inputs; the order of the paths matches the order of the
# names on the left-hand side. Paths are relative to the working directory.
# NOTE(review): `eval` shadows Python's built-in eval(); the name is kept
# because later cells refer to it.
user, item, train, eval, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/user_data.csv",
        "./data/item_data.csv",
        "./data/data_train.csv",
        "./data/data_eval.csv",
        "./data/data_test.csv",
    )
]

In [None]:
def get_xy(data, predict_date, behavior=None):
    '''
    Split a candidate frame into features and 0/1 labels for one day.

    A (user_id, item_id) pair gets whether_buy = 1 when the behaviour log
    holds at least one row for that pair with behavior_type == 4 whose
    "time" lies in [predict_date, predict_date + 1 day); every other pair
    gets 0.

    input:
        data: pd.DataFrame containing user_id, item_id and item_category;
            all remaining columns are returned as features
        predict_date: str, the day to label, written as "YYYY-MM-DD"
        behavior: optional pd.DataFrame with the columns user_id, item_id,
            time and behavior_type; when omitted the notebook-level `user`
            frame is used
    output:
        (x_train, y_train): feature pd.DataFrame and label pd.Series, both
            in the same row order as data
    '''
    assert isinstance(data, pd.DataFrame)
    assert isinstance(predict_date, str)

    if behavior is None:
        # Fall back to the behaviour log loaded at notebook level.
        behavior = user

    # Exclusive upper bound of the labelling window. Calendar arithmetic is
    # required here: bumping the last character of the string turns
    # "2014-12-19" into "2014-12-110" and yields non-existent dates at month
    # ends. The result is formatted back to a string because the window is
    # applied by comparing against the "time" column.
    next_day = pd.Timestamp(predict_date) + pd.Timedelta(days=1)
    end_date = next_day.strftime("%Y-%m-%d")

    in_window = (behavior["time"] >= predict_date) & (behavior["time"] < end_date)
    is_type_4 = behavior["behavior_type"] == 4
    labels = (
        behavior.loc[in_window & is_type_4, ["user_id", "item_id"]]
        .drop_duplicates()  # one label row per pair keeps the left merge 1:1
        .assign(whether_buy=1)
    )

    # Left merge keeps every candidate row, in order; unmatched pairs become 0.
    data = pd.merge(data, labels, how="left", on=["user_id", "item_id"])
    data["whether_buy"] = data["whether_buy"].fillna(0)

    x_train = data.drop(["user_id", "item_id", "item_category", "whether_buy"], axis=1)
    y_train = data["whether_buy"]
    return (x_train, y_train)

# Label the training candidates against 2014-12-17 and the evaluation
# candidates against the following day.
x_train, y_train = get_xy(data=train, predict_date="2014-12-17")
x_eval, y_eval = get_xy(data=eval, predict_date="2014-12-18")
# The test frame gets no labels here; only the identifier columns are removed.
d_test = test.drop(columns=["user_id", "item_id", "item_category"])

In [None]:
# Wrap the feature frames (and labels, where available) in DMatrix objects
# for the native xgb.train / Booster.predict API.
xgb_train = xgb.DMatrix(data=x_train, label=y_train)
xgb_eval = xgb.DMatrix(data=x_eval, label=y_eval)
xgb_test = xgb.DMatrix(data=d_test)
# The original cell misspelled this name as `xbg_test`; keep the old spelling
# as an alias so any cell that still refers to it continues to work.
xbg_test = xgb_test

In [None]:
# Exhaustive search over 3 x 3 x 3 x 3 = 81 hyper-parameter combinations.
param_test1 = {
    'max_depth': [4, 5, 6],
    'eta': [0.1, 0.01, 0.001],
    'lambda': [6, 8, 10],
    'gamma': [0, 0.1, 0.2],
}
gsearch = GridSearchCV(
    estimator=xgb.XGBClassifier(objective='rank:pairwise', eval_metric='auc'),
    param_grid=param_test1,
    # Without an explicit scorer GridSearchCV falls back to the estimator's
    # own score(), which for a classifier is accuracy. The model is trained
    # and monitored on AUC, so rank the candidates on AUC as well.
    scoring='roc_auc',
)
gsearch.fit(x_train, y_train)

In [None]:
# Booster configuration passed to xgb.train, assembled in groups.
# Objective and the metric reported for the watch list.
params = {'objective': 'rank:pairwise', 'eval_metric': 'auc'}
# Split / leaf constraints and regularisation.
params.update({'gamma': 0.1, 'min_child_weight': 1.1, 'max_depth': 6, 'lambda': 10})
# Row and column sampling ratios.
params.update({'subsample': 0.7, 'colsample_bytree': 0.7})
# Step size and tree construction method.
params.update({'eta': 0.01, 'tree_method': 'exact'})

# Container handed to xgb.train as evals_result; starts out empty.
res = dict()

In [None]:
# Data sets whose metric is reported each boosting round, with display names.
watch_list = [
    (xgb_train, 'train'),
    (xgb_eval, 'validate'),
]

# Train for at most 1000 rounds with an early-stopping patience of 100 rounds;
# the per-round metric history is written into `res`.
model_xgb = xgb.train(
    params=params,
    dtrain=xgb_train,
    num_boost_round=1000,
    early_stopping_rounds=100,
    evals=watch_list,
    evals_result=res,
)

In [None]:
# Overlay the per-round AUC recorded in `res` for both watch-list entries.
for split_name in ('train', 'validate'):
    plt.plot(res[split_name]['auc'], label=split_name)
plt.legend()
plt.show()

In [None]:
# Draw the feature-importance chart for the trained booster and render it.
plot_importance(booster=model_xgb)
plt.show()

In [None]:
# Retrain with a fixed number of rounds taken from the early-stopped run.
# best_iteration is the 0-based index of the best round, so the number of
# rounds needed to reach it is best_iteration + 1.
# NOTE(review): this cell overwrites `model_xgb`, so it can only be run once
# per early-stopped model; re-run the training cell before re-running this one.
best_num_rounds = model_xgb.best_iteration + 1
model_xgb = xgb.train(params, xgb_train, num_boost_round=best_num_rounds)

In [None]:
# Identifier columns for the scored rows. They are taken from the evaluation
# feature frame itself: x_eval was derived from `eval` row for row (left merge
# against de-duplicated labels), so the scores below line up with these rows
# by construction. Rebuilding the identifiers from `user` only works if that
# frame happens to have the same length and order as data_eval.csv.
eval = eval[["user_id", "item_id", "item_category"]].copy()

raw_scores = model_xgb.predict(xgb.DMatrix(x_eval))
assert len(raw_scores) == len(eval), "scores and identifier rows are misaligned"

# Unscaled scores as an (n, 1) column, the shape MinMaxScaler expects.
eval_val = raw_scores.reshape(-1, 1)
# Rescale to [0, 1]; flatten the (n, 1) result before storing it as a column.
eval["pred"] = MinMaxScaler().fit_transform(eval_val).ravel()

In [None]:
# Rows scoring strictly above the score at rank position `top_k` (0-based, in
# descending order) are predicted positive, i.e. at most the top_k best rows.
top_k = 500
ranked_scores = eval["pred"].sort_values(ascending=False)
# Clamp the position so frames with top_k rows or fewer do not raise; the
# threshold is read from the Series directly instead of via positional [0].
thres = ranked_scores.iloc[min(top_k, len(ranked_scores) - 1)]

# Vectorised equivalent of the element-wise "<= thres -> 0 else 1" loop.
y_pred = (eval["pred"] > thres).astype(int).tolist()
# Works whether y_eval is still a Series or already a list (cell re-run).
y_eval = pd.Series(y_eval).tolist()

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and
# binary F1 are unaffected by swapping them, but the confusion matrix is
# transposed, so pass the true labels first: rows = true, columns = predicted.
acc = accuracy_score(y_eval, y_pred)
f1 = f1_score(y_eval, y_pred)
print("accuracy score: {}".format(acc))
print("F1 score : {}".format(f1))
confusion_matrix(y_eval, y_pred)