In [None]:
import engines
from utils import *
import pandas as pd
import numpy as np
import gzip
import io
np.random.seed(2016)

def make_submission(f, Y_test, C):
    """Write a submission file and return the top-ranked product indices.

    A header line ``ncodpers,added_products`` is written first, followed by
    one line per ``(customer, scores)`` pair taken from ``zip(C, Y_test)``.
    Every line holds the customer id (cast with ``int``) and the names of the
    seven highest-scoring products, separated by spaces. All text is encoded
    as UTF-8 before being written, so ``f`` must be a binary file-like object.

    ``products`` is read from the enclosing module scope (presumably supplied
    by ``from utils import *`` -- confirm).

    Parameters
    ----------
    f : binary file-like object
        Destination of the submission; it is written to but not closed here.
    Y_test : iterable of score sequences
        One score per entry of ``products`` for every customer.
    C : iterable
        Customer identifiers aligned with ``Y_test``.

    Returns
    -------
    list of list of int
        For each customer, the indices (positions in ``products``) of the
        seven best-scoring products, best first.
    """
    top_indices = []
    with Timer("make submission"):
        f.write("ncodpers,added_products\n".encode('utf-8'))
        for customer, scores in zip(C, Y_test):
            # (score, product name, product index), highest score first.
            # sorted() is stable, so equal scores keep their product order.
            ranked = sorted(
                zip(scores, products, range(len(products))),
                key=lambda triple: triple[0],
                reverse=True,
            )[:7]
            top_indices.append([index for _, _, index in ranked])
            names = " ".join([name for _, name, _ in ranked])
            f.write(("%s,%s\n" % (int(customer), names)).encode('utf-8'))
    return top_indices


# Uses str_date as the test month and all data before it as training data.
def train_predict(all_df, features, prod_features, str_date, cv):
    """Train on purchases made before ``str_date`` and predict that month.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Feature table. The code reads the columns ``int_date``, ``ncodpers``,
        ``fecha_dato``, every name in ``products`` and, for each product, the
        matching ``<product>_prev1`` column.
    features : sequence of str
        Column names handed to ``engines.xgboost`` as model features.
    prod_features : sequence of str
        Columns whose values are subtracted from the predicted scores
        (the lag-1 product columns, according to the original comment).
    str_date : str
        Test month; converted with ``date_to_int`` and compared to
        ``int_date``.
    cv : bool
        When true the submission is written to an in-memory buffer and MAP@7
        is printed; when false it is written to
        ``output/8th.<str_date>.xgb.csv.gz``.

    Returns
    -------
    None
    """
    test_date = date_to_int(str_date)
    train_df = all_df[all_df.int_date < test_date]
    # Explicit copy: new columns are added to test_df below.
    test_df = all_df[all_df.int_date == test_date].copy()
    train_columns = train_df.columns.values.tolist()
    print(sorted(set(train_columns))) # print colnames
    print(len(train_columns), len(set(train_columns))) # check duplicate
    print(len(features),len(set(features))) # check duplicate

    # Keep only training rows where a product is held now but was not held in
    # the previous month (a purchase); the product index becomes the single
    # multi-class target. A row with several purchases appears once per
    # purchased product.
    X = []
    Y = []
    for i, prod in enumerate(products):
        prev = prod + "_prev1"
        prX = train_df[(train_df[prod] == 1) & (train_df[prev] == 0)] # select those who purchased a product
        prY = np.zeros(prX.shape[0], dtype=np.int8) + i
        X.append(prX)
        Y.append(prY)
        print(prod, prX.shape)

    XY = pd.concat(X)
    Y = np.hstack(Y)
    XY["y"] = Y
    # Constant zero column. NOTE(review): its purpose is not visible in this
    # function; presumably a feature list expects it -- confirm.
    XY["url"] = np.zeros(len(XY), dtype=np.int8)
    # XY is now train data with actual purchases and its target labels

    # These only drop the local references; the caller's frame is unaffected.
    del train_df
    del all_df

    # Down-weight customers with several purchases in the same month:
    # weight = exp(1/count - 1), i.e. 1.0 for a single purchase and
    # exponentially less as the count grows.
    XY["ncodepers_fecha_dato"] = XY["ncodpers"].astype(str) + XY["fecha_dato"]
    uniqs, counts = np.unique(XY["ncodepers_fecha_dato"], return_counts=True)
    weights = np.exp(1.0 / counts - 1)
    print(np.unique(counts, return_counts=True))
    print(np.unique(weights, return_counts=True))
    wdf = pd.DataFrame()
    wdf["ncodepers_fecha_dato"] = uniqs
    wdf["counts"] = counts
    wdf["weight"] = weights
    print("before merge", len(XY))
    # merge unique counts and its weights to main data
    XY = XY.merge(wdf, on="ncodepers_fecha_dato")
    print("after merge", len(XY))

    print(XY.shape)

    mask = np.random.rand(len(XY)) < 0.8 # 80 percent as train, 20 percent as valid
    XY_train = XY[mask]
    XY_validate = XY[~mask]

    with Timer("prepare test data"):
        test_df["y"] = test_df["ncodpers"]
        test_df["url"] = np.zeros(len(test_df), dtype=np.int8)
        test_df["weight"] = np.ones(len(test_df), dtype=np.int8) # weight of one to test data
        # Column selection + .values replaces DataFrame.as_matrix, which was
        # removed in pandas 1.0; the resulting arrays have the same shape.
        Y_prev = test_df[list(prod_features)].values # lag-1 products
        C = test_df[["ncodpers"]].values
        add_columns = []
        for prod in products:
            padd = prod + "_add"
            test_df[padd] = test_df[prod] - test_df[prod + "_prev1"]
            add_columns.append(padd)
            # NOTE(review): per the original author's remark, the product
            # columns of the real test month may carry no usable values when
            # cv is False -- confirm against the feature-engineering step.
        test_add_mat = test_df[add_columns].values
        assert test_add_mat.shape == (len(C), len(products))
        # Per test row, the indices of products that were added this month.
        # This is only meaningful for cv, not for the real test month.
        test_add_list = [np.flatnonzero(row > 0).tolist() for row in test_add_mat]

    if cv:
        max_map7 = mapk(test_add_list, test_add_list, 7, 0.0)
        # Rescale from "all rows" to "rows with at least one added product".
        # Raises ZeroDivisionError when no test row has an added product.
        map7coef = float(len(test_add_list)) / float(sum([int(bool(a)) for a in test_add_list]))
        print("Max MAP@7", str_date, max_map7, max_map7*map7coef)

    with Timer("XGBoost"):
        Y_test_xgb = engines.xgboost(XY_train, XY_validate, test_df, features, XY_all = XY,
            restore = (str_date == "2016-06-28")
        )
        # Subtracting Y_prev pushes down products already held last month.
        # The context manager guarantees the gzip stream is flushed and closed.
        with (io.BytesIO() if cv else gzip.open("output/8th.%s.xgb.csv.gz" % str_date, "wb")) as out:
            test_add_list_xgboost = make_submission(out, Y_test_xgb - Y_prev, C)
        if cv:
            map7xgboost = mapk(test_add_list, test_add_list_xgboost, 7, 0.0)
            print("XGBoost MAP@7", str_date, map7xgboost, map7xgboost*map7coef)


In [None]:
import pickle
# Load the engineered feature table and the feature-name metadata.
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
# The context managers make sure both file handles are closed after reading.
with open('data/8th.feature_engineer.all.pkl', 'rb') as fin:
    all_df = pickle.load(fin)
with open('data/8th.feature_engineer.cv_meta.pkl', 'rb') as fin:
    features, prod_features = pickle.load(fin)

# First a validation run on 2016-05-28 (prints MAP@7), then the real
# prediction for 2016-06-28 (writes the submission file).
train_predict(all_df, features, prod_features, "2016-05-28", cv=True)
train_predict(all_df, features, prod_features, "2016-06-28", cv=False)

소요 시간 : ~3시간
점수 : 
  - Public : 0.0305956
  - Private : 0.0309524
  - Rank : 13

In [None]:
def apk(actual, predicted, k=10, default=1.0):
    """Average precision at ``k`` for a single prediction list.

    Parameters
    ----------
    actual : sequence
        The relevant items.
    predicted : sequence
        Ranked predictions, best first; only the first ``k`` are scored.
    k : int
        Cut-off rank.
    default : float
        Value returned when ``actual`` is empty.

    Returns
    -------
    float
        Sum of precision-at-rank over the ranks holding a first-time hit,
        divided by ``min(len(actual), k)``; ``default`` if ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A repeated prediction only counts the first time it appears.
        if item in actual and item not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank

    if actual:
        return precision_sum / min(len(actual), k)
    return default

def mapk(actual, predicted, k=10, default=1.0):
    """Mean of ``apk`` over paired ``actual`` / ``predicted`` lists.

    The two sequences are walked in lockstep with ``zip``, so the shorter one
    determines how many pairs are scored. ``k`` and ``default`` are forwarded
    to ``apk`` unchanged.
    """
    per_row_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_row_scores.append(apk(relevant, ranked, k, default))
    return np.mean(per_row_scores)

In [None]:
# NOTE(review): this cell reads XY_train, XY_validate, test_df, XY, str_date,
# cv, Y_prev, C, test_add_list and map7coef. In the visible cells those names
# are only assigned as locals inside train_predict, so the cell fails on a
# fresh kernel unless it is moved into that function (or they are defined
# elsewhere) -- confirm the intended placement.
#
# Every output stream is opened in a `with` block so the gzip file is flushed
# and closed as soon as the submission has been written.

# LightGBM predictions.
Y_test_lgbm = engines.lightgbm(XY_train, XY_validate, test_df, features, XY_all = XY, restore = (str_date == "2016-06-28"))
with (io.BytesIO() if cv else gzip.open("tmp/%s.lightgbm.csv.gz" % str_date, "wb")) as out:
    test_add_list_lightgbm = make_submission(out, Y_test_lgbm - Y_prev, C)

if cv:
    map7lightgbm = mapk(test_add_list, test_add_list_lightgbm, 7, 0.0)
    print("LightGBMlib MAP@7", str_date, map7lightgbm, map7lightgbm * map7coef)

# XGBoost predictions.
Y_test_xgb = engines.xgboost(XY_train, XY_validate, test_df, features, XY_all = XY, restore = (str_date == "2016-06-28"))
with (io.BytesIO() if cv else gzip.open("tmp/%s.xgboost.csv.gz" % str_date, "wb")) as out:
    test_add_list_xgboost = make_submission(out, Y_test_xgb - Y_prev, C)

if cv:
    map7xgboost = mapk(test_add_list, test_add_list_xgboost, 7, 0.0)
    print("XGBoost MAP@7", str_date, map7xgboost, map7xgboost * map7coef)

# Ensemble: element-wise geometric mean of the two models' scores.
Y_test = np.sqrt(np.multiply(Y_test_xgb, Y_test_lgbm))
with (io.BytesIO() if cv else gzip.open("tmp/%s.xgboost-lightgbm.csv.gz" % str_date, "wb")) as out:
    test_add_list_xl = make_submission(out, Y_test - Y_prev, C)
if cv:
    map7xl = mapk(test_add_list, test_add_list_xl, 7, 0.0)
    print("XGBoost + LightGBM MAP@7", str_date, map7xl, map7xl * map7coef)

In [None]:
import os
import pickle

import pandas as pd
import numpy as np

import xgboost as xgb
#import lightgbm as lgbm

from utils import *


def _frame_to_dmatrix(frame, features, labelled=True):
    """Build an ``xgb.DMatrix`` from the ``features`` columns of ``frame``.

    When ``labelled`` is true the ``y`` and ``weight`` columns are attached as
    label and per-row weight. Column selection + ``.values`` is used instead
    of ``DataFrame.as_matrix``, which was removed in pandas 1.0.
    """
    X = frame[list(features)].values
    if not labelled:
        return xgb.DMatrix(X, feature_names=features)
    return xgb.DMatrix(
        X,
        label=frame[["y"]].values,
        feature_names=features,
        weight=frame[["weight"]].values,
    )


def xgboost(XY_train, XY_validate, test_df, features, XY_all=None, restore=False):
    """Fit (or restore) a multi-class XGBoost model and score ``test_df``.

    Parameters
    ----------
    XY_train, XY_validate : pandas.DataFrame
        Training / validation rows holding the ``features`` columns plus
        ``y`` (class label) and ``weight``.
    test_df : pandas.DataFrame
        Rows to score; only the ``features`` columns are read.
    features : sequence of str
        Feature column names, also passed to XGBoost as ``feature_names``.
    XY_all : pandas.DataFrame, optional
        When given, the model is refitted on this frame for a number of rounds
        scaled up from the early-stopped count.
    restore : bool
        When true, the early-stopped model is loaded from
        ``next_multi.pickle`` instead of being trained.

    Returns
    -------
    The result of ``model.predict`` on the test matrix (with the
    ``multi:softprob`` objective: one score per class for every row).
    """
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'min_child_weight': 10,
        'max_depth': 8,
        'silent': 1,
        # 'nthread': 16,
        'eval_metric': 'mlogloss',
        'colsample_bytree': 0.8,
        'colsample_bylevel': 0.9,
        'num_class': len(products),
    }

    if not restore:
        train = _frame_to_dmatrix(XY_train, features)
        validate = _frame_to_dmatrix(XY_validate, features)

        print(param)
        evallist  = [(train,'train'), (validate,'eval')]
        model = xgb.train(param, train, 1000, evals=evallist, early_stopping_rounds=20)
        # Persist the early-stopped model; `with` closes the file handle.
        with open("next_multi.pickle", "wb") as fout:
            pickle.dump(model, fout)

    else:
        # NOTE: pickle.load can execute arbitrary code; only restore a file
        # written by this function.
        with open("next_multi.pickle", "rb") as fin:
            model = pickle.load(fin)
    best_ntree_limit = model.best_ntree_limit

    if XY_all is not None:
        all_data = _frame_to_dmatrix(XY_all, features)

        evallist  = [(all_data,'all_data')]
        # More data -> proportionally more trees: scale the early-stopped
        # round count by (train + validate) / train.
        best_ntree_limit = int(best_ntree_limit * (len(XY_train) + len(XY_validate)) / len(XY_train))
        model = xgb.train(param, all_data, best_ntree_limit, evals=evallist)

    print("Feature importance:")
    for kv in sorted(model.get_fscore().items(), key=lambda kv: kv[1], reverse=True):
        print(kv)

    test = _frame_to_dmatrix(test_df, features, labelled=False)

    return model.predict(test, ntree_limit=best_ntree_limit)


def lightgbm(XY_train, XY_validate, test_df, features, XY_all=None, restore=False):
    """Fit (or restore) a multi-class LightGBM model and score ``test_df``.

    Parameters
    ----------
    XY_train, XY_validate : pandas.DataFrame
        Training / validation rows holding the ``features`` columns plus
        ``y`` (class label) and ``weight``.
    test_df : pandas.DataFrame
        Rows to score; only the ``features`` columns are read.
    features : sequence of str
        Feature column names.
    XY_all : pandas.DataFrame, optional
        When given, the model is refitted on this frame for a number of rounds
        scaled up from the early-stopped count, and saved to
        ``tmp/lgbm.all.model.txt``.
    restore : bool
        When true, the early-stopped model and its best iteration are loaded
        from ``tmp/lgbm.model.txt`` / ``tmp/lgbm.model.meta`` instead of
        being trained.

    Returns
    -------
    The result of ``model.predict`` on the test features.
    """
    # The module-level import of lightgbm is commented out, so import it here:
    # the module still loads without lightgbm and this function no longer
    # fails with a NameError on `lgbm`.
    import lightgbm as lgbm

    params = {
        'task' : 'train',
        'boosting_type' : 'gbdt',
        'objective' : 'multiclass',
        # Derived from the product list (same as the xgboost engine) instead
        # of a hard-coded 24.
        'num_class': len(products),
        'metric' : {'multi_logloss'},
        'is_training_metric': True,
        'max_bin': 255,
        'num_leaves' : 64,
        'learning_rate' : 0.1,
        'feature_fraction' : 0.8,
        'min_data_in_leaf': 10,
        'min_sum_hessian_in_leaf': 5,
        # 'num_threads': 16,
    }
    print(params)

    if not restore:
        # The datasets are only needed when actually training.
        train = lgbm.Dataset(XY_train[list(features)], label=XY_train["y"], weight=XY_train["weight"], feature_name=features)
        validate = lgbm.Dataset(XY_validate[list(features)], label=XY_validate["y"], weight=XY_validate["weight"], feature_name=features, reference=train)

        model = lgbm.train(params, train, num_boost_round=1000, valid_sets=validate, early_stopping_rounds=20)
        best_iteration = model.best_iteration
        model.save_model("tmp/lgbm.model.txt")
        with open("tmp/lgbm.model.meta", "wb") as fout:
            pickle.dump(best_iteration, fout)
    else:
        model = lgbm.Booster(model_file="tmp/lgbm.model.txt")
        # NOTE: pickle.load can execute arbitrary code; only restore a file
        # written by this function.
        with open("tmp/lgbm.model.meta", "rb") as fin:
            best_iteration = pickle.load(fin)

    if XY_all is not None:
        # More data -> proportionally more boosting rounds.
        best_iteration = int(best_iteration * len(XY_all) / len(XY_train))
        all_train = lgbm.Dataset(XY_all[list(features)], label=XY_all["y"], weight=XY_all["weight"], feature_name=features)
        model = lgbm.train(params, all_train, num_boost_round=best_iteration)
        model.save_model("tmp/lgbm.all.model.txt")

    print("Feature importance by split:")
    for kv in sorted(zip(features, model.feature_importance("split")), key=lambda kv: kv[1], reverse=True):
        print(kv)
    print("Feature importance by gain:")
    for kv in sorted(zip(features, model.feature_importance("gain")), key=lambda kv: kv[1], reverse=True):
        print(kv)

    return model.predict(test_df[list(features)], num_iteration=best_iteration)