In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from functools import reduce
warnings.filterwarnings("ignore")

In [3]:
# Load the two raw CSV tables from the local ./data directory.
# user_data is the table every feature function below reads; item_data is only
# written back out unchanged by the final cell of this notebook.
user_data = pd.read_csv("./data/tianchi_mobile_recommend_train_user.csv")
item_data = pd.read_csv("./data/tianchi_mobile_recommend_train_item.csv")

In [26]:
# Convert the "time" column to datetimes so later cells can compare it with a
# cut-off date and subtract timestamps.
# NOTE(review): the format string expects raw values shaped like "20141206 02"
# (no separators in the date part) - confirm the CSV column matches.
user_data["time"] = pd.to_datetime(user_data["time"], format="%Y%m%d %H")

In [27]:
def reduce_mem_usage(df):
    """
    Shrink the memory footprint of a DataFrame by downcasting its columns.

    The frame is modified in place and also returned.

    * ``object`` columns are converted to ``category``.
    * NumPy signed-integer columns are cast to the smallest of
      int8/int16/int32/int64 whose range contains the column's min and max.
    * NumPy float columns are cast to float16 or float32 when the min and max
      lie strictly inside that type's range, otherwise to float64.
      NOTE: only the range is checked, so float16/float32 can lose precision.
    * Every other dtype (datetime, bool, unsigned int, pandas extension
      dtypes, ...) is left untouched instead of being pushed through the
      float branch, which raised for datetimes.

    Args:
        df: the DataFrame to optimise.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the printed unit is correct.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # are not plain NumPy dtypes and are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values of a type are valid too.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(col_type, np.floating):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # No smaller float type fits (or the column is all-NaN / has inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame to avoid a 0/0 percentage.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [28]:
# count the number of actions of users on each item
def item_action_total_counts(data):
    """Count the action rows recorded for each item.

    NOTE: a later cell in this notebook defines another function with the
    same name, which replaces this one when the notebook is run top to bottom.

    Args:
        data: DataFrame with an ``item_id`` column. It is not modified.

    Returns:
        DataFrame indexed by ``item_id`` with a single column
        ``item_action_total_counts`` holding the number of rows per item.
    """
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return data.groupby("item_id").size().to_frame("item_action_total_counts")

In [29]:
# count the number of operations of each user
def user_operations_total_counts(data):
    """Count the action rows recorded for each user.

    Args:
        data: DataFrame with a ``user_id`` column. It is not modified.

    Returns:
        DataFrame indexed by ``user_id`` with a single column
        ``user_operations_total_counts`` holding the number of rows per user.
    """
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return data.groupby("user_id").size().to_frame("user_operations_total_counts")

In [30]:
# seconds between the first time a user checks an item (behavior_type 1) and the first time that user buys it (behavior_type 4)
def check_to_buy(data):
    """Seconds from a user's first check of an item to their first purchase of it.

    Only (user, item) pairs with at least one purchase (behavior_type 4) are
    considered, and pairs without any check (behavior_type 1) are dropped.

    Args:
        data: DataFrame with ``user_id``, ``item_id``, ``behavior_type`` and a
            datetime ``time`` column.

    Returns:
        DataFrame with columns ``user_id``, ``item_id`` and
        ``check_to_buy_time_difference`` (first buy minus first check, in
        seconds; negative when the first check happened after the first buy).
    """
    keys = ["user_id", "item_id"]
    is_buy = data["behavior_type"] == 4
    bought_pairs = data.loc[is_buy, keys].drop_duplicates()

    # Keep only the events of pairs that were bought at least once.
    events = bought_pairs.merge(data, how="left", on=keys)
    events = events[keys + ["behavior_type", "time"]]

    earliest_check = (
        events[events["behavior_type"] == 1]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "first_time_check"})
    )
    earliest_buy = (
        events[events["behavior_type"] == 4]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "first_time_buy"})
    )

    feature = earliest_buy.merge(earliest_check, how="left", on=keys)
    gap = feature["first_time_buy"] - feature["first_time_check"]
    feature["check_to_buy_time_difference"] = gap.dt.total_seconds()

    # behavior_type_x / behavior_type_y are the suffixed leftovers of the merge.
    helper_columns = ["behavior_type_x", "first_time_buy", "behavior_type_y", "first_time_check"]
    feature = feature.drop(columns=helper_columns)
    has_gap = feature["check_to_buy_time_difference"].notnull()
    return feature[has_gap]

In [31]:
# seconds between the first time a user adds an item to the cart (behavior_type 3) and the first time that user buys it (behavior_type 4)
def add_to_buy(data):
    """Seconds from a user's first add of an item to their first purchase of it.

    Only (user, item) pairs with at least one purchase (behavior_type 4) are
    considered, and pairs without any add (behavior_type 3) are dropped.

    Args:
        data: DataFrame with ``user_id``, ``item_id``, ``behavior_type`` and a
            datetime ``time`` column.

    Returns:
        DataFrame with columns ``user_id``, ``item_id`` and
        ``add_to_buy_time_difference`` (first buy minus first add, in seconds).
    """
    keys = ["user_id", "item_id"]
    is_buy = data["behavior_type"] == 4
    bought_pairs = data.loc[is_buy, keys].drop_duplicates()

    # Keep only the events of pairs that were bought at least once.
    events = bought_pairs.merge(data, how="left", on=keys)
    events = events[keys + ["behavior_type", "time"]]

    earliest_add = (
        events[events["behavior_type"] == 3]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "first_time_add"})
    )
    earliest_buy = (
        events[events["behavior_type"] == 4]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "first_time_buy"})
    )

    feature = earliest_buy.merge(earliest_add, how="left", on=keys)
    gap = feature["first_time_buy"] - feature["first_time_add"]
    feature["add_to_buy_time_difference"] = gap.dt.total_seconds()

    # behavior_type_x / behavior_type_y are the suffixed leftovers of the merge.
    helper_columns = ["behavior_type_x", "first_time_buy", "behavior_type_y", "first_time_add"]
    feature = feature.drop(columns=helper_columns)
    has_gap = feature["add_to_buy_time_difference"].notnull()
    return feature[has_gap]

In [32]:
# seconds between the first time a user saves an item (behavior_type 2) and the first time that user buys it (behavior_type 4)
def save_to_buy(data):
    """Seconds from a user's first save of an item to their first purchase of it.

    Only (user, item) pairs with at least one purchase (behavior_type 4) are
    considered, and pairs without any save (behavior_type 2) are dropped.

    Args:
        data: DataFrame with ``user_id``, ``item_id``, ``behavior_type`` and a
            datetime ``time`` column.

    Returns:
        DataFrame with columns ``user_id``, ``item_id`` and
        ``save_to_buy_time_difference`` (first buy minus first save, in seconds).
    """
    keys = ["user_id", "item_id"]
    is_buy = data["behavior_type"] == 4
    bought_pairs = data.loc[is_buy, keys].drop_duplicates()

    # Keep only the events of pairs that were bought at least once.
    events = bought_pairs.merge(data, how='left', on=keys)
    events = events[keys + ["behavior_type", "time"]]

    earliest_save = (
        events[events["behavior_type"] == 2]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "first_time_save"})
    )
    earliest_buy = (
        events[events["behavior_type"] == 4]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "first_time_buy"})
    )

    feature = earliest_buy.merge(earliest_save, how='left', on=keys)
    gap = feature["first_time_buy"] - feature["first_time_save"]
    feature["save_to_buy_time_difference"] = gap.dt.total_seconds()

    # behavior_type_x / behavior_type_y are the suffixed leftovers of the merge.
    helper_columns = ["behavior_type_x", "first_time_buy", "behavior_type_y", "first_time_save"]
    feature = feature.drop(columns=helper_columns)
    has_gap = feature["save_to_buy_time_difference"].notnull()
    return feature[has_gap]

In [33]:
def item_type_counts(data, choose_behavior_type, name):
    """Count, per item, the rows whose behavior_type equals a given value.

    NOTE: a later cell in this notebook defines another function with the
    same name, which replaces this one when the notebook is run top to bottom.

    Args:
        data: DataFrame with ``item_id`` and ``behavior_type`` columns. It is
            not modified.
        choose_behavior_type: the ``behavior_type`` value to count.
        name: name of the count column in the result.

    Returns:
        DataFrame indexed by ``item_id`` with a single column ``name``. Items
        that have no row of the chosen type are absent.
    """
    selected = data[data["behavior_type"] == choose_behavior_type]
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return selected.groupby("item_id").size().to_frame(name)

In [34]:
def user_type_counts(data, choose_behavior_type, name):
    """Count, per user, the rows whose behavior_type equals a given value.

    NOTE: a later cell in this notebook defines another function with the
    same name, which replaces this one when the notebook is run top to bottom.

    Args:
        data: DataFrame with ``user_id`` and ``behavior_type`` columns. It is
            not modified.
        choose_behavior_type: the ``behavior_type`` value to count.
        name: name of the count column in the result.

    Returns:
        DataFrame indexed by ``user_id`` with a single column ``name``. Users
        that have no row of the chosen type are absent.
    """
    selected = data[data["behavior_type"] == choose_behavior_type]
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return selected.groupby("user_id").size().to_frame(name)

In [35]:
def merge_user(table1, table2):
    """Join two tables on ``user_id`` using pandas' default (inner) join.

    NOTE: a later cell in this notebook defines another ``merge_user`` that
    uses a left join and replaces this one on a top-to-bottom run.
    """
    return table1.merge(table2, on="user_id")
def merge_item(table1, table2):
    """Join two tables on ``item_id`` using pandas' default (inner) join.

    NOTE: a later cell in this notebook defines another ``merge_item`` that
    uses a left join and replaces this one on a top-to-bottom run.
    """
    return table1.merge(table2, on="item_id")
def merge_time_difference(table1, table2):
    """Inner-join ``table2`` with ``table1`` on (``user_id``, ``item_id``).

    ``table2`` is the left operand, so its columns come first in the result.
    """
    pair_keys = ['user_id', 'item_id']
    return pd.merge(table2, table1, how='inner', on=pair_keys)

In [36]:
def get_feature(predict_date):
    """Build the per (user, item) feature table from actions before a date.

    Reads the module-level ``user_data`` frame and keeps only rows whose
    ``time`` is strictly earlier than ``predict_date``.

    NOTE: a later cell in this notebook defines another ``get_feature`` that
    replaces this one when the notebook is run top to bottom.

    Args:
        predict_date: cut-off compared against ``user_data["time"]``.

    Returns:
        DataFrame with one row per (``user_id``, ``item_id``) pair carrying
        the item counts, the user counts and the three *_to_buy time
        differences. The time-difference tables are joined with inner joins,
        so only pairs present in all three of them remain.
    """
    train_data = user_data[user_data["time"] < predict_date]

    item_features = [
        item_action_total_counts(train_data),
        item_type_counts(train_data, 1, "item_check_counts"),
        item_type_counts(train_data, 2, "item_save_counts"),
        item_type_counts(train_data, 3, "item_add_counts"),
        item_type_counts(train_data, 4, "item_buy_counts"),
    ]

    user_features = [
        user_operations_total_counts(train_data),
        user_type_counts(train_data, 1, "user_check_counts"),
        user_type_counts(train_data, 2, "user_save_counts"),
        user_type_counts(train_data, 3, "user_add_counts"),
        user_type_counts(train_data, 4, "user_buy_counts"),
    ]

    time_features = [
        check_to_buy(train_data),
        save_to_buy(train_data),
        add_to_buy(train_data),
    ]

    train_data = train_data[["user_id", "item_id"]].drop_duplicates()

    # `reduce` is the name imported at the top of the notebook
    # (`from functools import reduce`); the previous `ft.reduce` referenced an
    # alias that is never defined and raised NameError.
    train_data = reduce(lambda tb1, tb2: pd.merge(tb1, tb2, how='left', on="item_id"), [train_data] + item_features)
    train_data = reduce(lambda tb1, tb2: pd.merge(tb1, tb2, how='left', on="user_id"), [train_data] + user_features)
    train_data = reduce(lambda tb1, tb2: tb2.merge(tb1, how='inner', on=['user_id', 'item_id']), [train_data] + time_features)
    return train_data

In [37]:
# user action count
def user_action_total_counts(data):
    """Count the action rows recorded for each user.

    Args:
        data: DataFrame with a ``user_id`` column. It is not modified.

    Returns:
        DataFrame with columns ``user_id`` and ``user_action_total_counts``,
        one row per user.
    """
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return data.groupby("user_id").size().reset_index(name="user_action_total_counts")
# per-user count of actions of one behavior type
def user_type_counts(data, type, name):
    """Count, per user, the rows whose behavior_type equals ``type``.

    Args:
        data: DataFrame with ``user_id`` and ``behavior_type`` columns. It is
            not modified.
        type: the ``behavior_type`` value to count (shadows the builtin; the
            name is kept for caller compatibility).
        name: name of the count column in the result.

    Returns:
        DataFrame with columns ``user_id`` and ``name``. Users that have no
        row of the chosen type are absent.
    """
    matching = data[data["behavior_type"] == type]
    # size() avoids writing a scratch column into the caller's frame.
    return matching.groupby("user_id").size().reset_index(name=name)

# the number of item actions
def item_action_total_counts(data):
    """Count the action rows recorded for each item.

    Args:
        data: DataFrame with an ``item_id`` column. It is not modified.

    Returns:
        DataFrame with columns ``item_id`` and ``item_action_total_counts``,
        one row per item.
    """
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return data.groupby("item_id").size().reset_index(name="item_action_total_counts")
# number of distinct users that acted on each item
def item_total_user_counts(data):
    """Count the distinct users that have at least one row for each item.

    Args:
        data: DataFrame with ``user_id`` and ``item_id`` columns.

    Returns:
        DataFrame with columns ``item_id`` and ``item_total_user_counts``.
    """
    counter = "item_total_user_counts"
    # One row per distinct (user, item) pair, tagged with a constant so that
    # count() yields the number of pairs per item.
    unique_pairs = data[["user_id", "item_id"]].drop_duplicates().assign(**{counter: 1})
    per_item = unique_pairs[["item_id", counter]].groupby(["item_id"], as_index=False)
    return per_item.count()
# per-item count of actions of one behavior type
def item_type_counts(data, type, name):
    """Count, per item, the rows whose behavior_type equals ``type``.

    Args:
        data: DataFrame with ``item_id`` and ``behavior_type`` columns. It is
            not modified.
        type: the ``behavior_type`` value to count (shadows the builtin; the
            name is kept for caller compatibility).
        name: name of the count column in the result.

    Returns:
        DataFrame with columns ``item_id`` and ``name``. Items that have no
        row of the chosen type are absent.
    """
    matching = data[data["behavior_type"] == type]
    # size() avoids writing a scratch column into the caller's frame.
    return matching.groupby("item_id").size().reset_index(name=name)

# counts for each category
def category_action_total_counts(data):
    """Count the action rows recorded for each item category.

    Args:
        data: DataFrame with an ``item_category`` column. It is not modified.

    Returns:
        DataFrame with columns ``item_category`` and
        ``category_action_total_counts``, one row per category.
    """
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return data.groupby("item_category").size().reset_index(name="category_action_total_counts")
# number of distinct users that acted on each category
def category_total_user_counts(data):
    """Count the distinct users that have at least one row for each category.

    Args:
        data: DataFrame with ``user_id`` and ``item_category`` columns.

    Returns:
        DataFrame with columns ``item_category`` and
        ``category_total_user_counts``.
    """
    counter = "category_total_user_counts"
    # One row per distinct (user, category) pair, tagged with a constant so
    # that count() yields the number of pairs per category.
    unique_pairs = data[["user_id", "item_category"]].drop_duplicates().assign(**{counter: 1})
    per_category = unique_pairs[["item_category", counter]].groupby(["item_category"], as_index=False)
    return per_category.count()
# per-category count of actions of one behavior type
def category_type_counts(data, type, name):
    """Count, per item category, the rows whose behavior_type equals ``type``.

    Args:
        data: DataFrame with ``item_category`` and ``behavior_type`` columns.
            It is not modified.
        type: the ``behavior_type`` value to count (shadows the builtin; the
            name is kept for caller compatibility).
        name: name of the count column in the result.

    Returns:
        DataFrame with columns ``item_category`` and ``name``. Categories that
        have no row of the chosen type are absent.
    """
    matching = data[data["behavior_type"] == type]
    # size() avoids writing a scratch column into the caller's frame.
    return matching.groupby("item_category").size().reset_index(name=name)


def user_item_action_total_counts(data):
    """Count the action rows recorded for each (user, item) pair.

    Args:
        data: DataFrame with ``user_id`` and ``item_id`` columns. It is not
            modified.

    Returns:
        DataFrame with columns ``user_id``, ``item_id`` and
        ``user_item_action_total_counts``, one row per pair.
    """
    pair_keys = ["user_id", "item_id"]
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return data.groupby(pair_keys).size().reset_index(name="user_item_action_total_counts")

def user_item_type_counts(data, type, name):
    """Count, per (user, item) pair, the rows whose behavior_type equals ``type``.

    Args:
        data: DataFrame with ``user_id``, ``item_id`` and ``behavior_type``
            columns. It is not modified.
        type: the ``behavior_type`` value to count (shadows the builtin; the
            name is kept for caller compatibility).
        name: name of the count column in the result.

    Returns:
        DataFrame with columns ``user_id``, ``item_id`` and ``name``. Pairs
        that have no row of the chosen type are absent.
    """
    pair_keys = ["user_id", "item_id"]
    matching = data[data["behavior_type"] == type]
    # size() avoids writing a scratch column into the caller's frame.
    return matching.groupby(pair_keys).size().reset_index(name=name)

def user_item_last_type_time(data, type, name):
    """Latest time each (user, item) pair has a row of a given behavior type.

    Args:
        data: DataFrame with ``user_id``, ``item_id``, ``behavior_type`` and
            ``time`` columns.
        type: the ``behavior_type`` value to keep.
        name: name given to the resulting time column.

    Returns:
        DataFrame with columns ``user_id``, ``item_id`` and ``name``.
    """
    pair_keys = ["user_id", "item_id"]
    of_type = data[data["behavior_type"] == type]
    latest = of_type[pair_keys + ["time"]].groupby(pair_keys, as_index=False).max()
    return latest.rename(columns={"time": name})


def user_item_look_to_buy(data):
    """Look-before-buy statistics for every purchased (user, item) pair.

    For each pair with at least one purchase (behavior_type 4) and a first
    look (behavior_type 1) that is not later than the first purchase, compute:

    * ``earliest_user_item_timedelta_look_to_buy``: hours between the first
      look and the first purchase. The divisor is 3600 (seconds per hour),
      the same unit used by ``user_category_look_to_buy`` and the
      ``*_to_now`` features; the previous 360000 was inconsistent with them.
    * ``item_look_counts_before_buy``: number of look rows whose time is not
      later than the first purchase.

    Args:
        data: DataFrame with ``user_id``, ``item_id``, ``behavior_type`` and a
            datetime ``time`` column. It is not modified.

    Returns:
        DataFrame with columns ``user_id``, ``item_id``,
        ``item_look_counts_before_buy`` and
        ``earliest_user_item_timedelta_look_to_buy``.
    """
    keys = ["user_id", "item_id"]
    delta_col = "earliest_user_item_timedelta_look_to_buy"
    count_col = "item_look_counts_before_buy"

    # Restrict the log to pairs that were bought at least once.
    buy_user_item = data.loc[data["behavior_type"] == 4, keys].drop_duplicates()
    events = pd.merge(buy_user_item, data, how="left", on=keys)[keys + ["behavior_type", "time"]]

    earliest_look = (
        events.loc[events["behavior_type"] == 1, keys + ["time"]]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "earliest_look_time"})
    )
    earliest_buy = (
        events.loc[events["behavior_type"] == 4, keys + ["time"]]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "earliest_buy_time"})
    )

    feature = pd.merge(earliest_buy, earliest_look, how="left", on=keys)
    elapsed = feature["earliest_buy_time"] - feature["earliest_look_time"]
    feature[delta_col] = elapsed.dt.total_seconds() / 3600
    # Drops pairs never looked at (NaN) and pairs first looked at after buying.
    feature = feature[feature[delta_col] >= 0]
    feature = feature[keys + ["earliest_look_time", "earliest_buy_time", delta_col]]

    looks = pd.merge(feature, events, how="left", on=keys)
    looks = looks[(looks["behavior_type"] == 1) & (looks["time"] <= looks["earliest_buy_time"])]
    look_counts = looks.groupby(keys).size().reset_index(name=count_col)

    feature = pd.merge(feature, look_counts, how="left", on=keys)
    return feature[keys + [count_col, delta_col]]


def user_category_action_total_counts(data):
    """Count the action rows recorded for each (user, category) pair.

    Args:
        data: DataFrame with ``user_id`` and ``item_category`` columns. It is
            not modified.

    Returns:
        DataFrame with columns ``user_id``, ``item_category`` and
        ``user_category_action_total_counts``, one row per pair.
    """
    pair_keys = ["user_id", "item_category"]
    # size() counts rows per group directly, so no scratch column needs to be
    # written into the caller's frame.
    return data.groupby(pair_keys).size().reset_index(name="user_category_action_total_counts")

def user_category_type_counts(data, type, name):
    """Count, per (user, category) pair, the rows whose behavior_type equals ``type``.

    Args:
        data: DataFrame with ``user_id``, ``item_category`` and
            ``behavior_type`` columns. It is not modified.
        type: the ``behavior_type`` value to count (shadows the builtin; the
            name is kept for caller compatibility).
        name: name of the count column in the result.

    Returns:
        DataFrame with columns ``user_id``, ``item_category`` and ``name``.
        Pairs that have no row of the chosen type are absent.
    """
    pair_keys = ["user_id", "item_category"]
    matching = data[data["behavior_type"] == type]
    # size() avoids writing a scratch column into the caller's frame.
    return matching.groupby(pair_keys).size().reset_index(name=name)
# the last time
def user_category_last_type_time(data, type, name):
    """Latest time each (user, category) pair has a row of a given behavior type.

    Args:
        data: DataFrame with ``user_id``, ``item_category``,
            ``behavior_type`` and ``time`` columns.
        type: the ``behavior_type`` value to keep.
        name: name given to the resulting time column.

    Returns:
        DataFrame with columns ``user_id``, ``item_category`` and ``name``.
    """
    pair_keys = ["user_id", "item_category"]
    of_type = data[data["behavior_type"] == type]
    latest = of_type[pair_keys + ["time"]].groupby(pair_keys, as_index=False).max()
    return latest.rename(columns={"time": name})

def user_category_look_to_buy(data):
    """Look-before-buy statistics for every purchased (user, category) pair.

    For each pair with at least one purchase (behavior_type 4) and a first
    look (behavior_type 1) that is not later than the first purchase, compute:

    * ``earliest_user_category_timedelta_look_to_buy``: hours between the
      first look and the first purchase.
    * ``category_look_counts_before_buy``: number of look rows whose time is
      not later than the first purchase.

    The earliest add-to-cart time that used to be computed here was never
    used and has been removed.

    Args:
        data: DataFrame with ``user_id``, ``item_category``,
            ``behavior_type`` and a datetime ``time`` column. It is not
            modified.

    Returns:
        DataFrame with columns ``user_id``, ``item_category``,
        ``category_look_counts_before_buy`` and
        ``earliest_user_category_timedelta_look_to_buy``.
    """
    keys = ["user_id", "item_category"]
    delta_col = "earliest_user_category_timedelta_look_to_buy"
    count_col = "category_look_counts_before_buy"

    # Restrict the log to pairs with at least one purchase.
    buy_user_category = data.loc[data["behavior_type"] == 4, keys].drop_duplicates()
    events = pd.merge(buy_user_category, data, how="left", on=keys)[keys + ["behavior_type", "time"]]

    earliest_look = (
        events.loc[events["behavior_type"] == 1, keys + ["time"]]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "earliest_look_time"})
    )
    earliest_buy = (
        events.loc[events["behavior_type"] == 4, keys + ["time"]]
        .groupby(keys, as_index=False)
        .min()
        .rename(columns={"time": "earliest_buy_time"})
    )

    feature = pd.merge(earliest_buy, earliest_look, how="left", on=keys)
    elapsed = feature["earliest_buy_time"] - feature["earliest_look_time"]
    feature[delta_col] = elapsed.dt.total_seconds() / 3600
    # Drops pairs never looked at (NaN) and pairs first looked at after buying.
    feature = feature[feature[delta_col] >= 0]
    feature = feature[keys + ["earliest_look_time", "earliest_buy_time", delta_col]]

    looks = pd.merge(feature, events, how="left", on=keys)
    looks = looks[(looks["behavior_type"] == 1) & (looks["time"] <= looks["earliest_buy_time"])]
    # size() instead of tagging the filtered slice with a scratch column.
    look_counts = looks.groupby(keys).size().reset_index(name=count_col)

    feature = pd.merge(feature, look_counts, how="left", on=keys)
    return feature[keys + [count_col, delta_col]]

In [38]:
def merge_user(data1, data2):
    """Left-join ``data2`` onto ``data1`` using the ``user_id`` column."""
    return data1.merge(data2, how="left", on="user_id")
def merge_item(data1, data2):
    """Left-join ``data2`` onto ``data1`` using the ``item_id`` column."""
    return data1.merge(data2, how="left", on="item_id")
def merge_category(data1, data2):
    """Left-join ``data2`` onto ``data1`` using the ``item_category`` column."""
    return data1.merge(data2, how="left", on="item_category")
def merge_user_item(data1, data2):
    """Left-join ``data2`` onto ``data1`` using (``user_id``, ``item_id``)."""
    pair_keys = ["user_id", "item_id"]
    return data1.merge(data2, how="left", on=pair_keys)
def merge_user_category(data1, data2):
    """Left-join ``data2`` onto ``data1`` using (``user_id``, ``item_category``)."""
    pair_keys = ["user_id", "item_category"]
    return data1.merge(data2, how="left", on=pair_keys)

In [39]:
# feature function
def get_feature(predict_date):
    """Build the (user, item, category) feature table from actions before a date.

    Reads the module-level ``user_data`` frame and keeps only rows whose
    ``time`` is strictly earlier than ``predict_date``.

    Args:
        predict_date: cut-off compared against ``user_data["time"]`` and also
            parsed with ``pd.to_datetime`` to compute the "hours to now"
            features.

    Returns:
        DataFrame with one row per distinct (``user_id``, ``item_id``,
        ``item_category``) combination, carrying look/buy counts,
        look-before-buy statistics and, for each last-action time, the number
        of hours between that time and ``predict_date``. The four count
        columns have missing values filled with 0; other features may be NaN.
    """
    history = user_data[user_data["time"] < predict_date]

    # Features keyed on (user_id, item_id).
    user_item_features = [
        user_item_type_counts(history, 1, "user_item_look_counts"),
        user_item_type_counts(history, 4, "user_item_buy_counts"),
        user_item_last_type_time(history, 1, "user_item_last_look_time"),
        user_item_last_type_time(history, 2, "user_item_last_like_time"),
        user_item_last_type_time(history, 3, "user_item_last_add_time"),
        user_item_last_type_time(history, 4, "user_item_last_buy_time"),
        user_item_look_to_buy(history),
    ]

    # Features keyed on (user_id, item_category).
    user_category_features = [
        user_category_type_counts(history, 1, "user_category_look_counts"),
        user_category_type_counts(history, 4, "user_category_buy_counts"),
        user_category_last_type_time(history, 1, "user_category_last_look_time"),
        user_category_last_type_time(history, 2, "user_category_last_like_time"),
        user_category_last_type_time(history, 3, "user_category_last_add_time"),
        user_category_last_type_time(history, 4, "user_category_last_buy_time"),
        user_category_look_to_buy(history),
    ]

    table = history[["user_id", "item_id", "item_category"]].drop_duplicates()
    table = reduce(merge_user_item, [table] + user_item_features)
    table = reduce(merge_user_category, [table] + user_category_features)

    # (last-action time column, derived hours-until-cut-off column)
    recency_columns = [
        ("user_item_last_look_time", "user_item_last_look_to_now"),
        ("user_item_last_like_time", "user_item_last_like_to_now"),
        ("user_item_last_add_time", "user_item_last_add_to_now"),
        ("user_item_last_buy_time", "user_item_last_buy_to_now"),
        ("user_category_last_look_time", "user_category_last_look_to_now"),
        ("user_category_last_like_time", "user_category_last_like_to_now"),
        ("user_category_last_add_time", "user_category_last_add_to_now"),
        ("user_category_last_buy_time", "user_category_last_buy_to_now"),
    ]
    for time_column, hours_column in recency_columns:
        table[hours_column] = (pd.to_datetime(predict_date) - table[time_column]).dt.total_seconds()/3600

    # The raw timestamps are only needed to derive the recency columns.
    table = table.drop([time_column for time_column, _ in recency_columns], axis=1)

    count_columns = [
        "user_item_look_counts",
        "user_item_buy_counts",
        "user_category_look_counts",
        "user_category_buy_counts",
    ]
    table[count_columns] = table[count_columns].fillna(0)
    return table

In [41]:
# Build three feature tables, each from the rows of user_data whose time is
# strictly before the given cut-off date (see get_feature).
data_train = get_feature("2014-12-17")
data_eval = get_feature("2014-12-18")
data_test = get_feature("2014-12-19")
# Persist the feature tables as CSV files, without the DataFrame index.
data_train.to_csv("./data/data_train.csv", index=False)
data_eval.to_csv("./data/data_eval.csv", index=False)
data_test.to_csv("./data/data_test.csv", index=False)
# Also write out the two source tables; user_data at this point carries the
# parsed "time" column produced earlier in the notebook.
user_data.to_csv("./data/user_data.csv", index=False)
item_data.to_csv("./data/item_data.csv", index=False)