In [6]:
# Download the "skillsmuggler/amazon-ratings" dataset with the kaggle CLI and
# unzip it into ../data/.
!kaggle datasets download -d skillsmuggler/amazon-ratings --unzip -p ../data/

Downloading amazon-ratings.zip to ../data
 90%|██████████████████████████████████▎   | 26.0M/28.8M [00:01<00:00, 23.9MB/s]
100%|██████████████████████████████████████| 28.8M/28.8M [00:01<00:00, 25.9MB/s]


In [8]:
import pandas as pd
import numpy as np

# Load the beauty-product ratings CSV.
# Data source: https://www.kaggle.com/skillsmuggler/amazon-ratings
ratings = pd.read_csv('../data/ratings_Beauty.csv')

In [17]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
2,A1Z513UWSAAO0F,0558925278,5.0,2014-07-07
5,AKJHHD5VEH7VG,0762451459,5.0,2014-07-05
8,A3V3RE4132GKRO,130414089X,5.0,2014-06-04
26,A1RXI3A1E99112,1304351475,5.0,2014-07-14
34,A3SWQ2QQ7JBPFA,1304351475,5.0,2014-06-15


In [9]:
import datetime
# Convert the epoch-second values to datetimes.
# pd.to_datetime is vectorised and interprets the values as UTC, whereas
# datetime.fromtimestamp shifts them by the local timezone of whichever
# machine runs the notebook, which makes the date-based filters below
# environment-dependent.
ratings['Timestamp'] = pd.to_datetime(ratings['Timestamp'], unit='s')
print(f"from:{ratings['Timestamp'].min()}, max:{ratings['Timestamp'].max()}")

from:1998-10-19 00:00:00, max:2014-07-23 00:00:00


In [10]:
# Keep only ratings from May 2014 onwards.
# A single cutoff comparison is used because testing the year and the month
# independently would also discard January-April of every later year.
ratings = ratings[ratings['Timestamp'] >= pd.Timestamp('2014-05-01')]

In [11]:
# Boundaries used to split the data by period, both into train/test sets and
# into explanatory (X) / target (y) windows.
start = ratings['Timestamp'].min()
end = ratings['Timestamp'].max()
interval = end - start
first_cut = start + interval/3
second_cut = end - interval/3
# Train covers the first two thirds of the period, test the last two thirds.
# The upper bound of train is exclusive so that a row lying exactly on the
# boundary cannot be both a training target and a test target.
train = ratings[ratings['Timestamp'] < second_cut]
test = ratings[ratings['Timestamp'] >= first_cut]
# Within each split, the earlier window provides the explanatory variables
# and the later window provides the target variable.
train_X = train[train['Timestamp'] < first_cut]
train_y = train[train['Timestamp'] >= first_cut]
test_X = test[test['Timestamp'] < second_cut]
test_y = test[test['Timestamp'] >= second_cut]
# Users that appear in both the explanatory window and the target window.
train_tgt_user = set(train_X['UserId']) & set(train_y['UserId'])
test_tgt_user = set(test_X['UserId']) & set(test_y['UserId'])

In [12]:
# Rating values and weekday labels, in the column order of the feature tables.
_RATING_LEVELS = (5, 4, 3, 2, 1)
_WEEKDAY_LABELS = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')


def _count_features_by(df, key, count_col, suffix):
    """
    Count rows, rating values and weekdays per value of ``key``.

    Parameters
    ----------
    df : DataFrame with ``key``, 'Rating' and 'Timestamp' (datetime) columns.
    key : name of the column to group by.
    count_col : name of the output column holding the number of rows per group.
    suffix : suffix appended to the rating / weekday count column names.

    Returns
    -------
    DataFrame with one row per ``key`` value (sorted by key) and the columns
    ``key``, ``count_col``, ``rating_{5..1}_{suffix}``, ``act_{mon..sun}_{suffix}``.
    """
    columns = (
        [key, count_col]
        + [f'rating_{level}_{suffix}' for level in _RATING_LEVELS]
        + [f'act_{label}_{suffix}' for label in _WEEKDAY_LABELS]
    )
    # Nothing to aggregate: return an empty table with the expected columns
    # without touching the .dt accessor (which needs a datetime column).
    if len(df) == 0:
        return pd.DataFrame([], columns=columns)

    rating = df['Rating']
    # Monday is 0 ... Sunday is 6; computed once for the whole frame.
    dayofweek = df['Timestamp'].dt.dayofweek

    # One 0/1 indicator column per feature; summing them per group yields the
    # counts without a Python-level loop over the groups.
    indicators = {key: df[key].values, count_col: 1}
    for level in _RATING_LEVELS:
        indicators[f'rating_{level}_{suffix}'] = (rating == level).values.astype('int64')
    for number, label in enumerate(_WEEKDAY_LABELS):
        indicators[f'act_{label}_{suffix}'] = (dayofweek == number).values.astype('int64')

    return (
        pd.DataFrame(indicators, columns=columns)
        .groupby(key, as_index=False)
        .sum()
    )


def get_feature_by_user(df):
    """
    Build per-UserId features.

    For every user, counts the number of rated rows ('p_cnt_u'), the number
    of ratings with each value 5..1 ('rating_5_u' .. 'rating_1_u') and the
    number of ratings made on each weekday ('act_mon_u' .. 'act_sun_u').

    Parameters
    ----------
    df : DataFrame with 'UserId', 'Rating' and 'Timestamp' (datetime) columns.

    Returns
    -------
    DataFrame with one row per user and the columns listed above, preceded
    by 'UserId'.
    """
    return _count_features_by(df, key='UserId', count_col='p_cnt_u', suffix='u')


def get_feature_by_product(df):
    """
    Build per-ProductId features.

    For every product, counts the number of rating rows ('u_cnt_p'), the
    number of ratings with each value 5..1 ('rating_5_p' .. 'rating_1_p') and
    the number of ratings made on each weekday ('act_mon_p' .. 'act_sun_p').

    Parameters
    ----------
    df : DataFrame with 'ProductId', 'Rating' and 'Timestamp' (datetime) columns.

    Returns
    -------
    DataFrame with one row per product and the columns listed above, preceded
    by 'ProductId'.
    """
    return _count_features_by(df, key='ProductId', count_col='u_cnt_p', suffix='p')
# Build the per-user features for the train and test explanatory windows.
train_X_u, test_X_u = (get_feature_by_user(part) for part in (train_X, test_X))
# Build the per-item (product) features for the same windows.
train_X_p, test_X_p = (get_feature_by_product(part) for part in (train_X, test_X))

In [13]:
def get_model_input(X_u, X_p, y, tgt_user):
    """
    Build the data that is fed to LGBMRanker.

    Parameters
    ----------
    X_u : DataFrame with a 'UserId' column plus per-user feature columns.
    X_p : DataFrame with a 'ProductId' column plus per-product feature columns.
    y : DataFrame with 'UserId', 'ProductId', 'Rating' and 'Timestamp' columns
        (the target-window ratings).
    tgt_user : collection of UserId values to keep.

    Returns
    -------
    df_x : feature DataFrame indexed by (UserId, ProductId), sorted by index.
    df_y : 'Rating' Series with the same index and order as ``df_x``.
    query_list : Series with the number of rows per UserId, sorted by UserId,
        i.e. in the same user order as ``df_x`` / ``df_y``.
    """
    # Attach the user features to every target row; rows whose user has no
    # features are dropped by the inner join.
    merged = pd.merge(X_u, y, on=['UserId'], how='inner')
    # Attach the product features; products without features get NaN here.
    merged = pd.merge(X_p, merged, on=['ProductId'], how='outer')
    # Keep only the target users (this also drops product-only rows, whose
    # UserId is NaN) and fill missing values with 0. fillna is chained on the
    # filtered result rather than applied with inplace=True, which would
    # raise SettingWithCopyWarning on the filtered frame.
    merged = merged[merged['UserId'].isin(tgt_user)].fillna(0)
    features_cols = list(
        merged.drop(columns=['UserId', 'ProductId', 'Rating', 'Timestamp']).columns
    )
    # Query (group) sizes: number of candidate rows per user, ordered by
    # UserId so that they line up with the sorted rows below.
    query_list = merged['UserId'].value_counts().sort_index()
    # Index by (UserId, ProductId) and sort so that each user's rows are contiguous.
    merged = merged.set_index(['UserId', 'ProductId']).sort_index()
    # Features
    df_x = merged[features_cols]
    # Target variable
    df_y = merged['Rating']

    return df_x, df_y, query_list

# Assemble the ranker inputs (features, targets, per-user group sizes) for
# the train and test splits.
X_train, y_train, query_list_train = get_model_input(
    X_u=train_X_u, X_p=train_X_p, y=train_y, tgt_user=train_tgt_user
)
X_test, y_test, query_list_test = get_model_input(
    X_u=test_X_u, X_p=test_X_p, y=test_y, tgt_user=test_tgt_user
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [26]:
# Preview the training features (indexed by UserId, ProductId).
X_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,u_cnt_p,rating_5_p,rating_4_p,rating_3_p,rating_2_p,rating_1_p,act_mon_p,act_tue_p,act_wed_p,act_thu_p,...,rating_3_u,rating_2_u,rating_1_u,act_mon_u,act_tue_u,act_wed_u,act_thu_u,act_fri_u,act_sat_u,act_sun_u
UserId,ProductId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A00473363TJ8YSZ3YAGG9,B000052YQU,2.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,3.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0
A0301290UITIQ0E8OXP6,B0083QNBCM,30.0,15.0,4.0,6.0,2.0,3.0,4.0,9.0,3.0,6.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A0301290UITIQ0E8OXP6,B0091NM2W6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A08200421CXFIGYA5JKE9,B00HNSSHQ6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
A09946492JIP71Y3ES0Q0,B0072FSML0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Preview the training targets (the 'Rating' of each UserId/ProductId pair).
y_train.head()

UserId                 ProductId 
A00473363TJ8YSZ3YAGG9  B000052YQU    2.0
A0301290UITIQ0E8OXP6   B0083QNBCM    5.0
                       B0091NM2W6    4.0
A08200421CXFIGYA5JKE9  B00HNSSHQ6    5.0
A09946492JIP71Y3ES0Q0  B0072FSML0    5.0
Name: Rating, dtype: float64

In [33]:
# First five group sizes (number of rows per UserId) that are passed to the
# ranker as `group`.
query_list_train[:5]

A00473363TJ8YSZ3YAGG9    1
A0301290UITIQ0E8OXP6     2
A08200421CXFIGYA5JKE9    1
A09946492JIP71Y3ES0Q0    1
A100WO06OQR8BQ           1
Name: UserId, dtype: int64

In [32]:
# Preview the first ten rows of the test features.
X_test.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,u_cnt_p,rating_5_p,rating_4_p,rating_3_p,rating_2_p,rating_1_p,act_mon_p,act_tue_p,act_wed_p,act_thu_p,...,rating_3_u,rating_2_u,rating_1_u,act_mon_u,act_tue_u,act_wed_u,act_thu_u,act_fri_u,act_sat_u,act_sun_u
UserId,ProductId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A031533437UL5KXSH8FNB,B00152C4TM,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
A031533437UL5KXSH8FNB,B005W5OAIG,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
A08823773EI284FSZWYMZ,B0085YA90O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A101577718CATXNFEYBQR,B00016XJ4M,30.0,23.0,6.0,1.0,0.0,0.0,6.0,3.0,6.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
A101BX7DD7EUZ1,B0069SC0OQ,9.0,5.0,2.0,1.0,0.0,1.0,3.0,0.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
A102JNFLL0KW7I,B00346H8T8,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0
A102JNFLL0KW7I,B003TJGNH8,3.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0
A102JNFLL0KW7I,B009P3DJZG,2.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0
A102P09AT3DX2E,B008PDK3XS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A103584DULBU65,B00KHH2VOY,91.0,82.0,8.0,1.0,0.0,0.0,12.0,23.0,19.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [31]:
# First five test group sizes, as the plain list form used for `eval_group`.
list(query_list_test)[:5]

[2, 1, 1, 1, 3]

In [14]:
import lightgbm as lgb

# Train the LightGBM ranker. The test split is supplied as the evaluation set
# together with its group sizes; the NDCG lines in the cell output come from it.
model = lgb.LGBMRanker(n_estimators=1000, random_state=0)
fit_options = dict(
    group=query_list_train,
    eval_set=[(X_test, y_test)],
    eval_group=[list(query_list_test)],
)
model.fit(X_train, y_train, **fit_options)

[1]	valid_0's ndcg@1: 0.94044	valid_0's ndcg@2: 0.96171	valid_0's ndcg@3: 0.969252	valid_0's ndcg@4: 0.973297	valid_0's ndcg@5: 0.975223
[2]	valid_0's ndcg@1: 0.940069	valid_0's ndcg@2: 0.961433	valid_0's ndcg@3: 0.970177	valid_0's ndcg@4: 0.973469	valid_0's ndcg@5: 0.975372
[3]	valid_0's ndcg@1: 0.941032	valid_0's ndcg@2: 0.961037	valid_0's ndcg@3: 0.970379	valid_0's ndcg@4: 0.973619	valid_0's ndcg@5: 0.975427
[4]	valid_0's ndcg@1: 0.941286	valid_0's ndcg@2: 0.962441	valid_0's ndcg@3: 0.97084	valid_0's ndcg@4: 0.973858	valid_0's ndcg@5: 0.975527
[5]	valid_0's ndcg@1: 0.938785	valid_0's ndcg@2: 0.961857	valid_0's ndcg@3: 0.969888	valid_0's ndcg@4: 0.973482	valid_0's ndcg@5: 0.975071
[6]	valid_0's ndcg@1: 0.940314	valid_0's ndcg@2: 0.961771	valid_0's ndcg@3: 0.969825	valid_0's ndcg@4: 0.973623	valid_0's ndcg@5: 0.975225
[7]	valid_0's ndcg@1: 0.940302	valid_0's ndcg@2: 0.962128	valid_0's ndcg@3: 0.969948	valid_0's ndcg@4: 0.973691	valid_0's ndcg@5: 0.975265
[8]	valid_0's ndcg@1: 0.938913

LGBMRanker(n_estimators=1000, random_state=0)

In [15]:
def predict_at_k(data, model, k):
    """
    Predict the k items with the highest relevance score for every user.

    Parameters
    ----------
    data : feature DataFrame whose index has 'UserId' and 'ProductId' levels.
    model : fitted object whose ``predict(data)`` returns one score per row.
    k : maximum number of items returned per user.

    Returns
    -------
    DataFrame with the columns 'UserId', 'ProductId' and 'Rating'. Note that
    'Rating' holds the 1-based predicted rank of the product for that user
    (1 = highest score), not a rating value. Users are listed in sorted
    order; a user with fewer than ``k`` rows contributes all of its rows.
    """
    user_ids = list()
    product_ids = list()
    ranks = list()

    # Skip scoring entirely for empty input, so that model.predict is never
    # called with zero rows.
    if len(data) > 0:
        # Score every row with a single predict call instead of one call per
        # user, then slice the scores per user.
        scores = pd.Series(np.asarray(model.predict(data)), index=data.index)

        for user_id, user_scores in scores.groupby(level='UserId'):
            candidates = user_scores.index.get_level_values('ProductId').values
            # Positions of the k largest scores, best first.
            top_k = np.argsort(user_scores.values)[::-1][:k]
            product_ids.extend(list(candidates[top_k]))
            user_ids.extend([user_id] * len(top_k))
            ranks.extend(list(range(1, len(top_k) + 1)))

    results = pd.DataFrame({'UserId': user_ids, 'ProductId': product_ids, 'Rating': ranks})

    return results

# Top-5 predicted products for every user in the test features.
predicted = predict_at_k(data=X_test, model=model, k=5)

In [21]:
# Preview the first rows of the test features.
X_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,u_cnt_p,rating_5_p,rating_4_p,rating_3_p,rating_2_p,rating_1_p,act_mon_p,act_tue_p,act_wed_p,act_thu_p,...,rating_3_u,rating_2_u,rating_1_u,act_mon_u,act_tue_u,act_wed_u,act_thu_u,act_fri_u,act_sat_u,act_sun_u
UserId,ProductId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A031533437UL5KXSH8FNB,B00152C4TM,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
A031533437UL5KXSH8FNB,B005W5OAIG,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
A08823773EI284FSZWYMZ,B0085YA90O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
A101577718CATXNFEYBQR,B00016XJ4M,30.0,23.0,6.0,1.0,0.0,0.0,6.0,3.0,6.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
A101BX7DD7EUZ1,B0069SC0OQ,9.0,5.0,2.0,1.0,0.0,1.0,3.0,0.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [24]:
# Spot check: score the candidate rows of a single test user.
pred = model.predict(X_test.loc["A031533437UL5KXSH8FNB"])

In [25]:
# Show the raw scores returned for that user's rows.
pred

array([1.31138742, 1.77320599])

In [16]:
# Compare the predicted top products of one user with that user's actual
# ratings. Note: the 'Rating' column of `predicted` holds the predicted rank
# (1 = best), while `y_test` holds the real rating values.
sample = 'A2D7IHQGEBIDNG'
print(
    '[predicted]',
    predicted.query(f'UserId == "{sample}"')[['ProductId', 'Rating']],
    '+'*20,
    '[actual]',
    y_test[sample].sort_values(ascending=False),
    sep='\n',
)

[predicted]
       ProductId  Rating
2117  B008X0LUSA       1
2118  B006JYMHW0       2
2119  B007V8VFEE       3
2120  B00D5TB1LK       4
2121  B00016XJ4M       5
++++++++++++++++++++
[actual]
ProductId
B00016XJ4M    5.0
B000W3QDJ2    5.0
B005SJSQIG    5.0
B006JYMHW0    5.0
B008X0LUSA    5.0
B00D5TB1LK    5.0
B000X1YING    4.0
B0071H5C76    4.0
B007V8VFEE    3.0
Name: Rating, dtype: float64
