加载数据集

In [2]:
import pandas as pd
import numpy as np
import glob
from sklearn.preprocessing import LabelEncoder

In [3]:
train_goods = pd.concat([
    pd.read_csv('./训练集/traindata_goodsid/part-00000', header=None, names=['goods_id', 'cat_id', 'brandsn']),
    pd.read_csv('./训练集/traindata_goodsid/part-00001', header=None, names=['goods_id', 'cat_id', 'brandsn']),
    pd.read_csv('./训练集/traindata_goodsid/part-00002', header=None, names=['goods_id', 'cat_id', 'brandsn'])
], axis=0)

train_user = pd.concat([
    pd.read_csv(x, header=None, names=['user_id', 'goods_id', 'is_clk', 'is_like', 'is_addcart', 'is_order', 'expose_start_time', 'dt'], nrows=None)
    for x in glob.glob('./训练集/traindata_user/part*')
], axis=0)

In [4]:
testa_goods = pd.concat([
    pd.read_csv('./测试集a/predict_goods_id/part-00000', header=None, names=['goods_id', 'cat_id', 'brandsn']),
    pd.read_csv('./测试集a/predict_goods_id/part-00001', header=None, names=['goods_id', 'cat_id', 'brandsn']),
], axis=0)

testa_user = pd.read_excel('./测试集a/a榜需要预测的uid_5000.xlsx')

In [5]:
user_encode = LabelEncoder()
user_encode.fit(list(train_user['user_id']) + list(train_user['user_id']))

goods_encode = LabelEncoder()
goods_encode.fit(list(train_user['goods_id']) + list(train_goods['goods_id']) + list(train_goods['goods_id']))

In [6]:
np.mean(testa_user['user_id'].isin(train_user['user_id'])), np.mean(testa_goods['goods_id'].isin(train_goods['goods_id']))

(1.0, 0.9999751455447659)

数据分析

In [7]:
train_user['user_id'].nunique(), train_user['goods_id'].nunique()

(51602, 1531577)

In [8]:
train_user.describe().round(2)

Unnamed: 0,is_clk,is_like,is_addcart,is_order,dt
count,7791816.0,7791816.0,7791816.0,7791816.0,7791816.0
mean,1.06,0.02,0.06,0.01,20230222.85
std,0.28,0.13,0.27,0.09,26.21
min,1.0,0.0,0.0,0.0,20230203.0
25%,1.0,0.0,0.0,0.0,20230209.0
50%,1.0,0.0,0.0,0.0,20230216.0
75%,1.0,0.0,0.0,0.0,20230224.0
max,121.0,22.0,28.0,15.0,20230303.0


In [9]:
train_user['user_id'].value_counts()

94f8b77847a017324845b26592fdd964    15112
b83033e19eabcdd63ecc5d1cab156916     7913
ff9aa8c43066095cc67a5568ba5fb1c5     6453
485fbc41750e44e308f30683e4c8bbff     5582
c86e5bc3780bdb839b676d7505d15a49     5311
                                    ...  
f59919d2080053a4dfc516f200e4acce        1
c69ef88929430b67468a02abf73c8df3        1
bdf8350900cc5c08ce6753753213c00c        1
3127fb685010c92499d3a98455db440b        1
e305e481199cc9881f05ae7399c76463        1
Name: user_id, Length: 51602, dtype: int64

In [10]:
train_user.loc[(train_user['user_id'] == '71e1a59e90bc7174cf6349761217c627') & (train_user['goods_id'] == '47382b8a57e5b73bdba51de5c230fded')]

Unnamed: 0,user_id,goods_id,is_clk,is_like,is_addcart,is_order,expose_start_time,dt
26,71e1a59e90bc7174cf6349761217c627,47382b8a57e5b73bdba51de5c230fded,5,0,2,1,2023-02-19 06:58:28,20230219
20236,71e1a59e90bc7174cf6349761217c627,47382b8a57e5b73bdba51de5c230fded,1,0,0,0,2023-02-22 09:28:02,20230222


In [11]:
train_data = pd.merge(train_user.iloc[:], train_goods.iloc[:], on='goods_id')

In [12]:
train_data['cat_id'].nunique(), train_data['brandsn'].nunique()

(1616, 4711)

模型搭建

In [13]:
train_agg_feat = train_data.iloc[:].groupby(['user_id', 'goods_id']).agg({
    'is_clk': ['sum', 'max'],
    'is_like': ['sum', 'max'],
    'is_addcart': ['sum', 'max'],
    'is_order': ['sum', 'max'],
})

In [14]:
train_agg_feat = train_agg_feat.reset_index()
train_agg_feat.columns = [
    'user_id',
    'goods_id',
    'is_clk_sum',
    'is_clk_max',
    'is_like_sum',
    'is_like_max',
    'is_addcart_sum',
    'is_addcart_max',
    'is_order_sum',
    'is_order_max'
 ]

In [15]:
test_goods_id_agg = train_agg_feat.groupby('goods_id').agg({
    'is_clk_sum': 'sum',
    'is_order_sum': 'sum'
})
test_goods_id_agg = test_goods_id_agg[test_goods_id_agg['is_clk_sum'] > 100]
test_goods_id_agg = test_goods_id_agg[test_goods_id_agg['is_order_sum'] > 0]
test_valid_goods = test_goods_id_agg.index

In [16]:
train_feat_downsmaple = pd.concat([
    train_agg_feat[train_agg_feat['is_order_max'] !=0],
    train_agg_feat[train_agg_feat['is_order_max'] ==0].sample(int(0.03 * len(train_agg_feat)))
], axis=0)

In [17]:
train_feat_downsmaple.head()

Unnamed: 0,user_id,goods_id,is_clk_sum,is_clk_max,is_like_sum,is_like_max,is_addcart_sum,is_addcart_max,is_order_sum,is_order_max
390,000c81287b218ec3e2245a33464992fe,8b5a5db81fc5d86c0e4c9de7b8a96f72,1,1,0,0,1,1,1,1
394,000c81287b218ec3e2245a33464992fe,90224d8c0354a3b6383dcf400b45034f,1,1,0,0,1,1,1,1
670,0015421d3e9341bf4f1674f7dd4aa583,0632f9823ec79a93b4ddb9412fc9dcac,2,1,0,0,1,1,1,1
688,0015421d3e9341bf4f1674f7dd4aa583,1b41d6e717ee097750a14ae2b9b0b306,2,2,0,0,1,1,1,1
697,0015421d3e9341bf4f1674f7dd4aa583,21ddf62d1f2ec037a76f84599e16c1eb,3,3,1,1,0,0,1,1


In [18]:
1 - train_feat_downsmaple['is_order_max'].mean()
1 - train_feat_downsmaple['is_addcart_max'].mean()

0.7579848165819923

In [19]:
train_feat_downsmaple.shape

(258835, 10)

In [20]:
train_feat_downsmaple['user_id'] = user_encode.transform(train_feat_downsmaple['user_id'])
train_feat_downsmaple['goods_id'] = goods_encode.transform(train_feat_downsmaple['goods_id'])

In [21]:
import paddle
import paddle.nn as nn
from paddle.io import Dataset

class SelfDefinedDataset(Dataset):
    def __init__(self, df, mode = 'train'):
        super(SelfDefinedDataset, self).__init__()
        self.df = df
        self.mode = mode

    def __getitem__(self, idx):
        if self.mode == 'predict':
            return (
                self.df['user_id'].iloc[idx],
                self.df['goods_id'].iloc[idx],
                self.df['is_clk_max'].iloc[idx],
                self.df['is_like_max'].iloc[idx],
                self.df['is_addcart_max'].iloc[idx],
            )
        else:
            return (
                self.df['user_id'].iloc[idx],
                self.df['goods_id'].iloc[idx],
                self.df['is_clk_max'].iloc[idx],
                self.df['is_like_max'].iloc[idx],
                self.df['is_addcart_max'].iloc[idx],
                self.df['is_order_max'].iloc[idx]
            )

    def __len__(self):
        return len(self.df)

In [81]:
EMBEDDING_SIZE = 256

# 定义深度学习模型
class RecommenderNet(nn.Layer):
    def __init__(self, num_users, num_goods, embedding_size, numeric_size):
        super(RecommenderNet, self).__init__()
        self.num_users = num_users
        self.num_goods = num_goods
        self.embedding_size = embedding_size
        self.dropout = nn.Dropout(p=0.52)  # 设置丢弃的概率为 0.5

        weight_attr_user = paddle.ParamAttr(
            regularizer = paddle.regularizer.L2Decay(1e-6),
            initializer = nn.initializer.KaimingNormal()
            )
        self.user_embedding = nn.Embedding(
            num_users,
            embedding_size,
            weight_attr=weight_attr_user
        )

        weight_attr_goods = paddle.ParamAttr(
            regularizer=paddle.regularizer.L2Decay(1e-6),
            initializer=nn.initializer.KaimingNormal()
        )
        self.goods_embedding = nn.Embedding(
            num_goods,
            embedding_size,
            weight_attr=weight_attr_goods
        )

        self.linear = nn.Linear(2 * embedding_size, 1)

    def forward(self, data):
        # user, goods, feat = data[0], data[1], [data[idx] for idx in [2, 3,4]]
        user, goods, feat = data[0], data[1], [data[idx] for idx in range(2, len(data))]
        feat = paddle.stack(feat, 1).astype(paddle.float32)

        user_vector = self.user_embedding(user)
        goods_vector = self.goods_embedding(goods)

        x = paddle.concat([user_vector, goods_vector], axis=1)
        x = self.dropout(x)  # 添加 Dropout 层
        x = paddle.nn.functional.relu(x)  # 添加ReLU激活函数

        return self.linear(x)

In [82]:
# 定义模型损失函数、优化器和评价指标
model = RecommenderNet(len(user_encode.classes_), len(goods_encode.classes_), 128, 3)
optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.0006)
# optimizer = paddle.optimizer.SGD(parameters=model.parameters(), learning_rate=0.01)
loss_fn = nn.BCEWithLogitsLoss()

模型训练与预测

In [83]:
# 模型训练
Train_Loss = []
best_val_loss = float('inf')
patience = 3
counter = 0

for epoch in range(5):
    train_feat_downsmaple = train_feat_downsmaple.sample(frac=1.0)

    traindataset = SelfDefinedDataset(train_feat_downsmaple.iloc[:-int(-0.2*len(train_feat_downsmaple))])
    train_loader = paddle.io.DataLoader(traindataset, batch_size = 128, shuffle = True)

    validdataset = SelfDefinedDataset(train_feat_downsmaple.iloc[-int(-0.2*len(train_feat_downsmaple)):])
    valid_loader = paddle.io.DataLoader(validdataset, batch_size = 128, shuffle = True)

    model.train()

    for i, data in enumerate(train_loader):
        pred = model(data)
        loss = loss_fn(data[-1].astype(paddle.float32), pred.flatten())
        Train_Loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

    # 模型验证
    Val_acc = []
    model.eval()
    with paddle.no_grad():
        valid_loss = 0
        for i, data in enumerate(valid_loader):
            pred = model(data)
            Val_acc.append(((pred.flatten() > 0).astype(paddle.int64) == data[-1]).sum().item())
            valid_loss += loss_fn(data[-1].astype(paddle.float32), pred.flatten()).item()
        valid_loss /= len(valid_loader)
        # Early Stopping
        if valid_loss < best_val_loss:
            best_val_loss = valid_loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered.")
                break
    print(epoch, np.sum(Val_acc) / len(valid_loader.dataset))

0 0.25498387003303263
1 0.25504665134158827
2 0.2549404060501864
Early stopping triggered.


In [84]:
test_valid_goods = test_valid_goods[test_valid_goods.isin(test_valid_goods)]

test_feat_agg = []
for user in testa_user['user_id'].values:
    for goods in test_valid_goods:
        test_feat_agg.append([user, goods])

In [85]:
test_feat_agg = pd.DataFrame(test_feat_agg)
test_feat_agg.columns = ['user_id', 'goods_id']

In [86]:
test_feat_agg = pd.merge(test_feat_agg, train_agg_feat, on=['user_id', 'goods_id'])

In [87]:
test_feat_agg['user_id'] = user_encode.transform(test_feat_agg['user_id'])
test_feat_agg['goods_id'] = goods_encode.transform(test_feat_agg['goods_id'])

In [88]:
testdataset = SelfDefinedDataset(test_feat_agg)
test_loader = paddle.io.DataLoader(testdataset, batch_size = 128, shuffle = False)

In [89]:
model.eval()

test_pred = []
with paddle.no_grad():
    for i, data in enumerate(test_loader):
        pred = model(data)
        test_pred += list((paddle.nn.functional.sigmoid(pred) > 0.5).numpy().reshape(-1))

In [90]:
test_feat_agg['pred'] = test_pred

In [91]:
test_feat_agg = test_feat_agg[test_feat_agg['pred']]

In [92]:
test_feat_agg['user_id'] = user_encode.inverse_transform(test_feat_agg['user_id'])
test_feat_agg['goods_id'] = goods_encode.inverse_transform(test_feat_agg['goods_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_feat_agg['user_id'] = user_encode.inverse_transform(test_feat_agg['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_feat_agg['goods_id'] = goods_encode.inverse_transform(test_feat_agg['goods_id'])


In [93]:
test_feat_agg[['user_id', 'goods_id']].to_csv('u2i.csv', index=None, header=None)

In [94]:
import csv

# 定义要添加的列名
new_columns = ["user_id", "goods_id"]

# 打开原始CSV文件并读取数据
with open("u2i.csv", "r") as file:
    reader = csv.reader(file)
    data = list(reader)

# 在第一行插入新的列名
data[0] = new_columns

# 将更新后的数据写入新的CSV文件
with open("u2i_updated.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(data)

print("列已成功添加到第一行！")

列已成功添加到第一行！
