In [1]:
import numpy as np
import pandas as pd
# Use the fully-qualified option keys. The short patterns 'max_rows' and
# 'max_columns' are matched as regexes against every registered option and
# become ambiguous (raising OptionError) on pandas versions that also
# register styler.render.max_rows / styler.render.max_columns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Load the three raw tables used below, in the order feed -> test -> action.
# 'feed_embeddings.csv' and 'submit_demo_初赛a.csv' are not loaded here
# (their read_csv calls were commented out).
feed, test, action = (
    pd.read_csv(csv_name)
    for csv_name in ('feed_info.csv', 'test_a.csv', 'user_action.csv')
)

# 构造特征与特征预处理
要用的已有特征: user_id, item_id, author_id, item_duration, device

构造统计特征: 统计用户和物品的查看评论数, 点赞数, 点击头像数, 转发数, 评论数, 关注数, 收藏数. {user, item}_{read, avatar, favor, forward, comment, follow, like}

## 重建id特征
把0空出来

In [3]:
# Build dense surrogate keys that start at 1, so that id 0 stays unused.
# Users are numbered in order of first appearance in `action`,
# feeds in order of first appearance in `feed`.
user_id = action[['userid']].drop_duplicates()
user_id = user_id.assign(user_id=np.arange(1, len(user_id) + 1))

item_id = feed[['feedid']].drop_duplicates()
item_id = item_id.assign(item_id=np.arange(1, len(item_id) + 1))

# Attach the new keys. These are default (inner) joins, so any row whose
# feedid / userid has no match in the lookup tables is dropped.
# NOTE(review): confirm no `test` rows are lost here (users absent from `action`).
feed = feed.merge(item_id, on='feedid')
action = action.merge(item_id, on='feedid').merge(user_id, on='userid')
test = test.merge(item_id, on='feedid').merge(user_id, on='userid')

## 构造item特征
- item_id, author_id, item_duration, 统计特征

In [4]:
# Base item table: surrogate item id plus the author and play-length columns of `feed`.
item = feed.loc[:, ['item_id', 'authorid', 'videoplayseconds']]

In [5]:
# Interaction columns of `action` that the count statistics are built from.
statis_feat = [
    'read_comment',
    'comment',
    'like',
    'click_avatar',
    'forward',
    'follow',
    'favorite',
]

In [6]:
# Per-item totals of every interaction column, one row per item_id.
# The aggregation mapping {column: 'sum'} is derived from `statis_feat`.
# NOTE(review): the totals cover every row of `action`, including the
# date_ == 14 rows that are later used as the validation split.
item_statis = (
    action
    .groupby('item_id')
    .agg(dict.fromkeys(statis_feat, 'sum'))
    .reset_index()
)

In [7]:
# Left-join the totals onto the item table; videos without any interaction
# have no match and get 0 for every statistic.
item = item.merge(item_statis, on='item_id', how='left').fillna(0)
# Sanity check: no column should report remaining nulls.
item.isnull().any()

item_id             False
authorid            False
videoplayseconds    False
read_comment        False
comment             False
like                False
click_avatar        False
forward             False
follow              False
favorite            False
dtype: bool

In [8]:
# Give the statistic columns their item_* feature names.
item = item.rename(columns={
    'read_comment': 'item_read',
    'comment': 'item_comment',
    'like': 'item_like',
    'click_avatar': 'item_avatar',
    'forward': 'item_forward',
    'follow': 'item_follow',
    'favorite': 'item_favor',
})

## 构造user特征
- user_id, 统计特征

In [9]:
# Per-user totals of every interaction column, one row per user_id.
# NOTE(review): the totals cover every row of `action`, including the
# date_ == 14 rows that are later used as the validation split.
user = (
    action
    .groupby('user_id')
    .agg(dict.fromkeys(statis_feat, 'sum'))
    .reset_index()
)
# Sanity check: no column should report nulls.
user.isnull().any()

user_id         False
read_comment    False
comment         False
like            False
click_avatar    False
forward         False
follow          False
favorite        False
dtype: bool

In [10]:
# Give the statistic columns their user_* feature names.
user = user.rename(columns={
    'read_comment': 'user_read',
    'comment': 'user_comment',
    'like': 'user_like',
    'click_avatar': 'user_avatar',
    'forward': 'user_forward',
    'follow': 'user_follow',
    'favorite': 'user_favor',
})

# 数据划分
将数据划分为训练集和验证集

把1-13天的数据作为训练集, 把第14天的数据作为验证集

In [11]:
# 划分数据集
train = action.loc[action['date_']<14]
validation = action.loc[action['date_']==14]
# 修改列名
train = train[['user_id', 'item_id', 'device', 'read_comment', 'like', 'click_avatar', 'forward']]
validation = validation[['user_id', 'item_id', 'device', 'read_comment', 'like', 'click_avatar', 'forward']]

In [12]:
# (rows, columns) of the training and validation splits, in that order.
tuple(split.shape for split in (train, validation))

((6708846, 7), (609036, 7))

# 特征合并与数据存储

In [13]:
# 合并特征
train = pd.merge(train, user, on='user_id', how='left')
train = pd.merge(train, item, on='item_id', how='left')
validation = pd.merge(validation, user, on='user_id', how='left')
validation = pd.merge(validation, item, on='item_id', how='left')

In [14]:
# Reduce the test table to its id/device columns, then attach the same
# user-level and item-level features as for the other splits.
test = (
    test[['user_id', 'item_id', 'device']]
    .merge(user, on='user_id', how='left')
    .merge(item, on='item_id', how='left')
)

In [15]:
import pickle

# Persist the three feature tables as pickles in the working directory,
# written in the order train -> test -> validation.
for pkl_name, table in (
    ('train.pkl', train),
    ('test.pkl', test),
    ('validation.pkl', validation),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(table, f)

In [16]:
# Largest id, smallest id and table shape, to check the user ids are dense and start at 1.
user_ids = user['user_id']
(user_ids.max(), user_ids.min(), user.shape)

(20000, 1, (20000, 8))

In [17]:
# Largest id, smallest id and table shape, to check the item ids are dense and start at 1.
item_ids = item['item_id']
(item_ids.max(), item_ids.min(), item.shape)

(106444, 1, (106444, 10))