In [75]:
import numpy as np
import pandas as pd
pd.set_option('max_rows', 100)
pd.set_option('max_columns', 100)
import matplotlib.pyplot as plt
from tqdm import tqdm

In [76]:
feed = pd.read_csv('feed_info.csv')
#feed_em = pd.read_csv('feed_embeddings.csv')
#submit = pd.read_csv('submit_demo_初赛a.csv')
test = pd.read_csv('test_a.csv')
action = pd.read_csv('user_action.csv')

# 构造特征与特征预处理
要用的已有特征: user_id, item_id, author_id, item_duration, device

构造统计特征: 统计用户和物品的评论数, 点赞数, 点击头像数, 转发数, 评论数, 关注数, 收藏数. {user, item}_{read, avatar, favor, forward, comment, follow, like}

## 重建id特征
把0空出来

In [77]:
user_id = action[['userid']].drop_duplicates()
user_id['user_id'] = np.arange(user_id.shape[0]) + 1
item_id = feed[['feedid']].drop_duplicates()
item_id['item_id'] = np.arange(item_id.shape[0]) + 1
feed = feed.merge(item_id, on='feedid')
action = action.merge(item_id, on='feedid')
action = action.merge(user_id, on='userid')
test = test.merge(item_id, on='feedid')
test = test.merge(user_id, on='userid')

## 构造item特征
- item_id, author_id, item_duration, 统计特征
- bgm_song_id和bgm_singer_id补全缺失

In [78]:
item = feed[['item_id', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']]

In [79]:
# 这两个类别特征先加1, 然后缺失为类别0
item['bgm_song_id'] = item['bgm_song_id'] + 1
item['bgm_singer_id'] = item['bgm_singer_id'] + 1
item['bgm_song_id'].fillna(0, inplace=True)
item['bgm_singer_id'].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [80]:
statis_feat = ['read_comment', 'comment', 'like', 'click_avatar', 'forward', 'follow', 'favorite']

In [81]:
item_statis = action.groupby('item_id').agg({'read_comment': 'sum', 'comment': 'sum', 'like': 'sum', 'click_avatar': 'sum', 'forward': 'sum', 'follow': 'sum', 'favorite': 'sum'}).reset_index()

In [82]:
item = pd.merge(item, item_statis, on='item_id', how='left')
item.fillna(0, inplace=True)  # 没有交互的视频统计特征均为0
item.isnull().any()

item_id             False
authorid            False
videoplayseconds    False
bgm_song_id         False
bgm_singer_id       False
read_comment        False
comment             False
like                False
click_avatar        False
forward             False
follow              False
favorite            False
dtype: bool

In [83]:
item.rename(columns={'read_comment': 'item_read', 'comment': 'item_comment', 'like': 'item_like', 'click_avatar': 'item_avatar', 'forward': 'item_forward', 'follow': 'item_follow', 'favorite': 'item_favor', 'bgm_song_id': 'item_song', 'bgm_singer_id': 'item_singer', 'authorid': 'author_id'}, inplace=True)

## 构造user特征
- user_id, 统计特征

In [84]:
user = action.groupby('user_id').agg({'read_comment': 'sum', 'comment': 'sum', 'like': 'sum', 'click_avatar': 'sum', 'forward': 'sum', 'follow': 'sum', 'favorite': 'sum'}).reset_index()
user.isnull().any()

user_id         False
read_comment    False
comment         False
like            False
click_avatar    False
forward         False
follow          False
favorite        False
dtype: bool

In [85]:
user.rename(columns={'read_comment': 'user_read', 'comment': 'user_comment', 'like': 'user_like', 'click_avatar': 'user_avatar', 'forward': 'user_forward', 'follow': 'user_follow', 'favorite': 'user_favor'}, inplace=True)

# 数据划分
将数据划分为训练集和验证集

把1-13天的数据作为训练集, 把第14天的数据作为验证集

In [86]:
# 划分数据集
#train = action.loc[action['date_']<14]
#validation = action.loc[action['date_']==14]
# 修改列名
#train = train[['user_id', 'item_id', 'device', 'read_comment', 'like', 'click_avatar', 'forward']]
#validation = validation[['user_id', 'item_id', 'device', 'read_comment', 'like', 'click_avatar', 'forward']]

In [87]:
#train.shape, validation.shape

# 数据存储

In [88]:
# 合并特征
#train = pd.merge(train, user, on='user_id', how='left')
#train = pd.merge(train, item, on='item_id', how='left')
#validation = pd.merge(validation, user, on='user_id', how='left')
#validation = pd.merge(validation, item, on='item_id', how='left')

In [89]:
# 修改列名
action = action[['user_id', 'date_', 'item_id', 'device', 'read_comment', 'like', 'click_avatar', 'forward']]
# 合并特征
action = pd.merge(action, user, on='user_id', how='left')
action = pd.merge(action, item, on='item_id', how='left')

In [90]:
test = test[['user_id', 'item_id', 'device']]
test = pd.merge(test, user, on='user_id', how='left')
test = pd.merge(test, item, on='item_id', how='left')

In [91]:
import pickle
with open('train.pkl', 'wb') as f:
    pickle.dump(action, f)
with open('test.pkl', 'wb') as f:
    pickle.dump(test, f)

In [109]:
with open('transform_id.pkl', 'wb') as f:
    pickle.dump((user_id, item_id), f)

In [92]:
user['user_id'].max(), user['user_id'].min(), user.shape

(20000, 1, (20000, 8))

In [93]:
item['item_id'].max(), item['item_id'].min(), item.shape

(106444, 1, (106444, 12))

In [94]:
action.head(10)

Unnamed: 0,user_id,date_,item_id,device,read_comment,like,click_avatar,forward,user_read,user_comment,user_like,user_avatar,user_forward,user_follow,user_favor,author_id,videoplayseconds,item_song,item_singer,item_read,item_comment,item_like,item_avatar,item_forward,item_follow,item_favor
0,1,1,31464,1,0,1,0,0,0,0,38,0,0,0,0,1528,11,13746.0,3557.0,29.0,0.0,25.0,2.0,1.0,0.0,0.0
1,1,1,35896,1,0,0,0,0,0,0,38,0,0,0,0,1442,16,0.0,0.0,111.0,0.0,59.0,5.0,2.0,1.0,1.0
2,1,1,38881,1,0,0,0,0,0,0,38,0,0,0,0,8648,31,0.0,0.0,18.0,1.0,37.0,13.0,5.0,1.0,1.0
3,1,1,7666,1,0,1,0,0,0,0,38,0,0,0,0,11976,6,13097.0,5013.0,8.0,0.0,34.0,1.0,2.0,0.0,1.0
4,1,1,20649,1,0,0,0,0,0,0,38,0,0,0,0,4370,12,22216.0,7900.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
5,1,1,30092,1,0,0,0,0,0,0,38,0,0,0,0,5607,16,9884.0,5419.0,9.0,0.0,15.0,1.0,0.0,0.0,1.0
6,1,1,28348,1,0,0,0,0,0,0,38,0,0,0,0,13941,19,21826.0,160.0,10.0,0.0,4.0,0.0,2.0,0.0,0.0
7,1,1,30493,1,0,0,0,0,0,0,38,0,0,0,0,12034,8,14619.0,11953.0,19.0,1.0,26.0,3.0,0.0,0.0,0.0
8,1,1,40152,1,0,0,0,0,0,0,38,0,0,0,0,12180,7,14572.0,16921.0,27.0,0.0,53.0,1.0,0.0,0.0,0.0
9,1,1,26966,1,0,0,0,0,0,0,38,0,0,0,0,13350,14,7580.0,15684.0,7.0,0.0,7.0,0.0,0.0,0.0,0.0


In [95]:
action[['user_id', 'item_id', 'item_song', 'item_singer', 'author_id']].describe()

Unnamed: 0,user_id,item_id,item_song,item_singer,author_id
count,7317882.0,7317882.0,7317882.0,7317882.0,7317882.0
mean,9991.04,53046.52,6180.49,4279.744,9525.217
std,5760.513,27621.36,8064.195,5544.116,5388.386
min,1.0,1.0,0.0,0.0,0.0
25%,5012.0,33983.0,0.0,0.0,4878.0
50%,9977.0,53529.5,0.0,0.0,9759.0
75%,14982.0,75373.0,12336.0,8339.0,14118.75
max,20000.0,105652.0,25159.0,17500.0,18788.0


In [102]:
res = pd.read_csv(r'../submit/0527-2333.csv')
res.drop(columns=['Unnamed: 0'], inplace=True)
res.head(10)

Unnamed: 0,user_id,item_id,read_comment,like,click_avatar,forward
0,1171,105975,5.194001e-08,0.014037,0.254897,0.110831
1,1171,82755,6.031253e-08,0.019095,0.362413,0.132947
2,1171,102732,5.556144e-08,0.008973,0.224509,0.052777
3,1171,98849,3.164365e-08,0.00737,0.238966,0.007775
4,1171,103779,8.085978e-08,0.004795,0.105686,0.00795
5,1171,104472,5.987388e-08,0.015198,0.199796,0.136231
6,1171,98935,6.817029e-08,0.012764,0.269183,0.019867
7,1171,12410,3.647946e-08,0.013571,0.59177,0.064228
8,1171,103404,3.466941e-08,0.010396,0.2939,0.033614
9,1171,92738,4.346478e-08,0.00837,0.13274,0.059726


In [103]:
res = pd.merge(res, item_id, on='item_id', how='left')
res = pd.merge(res, user_id, on='user_id', how='left')
res = res[['userid', 'feedid', 'read_comment', 'like', 'click_avatar', 'forward']]
res.head(10)

Unnamed: 0,userid,feedid,read_comment,like,click_avatar,forward
0,14298,67227,5.194001e-08,0.014037,0.254897,0.110831
1,14298,96268,6.031253e-08,0.019095,0.362413,0.132947
2,14298,52309,5.556144e-08,0.008973,0.224509,0.052777
3,14298,75871,3.164365e-08,0.00737,0.238966,0.007775
4,14298,49432,8.085978e-08,0.004795,0.105686,0.00795
5,14298,84382,5.987388e-08,0.015198,0.199796,0.136231
6,14298,57114,6.817029e-08,0.012764,0.269183,0.019867
7,14298,82425,3.647946e-08,0.013571,0.59177,0.064228
8,14298,87099,3.466941e-08,0.010396,0.2939,0.033614
9,14298,17686,4.346478e-08,0.00837,0.13274,0.059726


In [104]:
res_id = list(zip(res['userid'], res['feedid']))

In [105]:
demo = pd.read_csv(r'../submit/res.csv')
demo_id = list(zip(demo['userid'], demo['feedid']))

In [106]:
len(set(res_id)), len(set(demo_id))

(421985, 421985)

In [107]:
len(set(res_id) - set(demo_id))

0

In [108]:
res.to_csv(r'../submit/0527-2333.csv', index=False)