In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_rows', 100)
pd.set_option('max_columns', 100)
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
from sklearn.decomposition import PCA

In [2]:
feed = pd.read_csv('feed_info.csv')
#feed_em = pd.read_csv('feed_embeddings.csv')
#submit = pd.read_csv('submit_demo_初赛a.csv')
test = pd.read_csv('test_a.csv')
action = pd.read_csv('user_action.csv')

# 构造特征与特征预处理
要用的已有特征: user_id, item_id, author_id, item_duration, device

构造统计特征: 统计用户和物品的评论数, 点赞数, 点击头像数, 转发数, 评论数, 关注数, 收藏数. {user, item}_{read, avatar, favor, forward, comment, follow, like}

## 重建id特征
把0空出来

In [3]:
user_id = action[['userid']].drop_duplicates()
user_id['user_id'] = np.arange(user_id.shape[0]) + 1
item_id = feed[['feedid']].drop_duplicates()
item_id['item_id'] = np.arange(item_id.shape[0]) + 1
feed = feed.merge(item_id, on='feedid')
action = action.merge(item_id, on='feedid')
action = action.merge(user_id, on='userid')
test = test.merge(item_id, on='feedid')
test = test.merge(user_id, on='userid')

## 构造item特征
- item_id, author_id, item_duration, 统计特征
- bgm_song_id和bgm_singer_id补全缺失
- item_ocr和item_id是完全相同的

In [4]:
item = feed[['item_id', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']]

In [5]:
# 这两个类别特征先加1, 然后缺失为类别0
item['bgm_song_id'] = item['bgm_song_id'] + 1
item['bgm_singer_id'] = item['bgm_singer_id'] + 1
item['bgm_song_id'].fillna(0, inplace=True)
item['bgm_singer_id'].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [6]:
statis_feat = ['read_comment', 'comment', 'like', 'click_avatar', 'forward', 'follow', 'favorite']

In [7]:
item_statis = action.loc[action['date_']<14].groupby('item_id').agg({'read_comment': 'sum', 'comment': 'sum', 'like': 'sum', 'click_avatar': 'sum', 'forward': 'sum', 'follow': 'sum', 'favorite': 'sum'}).reset_index()

In [8]:
item = pd.merge(item, item_statis, on='item_id', how='left')
item.fillna(0, inplace=True)  # 没有交互的视频统计特征均为0
item.isnull().any()

item_id             False
authorid            False
videoplayseconds    False
bgm_song_id         False
bgm_singer_id       False
read_comment        False
comment             False
like                False
click_avatar        False
forward             False
follow              False
favorite            False
dtype: bool

In [9]:
item.rename(columns={'read_comment': 'item_read', 'comment': 'item_comment', 'like': 'item_like', 'click_avatar': 'item_avatar', 'forward': 'item_forward', 'follow': 'item_follow', 'favorite': 'item_favor', 'bgm_song_id': 'item_song', 'bgm_singer_id': 'item_singer', 'authorid': 'author_id', 'videoplayseconds': 'item_seconds'}, inplace=True)

In [10]:
item['item_ocr'] = item['item_id']

In [11]:
item['item_seconds'].loc[item['item_seconds']>62].value_counts()

6275     1
29596    1
25931    1
7916     1
8963     1
13931    1
11068    1
25200    1
9649     1
272      1
25172    1
10275    1
59960    1
10137    1
30620    1
Name: item_seconds, dtype: int64

In [12]:
# 由于大于62的不多, 且每一种都只有一个. 因此简单处理把62以上的都当作63
item['item_seconds'].loc[item['item_seconds']>62] = 63

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## 构造user特征
- user_id, 统计特征

In [13]:
user = action.loc[action['date_']<14].groupby('user_id').agg({'read_comment': 'sum', 'comment': 'sum', 'like': 'sum', 'click_avatar': 'sum', 'forward': 'sum', 'follow': 'sum', 'favorite': 'sum'}).reset_index()
user.isnull().any()

user_id         False
read_comment    False
comment         False
like            False
click_avatar    False
forward         False
follow          False
favorite        False
dtype: bool

In [14]:
user.rename(columns={'read_comment': 'user_read', 'comment': 'user_comment', 'like': 'user_like', 'click_avatar': 'user_avatar', 'forward': 'user_forward', 'follow': 'user_follow', 'favorite': 'user_favor'}, inplace=True)

In [15]:
user.describe()

Unnamed: 0,user_id,user_read,user_comment,user_like,user_avatar,user_forward,user_follow,user_favor
count,19916.0,19916.0,19916.0,19916.0,19916.0,19916.0,19916.0,19916.0
mean,9999.824061,11.831944,0.136272,8.747891,2.540068,1.300562,0.241916,0.459882
std,5773.920517,38.502302,1.617724,21.956016,5.37658,4.302081,1.133394,5.08188
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4999.75,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,9999.5,0.0,0.0,3.0,1.0,0.0,0.0,0.0
75%,14999.25,4.0,0.0,8.0,3.0,1.0,0.0,0.0
max,20000.0,587.0,164.0,781.0,229.0,251.0,32.0,492.0


# 数据划分
将数据划分为训练集和验证集

把1-13天的数据作为训练集, 把第14天的数据作为验证集

In [16]:
# 划分数据集
#train = action.loc[action['date_']<14]
#validation = action.loc[action['date_']==14]
# 修改列名
#train = train[['user_id', 'item_id', 'device', 'read_comment', 'like', 'click_avatar', 'forward']]
#validation = validation[['user_id', 'item_id', 'device', 'read_comment', 'like', 'click_avatar', 'forward']]

In [17]:
#train.shape, validation.shape

# 数据存储

In [18]:
# 合并特征
#train = pd.merge(train, user, on='user_id', how='left')
#train = pd.merge(train, item, on='item_id', how='left')
#validation = pd.merge(validation, user, on='user_id', how='left')
#validation = pd.merge(validation, item, on='item_id', how='left')

In [19]:
import math
action['play'] = action['play'].apply(lambda x: math.log(x+2))
action['stay'] = action['stay'].apply(lambda x: math.log(x+2))

In [20]:
# 修改列名
action = action[['user_id', 'date_', 'item_id', 'device', 'read_comment', 'like', 'click_avatar', 'forward', 'stay', 'play']]
# 合并特征
action = pd.merge(action, user, on='user_id', how='left')
action = pd.merge(action, item, on='item_id', how='left')

In [21]:
test = test[['user_id', 'item_id', 'device']]
test = pd.merge(test, user, on='user_id', how='left')
test = pd.merge(test, item, on='item_id', how='left')

In [22]:
with open('train.pkl', 'wb') as f:
    pickle.dump(action, f)
with open('test.pkl', 'wb') as f:
    pickle.dump(test, f)

In [23]:
with open('transform_id.pkl', 'wb') as f:
    pickle.dump((user_id, item_id), f)

In [24]:
user['user_id'].max(), user['user_id'].min(), user.shape

(20000, 1, (19916, 8))

In [23]:
item['item_id'].max(), item['item_id'].min(), item.shape

(106444, 1, (106444, 13))

In [25]:
action.head(10)

Unnamed: 0,user_id,date_,item_id,device,read_comment,like,click_avatar,forward,user_read,user_comment,user_like,user_avatar,user_forward,user_follow,user_favor,author_id,item_seconds,item_song,item_singer,item_read,item_comment,item_like,item_avatar,item_forward,item_follow,item_favor,item_ocr
0,1,1,31464,1,0,1,0,0,0,0,38,0,0,0,0,1528,11,13746.0,3557.0,29.0,0.0,25.0,2.0,1.0,0.0,0.0,31464
1,1,1,35896,1,0,0,0,0,0,0,38,0,0,0,0,1442,16,0.0,0.0,111.0,0.0,59.0,5.0,2.0,1.0,1.0,35896
2,1,1,38881,1,0,0,0,0,0,0,38,0,0,0,0,8648,31,0.0,0.0,18.0,1.0,37.0,13.0,5.0,1.0,1.0,38881
3,1,1,7666,1,0,1,0,0,0,0,38,0,0,0,0,11976,6,13097.0,5013.0,8.0,0.0,34.0,1.0,2.0,0.0,1.0,7666
4,1,1,20649,1,0,0,0,0,0,0,38,0,0,0,0,4370,12,22216.0,7900.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,20649
5,1,1,30092,1,0,0,0,0,0,0,38,0,0,0,0,5607,16,9884.0,5419.0,9.0,0.0,15.0,1.0,0.0,0.0,1.0,30092
6,1,1,28348,1,0,0,0,0,0,0,38,0,0,0,0,13941,19,21826.0,160.0,10.0,0.0,4.0,0.0,2.0,0.0,0.0,28348
7,1,1,30493,1,0,0,0,0,0,0,38,0,0,0,0,12034,8,14619.0,11953.0,19.0,1.0,26.0,3.0,0.0,0.0,0.0,30493
8,1,1,40152,1,0,0,0,0,0,0,38,0,0,0,0,12180,7,14572.0,16921.0,27.0,0.0,53.0,1.0,0.0,0.0,0.0,40152
9,1,1,26966,1,0,0,0,0,0,0,38,0,0,0,0,13350,14,7580.0,15684.0,7.0,0.0,7.0,0.0,0.0,0.0,0.0,26966


## 处理feed_embedding, 将字符串转换为numpy进行存储

In [27]:
feed_em = pd.read_csv('feed_embeddings.csv')
# 处理每一行, 将字符串转换为数字
feed_em['item_ocr'] = feed_em['feed_embedding'].apply(lambda x: np.array(x.split(' ')[0:512]).astype(np.float64).tolist())

In [28]:
pretrain = feed_em[['feedid', 'item_ocr']]
pretrain = pd.merge(pretrain, item_id, on='feedid', how='left')
pretrain = pretrain[['item_id', 'item_ocr']]
pretrain.sort_values(by='item_id', ascending=True, inplace=True)
pretrain.reset_index(drop=True, inplace=True)
pretrain.head(10)

Unnamed: 0,item_id,item_ocr
0,1,"[-0.00509984, -0.0590496, -0.01864357, -0.0142..."
1,2,"[0.02817863, -0.02137377, 0.06678647, 0.014733..."
2,3,"[-0.01880374, 0.03276707, 0.09490133, 0.031316..."
3,4,"[-0.05007412, -0.11540501, 0.04330789, -0.0187..."
4,5,"[0.05636294, 0.01753669, 0.01241871, 0.0152164..."
5,6,"[0.07747091, 0.02647814, 0.0786632, -0.0200964..."
6,7,"[-0.01297182, -0.01348432, 0.05005981, 0.00076..."
7,8,"[-0.0552958, -0.04448033, 0.02940514, 0.081417..."
8,9,"[-0.01446141, -0.00452141, 0.0790388, 0.013322..."
9,10,"[-0.02395611, -0.01667979, 0.09536444, 0.00409..."


In [29]:
pretrain['item_id'].max()

106444

In [30]:
# 先转换为list, 再全部转换为numpy
temp = pretrain['item_ocr'].values.tolist()
temp = np.array(temp)
temp.shape

(106444, 512)

In [31]:
pre = np.insert(temp, 0, np.zeros(512), axis=0)
pre[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [32]:
pre.shape

(106445, 512)

In [33]:
# 进行PCA降维
n_dim = 32
transformer = PCA(n_components=n_dim)
pre = transformer.fit_transform(pre)

In [34]:
with open(r'ocr_embedding_' + str(n_dim) + '.pkl', 'wb') as f:
    pickle.dump(pre, f)