In [87]:
import pandas as pd
import numpy as np

In [88]:
# 导入数据
data = pd.read_csv('douyin_dataset.csv',index_col=0)
data.head()

Unnamed: 0,uid,user_city,item_id,author_id,item_city,channel,finish,like,music_id,duration_time,real_time,H,date
3,15692,109.0,691661,18212,213.0,0,0,0,11513.0,10,2019-10-28 21:55:10,21,2019-10-28
5,44071,80.0,1243212,34500,68.0,0,0,0,1274.0,9,2019-10-21 22:27:03,22,2019-10-21
16,10902,202.0,3845855,634066,113.0,0,0,0,762.0,10,2019-10-26 00:38:51,0,2019-10-26
19,25300,21.0,3929579,214923,330.0,0,0,0,2332.0,15,2019-10-25 20:36:25,20,2019-10-25
24,3656,138.0,2572269,182680,80.0,0,0,0,238.0,9,2019-10-21 20:46:29,20,2019-10-21


# 构造特征

## 用户特征

In [89]:
user_df = pd.DataFrame()

### 用户点赞量

In [90]:
# 把行索引提取，转成列表，并存入 user_df 里面
user_df['uid'] = data.groupby("uid")['like'].count().index.tolist()
# 把用户id设置为行索引
user_df.set_index("uid",inplace=True)
# 把用户的点赞数加进来
user_df['用户点赞数'] = data.groupby("uid")['like'].sum()
user_df.value_counts()

用户点赞数
0        54114
1         2756
2          926
3          447
4          223
         ...  
47           1
46           1
45           1
44           1
183          1
Length: 67, dtype: int64

### 用户浏览量

In [91]:
user_df['用户浏览量'] = data.groupby("uid")['like'].count()

### 用户完整观看数

In [92]:
user_df['用户完整观看数'] = data.groupby("uid")['finish'].sum()

### 用户观看的不同作品数

In [93]:
# 对整体数据进行分组，然后对指定列进行计算
user_df['用户观看作品数'] = data.groupby("uid")['item_id'].nunique()

### 用户观看的作者数 

In [94]:
user_df['用户观看作者数'] = data.groupby("uid")['author_id'].nunique()

### 用户观看作品的平均时长

In [95]:
user_df['用户观看作品的平均时长'] = data.groupby("uid")['duration_time'].mean()

### 用户观看配乐数

In [96]:
user_df['用户观看配乐数'] = data.groupby("uid")['music_id'].nunique()

### 用户去过的城市数

In [97]:
user_df['用户去过的城市'] = data.groupby("uid")['user_city'].nunique()

### 观看作品城市数

In [98]:
user_df['用户观看作品城市数'] = data.groupby("uid")['item_city'].nunique()

## 分析作者特征

In [99]:
author_df = pd.DataFrame()

In [100]:
# 提取行索引
author_df['author_id'] = data.groupby("author_id")['like'].count().index.tolist()
# 设置行索引
author_df.set_index("author_id",inplace=True)

### 作者创作活跃日

In [101]:
data['date'] = pd.to_datetime(data['date'])
def func(x):
    return (x.max() - x.min()).days+1
author_df['作者创作活跃日'] = data.groupby("author_id").agg({'date':func})


### 作者发布作品的日数

In [102]:
author_df['作者发布作品日数'] = data.groupby("author_id")['real_time'].nunique()

### 作者去过的城市数

In [103]:
author_df['作者去过的城市数'] = data.groupby("author_id")['user_city'].nunique()

### 作者使用配乐数量

In [104]:
author_df['作者使用配乐数量'] = data.groupby("author_id")['music_id'].nunique()

### 作者总浏览量

In [105]:
author_df['作者总浏览量'] = data.groupby("author_id")['like'].count()

### 作者总点赞数

In [106]:
author_df['作者总点赞数'] = data.groupby("author_id")['like'].sum()

### 作者总观看数

In [107]:
author_df['作者总观看数'] = data.groupby("author_id")['finish'].sum()

### 作者总作品数

In [108]:
author_df['作者总作品数'] = data.groupby("author_id")['item_id'].nunique()

### 作者作品平均时长

In [109]:
author_df['作者作品平均时长'] = data.groupby("author_id")['duration_time'].mean()

## 分析作品特征

In [110]:
item_df = pd.DataFrame()
item_df['item_df'] = data.groupby("item_id")['like'].count().index.tolist()
item_df.set_index("item_df",inplace=True)

### 作品点赞量

In [111]:
item_df['作品点赞数'] = data.groupby("item_id")['like'].sum()

### 作品浏览数

In [112]:
item_df['作品浏览数'] = data.groupby("item_id")['like'].count()

## 拼接数据

In [113]:
# 合并数据
data1 = pd.merge(data,user_df,left_on='uid',right_index=True,how='left')
data2 = pd.merge(data1,author_df,left_on='author_id',right_index=True,how='left')
data3 = pd.merge(data2,item_df,left_on='item_id',right_index=True,how='left')

In [114]:
len(data3.columns)

33

## 构建模型

In [115]:
data3['like'].value_counts()

0    1720539
1      16773
Name: like, dtype: int64

### 样本均衡

In [116]:
n = 10000
data3_T = data3[data3['like']==1].sample(n)
data3_F = data3[data3['like']==0].sample(n)
data3_clear = pd.concat([data3_T,data3_F])
data3_clear.shape

(20000, 33)

### 特征提取

In [117]:
Y = data3_clear['like']
del data3_clear['like'],data3_clear['date'],data3_clear['finish'],data3_clear['channel'],data3_clear['real_time']
X = data3_clear

### 检查特征空值

In [120]:
for i in X:
    if sum(X[i].isnull()) != 0:
        print(f'{i}有空数据')
else:
    print('所有数据均没有空值')

所有数据均没有空值


### 标准化

In [121]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
X = scaler.fit_transform(X)

### 数据分割

In [123]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,)

### 建模预测

In [124]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train,Y_train)
pre = model.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(Y_test,pre)

0.97