In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Adjust pandas to display comma as thousands separator
pd.options.display.float_format = '{:,}'.format

# item_daily_features.csv

Key fields and potential use:
1. video_id
2. author_id
3. video_type
    - Distinguishes between normal videos and ads. You can filter ads out of recommendations or balance them according to the user’s interaction preferences with both types.
4. upload_dt / date
    - Helps track freshness and recency of content, prioritize newer content or trending videos.
5. video_duration
    - Videos with different durations might appeal to different users
6. video_tag_id / video_tag_name
    - content-based filtering (recommending videos with similar tags).
7. show_cnt, play_cnt, complete_play_cnt
    - popularity filtering. Videos with higher completion rates signal higher user satisfaction.
    - Can calculate some new feature e.g. Complete Play Rate
8. valid_play_cnt (/ valid_play_user_num)
    - More refined measures of engagement that discount partial or irrelevant views, useful for calculating true user interest in a video.
    - Can calculate some new feature e.g. Valid Play Rate   
9. like_cnt, comment_cnt, follow_cnt, share_cnt
    - Reflect explicit feedback and engagement, which is valuable for identifying user preferences.
    - Can be used as features in a ranking model
10. play_progress
    - Higher play_progress indicates higher engagement
11. visible_status
    - Ensures that recommendations only include currently visible (public) videos.

In [15]:
# Data is one level up in the folder structure
rootpath = "../KuaiRec 2.0/"

# Load videos and its features
item_features_filepath = rootpath + "data/item_daily_features.csv"
item_daily_features = pd.read_csv(item_features_filepath)

# Load translated captions
item_captions_filepath = rootpath + "data/kuairec_caption_category_translated.csv"
translated_captions = pd.read_csv(item_captions_filepath)

print(f'Total number of items: {len(item_daily_features)}')

Total number of items: 343341


In [16]:
translated_captions.columns

Index(['video_id', 'manual_cover_text', 'caption', 'topic_tag',
       'first_level_category_id', 'first_level_category_name',
       'second_level_category_id', 'second_level_category_name',
       'third_level_category_id', 'third_level_category_name',
       'english_caption', 'english_first_level_category_name',
       'english_second_level_category_name',
       'english_third_level_category_name'],
      dtype='object')

### Data Type conversion

In [17]:
item_daily_features['date'] = pd.to_datetime(item_daily_features['date'], format='%Y%m%d')
item_daily_features['upload_dt'] = pd.to_datetime(item_daily_features['date'], format='%Y-%m-%d')
item_daily_features['video_id'] = item_daily_features['video_id'].astype(str)

translated_captions['video_id'] = translated_captions['video_id'].astype(str)

### Merge in video category from translated captions

In [44]:
# Fill NA categories
translated_captions['english_first_level_category_name'] = translated_captions['english_first_level_category_name'].fillna('None')
translated_captions['english_second_level_category_name'] = translated_captions['english_second_level_category_name'].fillna('None')
translated_captions['english_third_level_category_name'] = translated_captions['english_third_level_category_name'].fillna('None')

# Merge in captions
video_categories = translated_captions[['video_id', 'english_first_level_category_name', 
                                        'english_second_level_category_name', 'english_third_level_category_name'
                                        ]]
video_features = item_daily_features.merge(video_categories, on='video_id', how='left')

### Complete Play Rate

In [45]:
video_features['complete_play_rate'] = video_features['complete_play_cnt'] / video_features['play_cnt']

### Valid Play Rate

In [46]:
video_features['valid_play_rate'] = video_features['valid_play_cnt'] / video_features['play_cnt']

### Like Rate

In [47]:
video_features['like_rate'] = video_features['like_cnt'] / video_features['play_cnt']

### Total Engagement

In [48]:
video_features['total_engagement'] = video_features['like_cnt'] + video_features['comment_cnt'] + video_features['share_cnt']

### Comment Rate

In [49]:
# item_daily_features['comment_rate'] = item_daily_features['comment_cnt'] / item_daily_features['play_cnt']

### Follow rate, Share rate

In [50]:
# item_daily_features_public['follow_rate'] = item_daily_features_public['follow_cnt'] / item_daily_features_public['play_cnt']
# item_daily_features_public['share_rate'] = item_daily_features_public['share_cnt'] / item_daily_features_public['play_cnt']

### Rank topics by engagement

In [58]:
# Get ranking of categories by total engagement
first_level_category_engagement = video_features.groupby('english_first_level_category_name')['total_engagement'].sum().sort_values(ascending=False)
first_level_category_engagement = first_level_category_engagement.reset_index()
first_level_category_engagement = first_level_category_engagement[first_level_category_engagement['english_first_level_category_name'] != 'UNKNOWN'].reset_index()
first_level_category_engagement['first_level_category_rank'] = first_level_category_engagement.index + 1

# Calculate rankings for second level categories
second_level_category_engagement = video_features.groupby('english_second_level_category_name')['total_engagement'].sum().sort_values(ascending=False)
second_level_category_engagement = second_level_category_engagement.reset_index()
second_level_category_engagement = second_level_category_engagement[second_level_category_engagement['english_second_level_category_name'] != 'UNKNOWN'].reset_index()
second_level_category_engagement['second_level_category_rank'] = second_level_category_engagement.index + 1

# Calculate rankings for third level categories
third_level_category_engagement = video_features.groupby('english_third_level_category_name')['total_engagement'].sum().sort_values(ascending=False)
third_level_category_engagement = third_level_category_engagement.reset_index()
third_level_category_engagement = third_level_category_engagement[third_level_category_engagement['english_third_level_category_name'] != 'UNKNOWN'].reset_index()
third_level_category_engagement['third_level_category_rank'] = third_level_category_engagement.index + 1

# Merge rankings back into the main dataframe
video_features_ranked = video_features.merge(first_level_category_engagement[['english_first_level_category_name', 'first_level_category_rank']], 
                                             on='english_first_level_category_name', how='left')
video_features_ranked = video_features_ranked.merge(second_level_category_engagement[['english_second_level_category_name', 'second_level_category_rank']], 
                                                    on='english_second_level_category_name', how='left')
video_features_ranked = video_features_ranked.merge(third_level_category_engagement[['english_third_level_category_name', 'third_level_category_rank']], 
                                                    on='english_third_level_category_name', how='left')

# Fill NA rankings with -1
video_features_ranked['first_level_category_rank'] = video_features_ranked['first_level_category_rank'].fillna(-1)
video_features_ranked['second_level_category_rank'] = video_features_ranked['second_level_category_rank'].fillna(-1)
video_features_ranked['third_level_category_rank'] = video_features_ranked['third_level_category_rank'].fillna(-1)

In [59]:
video_features_ranked['first_level_category_rank'] = video_features_ranked['first_level_category_rank'].astype(int)
video_features_ranked['second_level_category_rank'] = video_features_ranked['second_level_category_rank'].astype(int)
video_features_ranked['third_level_category_rank'] = video_features_ranked['third_level_category_rank'].astype(int)

video_features_ranked[['video_id', 'english_first_level_category_name', 'english_second_level_category_name', 'english_third_level_category_name', 
                       'total_engagement', 'first_level_category_rank', 'second_level_category_rank', 'third_level_category_rank']].head()

Unnamed: 0,video_id,english_first_level_category_name,english_second_level_category_name,english_third_level_category_name,total_engagement,first_level_category_rank,second_level_category_rank,third_level_category_rank
0,0,Beauty index,Snap of good looks,UNKNOWN,586,5,5,-1
1,0,Beauty index,Snap of good looks,UNKNOWN,310,5,5,-1
2,0,Beauty index,Snap of good looks,UNKNOWN,210,5,5,-1
3,0,Beauty index,Snap of good looks,UNKNOWN,303,5,5,-1
4,0,Beauty index,Snap of good looks,UNKNOWN,312,5,5,-1


### Summary Statistics

In [61]:
video_features_ranked[['complete_play_rate', 'valid_play_rate', 'like_rate']].describe()

Unnamed: 0,complete_play_rate,valid_play_rate,like_rate
count,320250.0,320250.0,320254.0
mean,0.4003619374344328,0.4842118435722959,inf
std,0.2196635904372121,0.217097191120119,
min,0.0,0.0,0.0
25%,0.25,0.3529411764705882,0.0
50%,0.409039555539718,0.5045877047777703,0.0129986405352476
75%,0.5490196078431373,0.6365094268903416,0.0323138218106763
max,1.0,1.0,inf


In [64]:
video_features_ranked[['first_level_category_rank', 'second_level_category_rank', 'third_level_category_rank']].describe()

Unnamed: 0,first_level_category_rank,second_level_category_rank,third_level_category_rank
count,343341.0,343341.0,343341.0
mean,11.926720082949604,17.07682158553741,17.51892433469932
std,8.850229896661649,26.844964674366278,40.682704552753286
min,-1.0,-1.0,-1.0
25%,5.0,-1.0,-1.0
50%,10.0,4.0,-1.0
75%,17.0,23.0,13.0
max,40.0,141.0,221.0
