In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.width', None)        # Width of the display in characters
pd.set_option('display.max_colwidth', None) # Show full content of each 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
joined_train_data = pd.read_csv('data_exports/joined_train_data.csv')

In [3]:
joined_train_data.columns

Index(['user_id', 'video_id', 'time', 'watch_ratio', 'user_active_degree',
       'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
       'author_id', 'video_type', 'video_tag_name', 'video_duration',
       'show_cnt', 'play_cnt', 'play_duration', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'manual_cover_text',
       'caption', 'topic_tag', 'first_level_category_name',
       'second_level_category_name', 'third_level_category_name',
       'english_caption', 'english_first_level_category_name',
       'english_second_level_category_name',
       'english_third_level_category_name', 'english_topic_tag'],
      dtype='object')

In [4]:
joined_train_data.head()

Unnamed: 0,user_id,video_id,time,watch_ratio,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,fans_user_num,friend_user_num,register_days,author_id,video_type,video_tag_name,video_duration,show_cnt,play_cnt,play_duration,like_cnt,comment_cnt,share_cnt,follow_cnt,collect_cnt,manual_cover_text,caption,topic_tag,first_level_category_name,second_level_category_name,third_level_category_name,english_caption,english_first_level_category_name,english_second_level_category_name,english_third_level_category_name,english_topic_tag
0,14,148,2020-07-05 05:27:48.378,0.722103,full_active,0,0,1,73,6,1,279,6332,NORMAL,"女青年, 生活, 男孩",6066.0,4707742,4912021,61121035381,31199,4574,577,2081,7.0,UNKNOWN,美60岁奶奶与21少年一见钟情，兴奋分享初次体验，称：升华了感情！ @推广小助手(O40300047),[],情感,情感关系,亲密关系,"A 60-year-old American grandmother fell in love with a 21-year-old boy at first sight, and excitedly shared her initial experience, saying: ""It has elevated our relationship!""",Emotion,Emotional relationships,Intimate relationship,[]
1,14,183,2020-07-05 05:28:00.057,1.907377,full_active,0,0,1,73,6,1,279,7626,NORMAL,"资讯, 饮料",6100.0,8921102,9171385,122510986250,210670,5684,1496,5427,10.0,UNKNOWN,合肥高铁南站，他带了瓶开过封的茅台被拦下，酒值4000元他舍不得扔，就一饮而尽。,[],民生资讯,社会事件,UNKNOWN,"At Hefei High-Speed Rail South Station, he was stopped when he tried to bring a half-open bottle of Maotai, which was worth 4000 yuan. He couldn't bear to throw it away and drank it all in one go.",Public information on livelihood issues,Social events,UNKNOWN,[]
2,14,3649,2020-07-05 05:29:09.479,2.063311,full_active,0,0,1,73,6,1,279,7136,NORMAL,"搞笑, 电梯",10866.0,13817219,14272327,244388991792,279456,5900,8383,17386,7.0,UNKNOWN,美女裤兜上插“菜刀”吓坏旁人 哦豁，拔出来才发现是手机壳,[],民生资讯,社会事件,社会新闻,"A beautiful lady had a ""knife"" stuck in her pants pocket, scaring people nearby; oh well, it turned out to be just a phone case.",Public information on livelihood issues,Social events,Social news,[]
3,14,5262,2020-07-05 05:30:43.285,0.566388,full_active,0,0,1,73,6,1,279,1854,NORMAL,"亲子, 新生儿",7907.0,23882869,23198459,265532315976,1363469,29188,8583,112330,576.0,生了个洋娃娃,生了个洋娃娃，婆婆非要发出来给大家看看，看看有没有人点赞，一分钟催我看一下，一分钟催我看一下😂,[],时尚,穿搭,UNKNOWN,"Had a doll baby, grandma insisted on showing it to everyone to see if anyone would like it. She kept urging me to take a look in one minute, and then urged me again in another minute 😂",Fashion,Dressing up,UNKNOWN,[]
4,14,8234,2020-07-05 05:35:43.459,0.418364,full_active,0,0,1,73,6,1,279,395,NORMAL,"娱乐, 舞台",11000.0,6336771,6038233,85542717277,87997,4328,181,4801,3.0,小时候火遍大江南北的小和尚 郝劭文,#郝劭文 小时候火遍全国，跟释小龙一起成为一代人最喜爱的童星，长大后却没能大火，你还记得那个可爱的小和尚吗？,[郝劭文],明星娱乐,娱乐八卦,饭制,"# Hao Shaowen was widely popular across China in his childhood, alongside Shi Xiaolong, becoming one of the most beloved child stars of a generation. Yet, as he grew up, he didn't achieve the same level of fame. Do you still remember that cute little monk?",Stars' Entertainment,Entertainment gossip,Food preparation,[Gao Shaowen]


# User Features

In [5]:
# is_new_user: 1 if register_days <= 30, else 0
joined_train_data['is_new_user'] = joined_train_data['register_days'].apply(lambda x: 1 if x <= 30 else 0)

# Total Connections
joined_train_data['total_connections'] = joined_train_data['follow_user_num'] + joined_train_data['fans_user_num'] + joined_train_data['friend_user_num']

# is_content_creator: Logical OR operation of 'is_live_streamer' and 'is_video_author'
joined_train_data['is_content_creator'] = joined_train_data[['is_live_streamer', 'is_video_author']].max(axis=1)

# Video Features

In [6]:
# # Function to generate embeddings
# def generate_embeddings(column):
#     vectorizer = TfidfVectorizer()
#     embeddings = vectorizer.fit_transform(joined_train_data[column].fillna(''))
#     # Convert embeddings to DataFrame
#     embeddings_df = pd.DataFrame(embeddings.toarray(), columns=[f"{column}_embedding_{i}" for i in range(embeddings.shape[1])])
#     return embeddings_df

# # Caption Embedding
# caption_embeddings = generate_embeddings('caption')
# joined_train_data = pd.concat([joined_train_data.reset_index(drop=True), caption_embeddings], axis=1)

# # Topic Tag Embedding
# topic_embeddings = generate_embeddings('topic_tag')
# joined_train_data = pd.concat([joined_train_data.reset_index(drop=True), topic_embeddings], axis=1)

# # First Level Category Name Embedding
# first_level_embeddings = generate_embeddings('first_level_category_name')
# joined_train_data = pd.concat([joined_train_data.reset_index(drop=True), first_level_embeddings], axis=1)

# # Second Level Category Name Embedding
# second_level_embeddings = generate_embeddings('second_level_category_name')
# joined_train_data = pd.concat([joined_train_data.reset_index(drop=True), second_level_embeddings], axis=1)

# # Third Level Category Name Embedding
# third_level_embeddings = generate_embeddings('third_level_category_name')
# joined_train_data = pd.concat([joined_train_data.reset_index(drop=True), third_level_embeddings], axis=1)

# User-Video Interaction Features

In [7]:
# Ensure 'time' column is in datetime format
joined_train_data['time'] = pd.to_datetime(joined_train_data['time'])

# Extract hour and day of the week
joined_train_data['hour'] = joined_train_data['time'].dt.hour
joined_train_data['day_of_week'] = joined_train_data['time'].dt.dayofweek

# Watch Frequency: Average number of videos watched per day per user
user_activity = joined_train_data.groupby('user_id').agg({
    'time': [np.min, np.max, 'count']
})
user_activity.columns = ['first_interaction', 'last_interaction', 'total_interactions']
user_activity['days_active'] = (user_activity['last_interaction'] - user_activity['first_interaction']).dt.days + 1
user_activity['watch_frequency'] = user_activity['total_interactions'] / user_activity['days_active']
joined_train_data = joined_train_data.merge(user_activity['watch_frequency'], on='user_id', how='left')

# is_weekend: Proportion of interactions occurring on weekends per user
joined_train_data['is_weekend_interaction'] = joined_train_data['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
weekend_proportion = joined_train_data.groupby('user_id')['is_weekend_interaction'].mean().reset_index(name='is_weekend')
joined_train_data = joined_train_data.merge(weekend_proportion, on='user_id', how='left')

# Count of views in different time periods
def get_time_period(hour):
    if 0 <= hour <= 5:
        return 'midnight'
    elif 6 <= hour <= 11:
        return 'morning'
    elif 12 <= hour <= 17:
        return 'afternoon'
    else:
        return 'evening'

joined_train_data['time_period'] = joined_train_data['hour'].apply(get_time_period)
time_period_counts = joined_train_data.pivot_table(index='user_id', columns='time_period', values='video_id', aggfunc='count', fill_value=0).reset_index()
joined_train_data = joined_train_data.merge(time_period_counts, on='user_id', how='left')

# Rename time period columns
joined_train_data.rename(columns={
    'afternoon': 'count_afternoon_views',
    'evening': 'count_evening_views',
    'midnight': 'count_midnight_views',
    'morning': 'count_morning_views'
}, inplace=True)

# Average Daily Watch Time: Average time spent watching videos per day
total_play_duration = joined_train_data.groupby('user_id')['play_duration'].sum().reset_index()
total_play_duration = total_play_duration.merge(user_activity['days_active'], on='user_id')
total_play_duration['avg_daily_watch_time'] = total_play_duration['play_duration'] / total_play_duration['days_active']
joined_train_data = joined_train_data.merge(total_play_duration[['user_id', 'avg_daily_watch_time']], on='user_id', how='left')

# Top 3 Categories: Top 3 video categories that users have interacted with
def top_categories(group):
    return group['first_level_category_name'].value_counts().nlargest(3).index.tolist()

user_top_categories = joined_train_data.groupby('user_id').apply(top_categories).reset_index()
user_top_categories.columns = ['user_id', 'top_3_categories']
joined_train_data = joined_train_data.merge(user_top_categories, on='user_id', how='left')

  user_activity = joined_train_data.groupby('user_id').agg({
  user_activity = joined_train_data.groupby('user_id').agg({
  user_top_categories = joined_train_data.groupby('user_id').apply(top_categories).reset_index()


In [8]:
import os

In [9]:
joined_train_data.to_csv('data_exports/joined_train_data_FE.csv', index=False)