若不禁用 quoting，pandas 只会读取到 1,225,088 行，其中一部分数据行会被双引号打乱。

因此，我们认为 1,229,618 才是正确的数据总行数。

In [None]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt

# Column layouts of the two raw files. They are passed via `names=`, so pandas
# treats the first line of each file as data rather than as a header.
col_names_train = [
    'uid', 'mid', 'time',
    'forward_count', 'comment_count', 'like_count',
    'content',
]
col_names_predict = ['uid', 'mid', 'time', 'content']
# The three interaction counts that are only present in the training file.
output_columns = ['like_count', 'forward_count', 'comment_count']

# Quote handling is switched off for both files so that double quotes inside
# the post text are read as ordinary characters.
_no_quoting = dict(quotechar=None, quoting=csv.QUOTE_NONE)

train_raw_df = pd.read_table(
    './data/raw/weibo_train_data.txt',
    names=col_names_train,
    **_no_quoting,
)
test_raw_df = pd.read_table(
    './data/raw/weibo_predict_data.txt',
    names=col_names_predict,
    **_no_quoting,
)

# NOTE: a log(x + 1) rescaling of the count columns used to be applied here;
# it is currently disabled, so the counts stay on their raw scale.
train_raw_df.info()

In [None]:
# Number of distinct user ids and distinct post ids in the training data.
# dropna=False keeps the count equal to len(Series.unique()).
n_unique_users = train_raw_df['uid'].nunique(dropna=False)
n_unique_posts = train_raw_df['mid'].nunique(dropna=False)
print('num of unique users:', n_unique_users)
print('num of posts:', n_unique_posts)

In [None]:
df = train_raw_df


def sum_interaction(row):
    """Return forward_count + comment_count + like_count.

    `row` may be a single row or a whole DataFrame: only label lookup and `+`
    are used, so on a DataFrame the result is the per-row total as a Series.
    """
    return (row['forward_count'] + row['comment_count'] + row['like_count'])


# Compute the totals on whole columns. df.apply(sum_interaction, axis=1) gives
# the same mask but calls the Python function once for every row.
non_zero_posts = df[sum_interaction(df) != 0]
# Share of posts that received at least one forward, comment or like.
len(non_zero_posts) / len(df)

统计文本长度分布

In [None]:
# Length in characters of every post's text.
# NOTE(review): astype(str) renders a missing value as the literal 'nan', so
# rows without content would be counted with length 3 — confirm this is intended.
text_length = train_raw_df['content'].astype(str).apply(len).to_numpy()
print(np.mean(text_length), np.std(text_length))

fig = plt.figure(figsize=(12,5))
# One unit-wide bin per integer length; align='left' centres each bar on its
# own length value. The edges must run to max + 1 (hence range(..., max + 2)):
# the last histogram bin is closed on both sides, so with edges ending at max
# the lengths max - 1 and max would be merged into a single bar.
plt.hist(text_length, bins=range(0, text_length.max() + 2), align='left', rwidth=0.8)
plt.xlabel('Text Length by Characters')
plt.ylabel('Num of samples')
plt.title('Text Length Distribution')
plt.show()

In [None]:
# Frequency tables: index = an observed count value, value = number of posts
# that have exactly that many likes / forwards / comments.
like_count = train_raw_df['like_count'].value_counts()
forward_count = train_raw_df['forward_count'].value_counts()
comment_count = train_raw_df['comment_count'].value_counts()

fig, axs = plt.subplots(1, 3, figsize=(16, 4))

# One log-log scatter panel per interaction type.
panels = [
    (like_count, 'Number of Likes Distribution'),
    (forward_count, 'Number of Forwards Distribution'),
    (comment_count, 'Number of Comments Distribution'),
]
for ax, (freq, title) in zip(axs, panels):
    ax.scatter(freq.index, freq.values, alpha=0.5)
    ax.set_title(title)
    ax.set_xscale('log')
    ax.set_yscale('log')

plt.tight_layout()
plt.show()

In [None]:

# Hour of the day taken from each post's timestamp.
hours = pd.to_datetime(train_raw_df['time']).dt.hour
# Number of posts per hour, computed once and reused for both bar arguments.
posts_per_hour = hours.value_counts()

plt.figure(figsize=(10, 6))
plt.bar(posts_per_hour.index, posts_per_hour.values)

# Axis labels and title
plt.title('Distribution of Time')
plt.xlabel('Hour of the Day')
plt.ylabel('Frequency')

# Show a tick for every hour 0..23.
plt.xticks(range(0, 24))
plt.show()

In [None]:
from collections import Counter
from tqdm import tqdm
all_users = train_raw_df['uid'].unique()

# Total likes / forwards / comments each user received over all of their posts.
# A single grouped sum replaces the row-by-row iterrows() accumulation; the
# counts are cast to int first, as the per-row int(...) conversion did.
# sort=False keeps users in order of first appearance, like `all_users`.
_count_columns = ['like_count', 'forward_count', 'comment_count']
_user_totals = (
    train_raw_df[_count_columns]
    .astype(int)
    .groupby(train_raw_df['uid'], sort=False)
    .sum()
)
user_total_likes = _user_totals['like_count'].to_dict()
user_total_forwards = _user_totals['forward_count'].to_dict()
user_total_comments = _user_totals['comment_count'].to_dict()

# For each total value, how many users have exactly that total.
user_likes_counter = Counter(user_total_likes.values())
user_forwards_counter = Counter(user_total_forwards.values())
user_comments_counter = Counter(user_total_comments.values())

# Only the subplots figure is created; an extra plt.figure() call before it
# would leave an empty figure behind.
fig, axs = plt.subplots(1, 3, figsize=(16, 4))

# One log-log scatter panel per interaction type.
_panels = [
    (user_likes_counter, 'Number of Likes Distribution (by user)'),
    (user_forwards_counter, 'Number of Forwards Distribution (by user)'),
    (user_comments_counter, 'Number of Comments Distribution (by user)'),
]
for _ax, (_counter, _title) in zip(axs, _panels):
    _ax.scatter(list(_counter.keys()), list(_counter.values()), alpha=0.5)
    _ax.set_title(_title)
    _ax.set_xscale('log')
    _ax.set_yscale('log')

plt.tight_layout()
plt.show()

查看训练集和测试集中 uid 的重叠情况

In [None]:
# Distinct user ids in each split.
users_train = train_raw_df['uid'].unique()
users_test = test_raw_df['uid'].unique()

# Build each set once and reuse it for the sizes, the overlap and the difference.
train_uid_set = set(users_train)
test_uid_set = set(users_test)

print('train:', len(train_uid_set))
print('test:', len(test_uid_set))
# Users that appear in both the test and the training data.
print(len(test_uid_set & train_uid_set))
# Users in the test data that never appear in the training data.
print('测试集中 未在训练集中出现的 uid 数量', len(test_uid_set - train_uid_set))

Time Span

In [None]:
# Earliest and latest timestamp over the whole training set.
all_datetimes = pd.to_datetime(train_raw_df['time'])
earliest, latest = all_datetimes.min(), all_datetimes.max()
print(earliest, latest)

# Same range, restricted to posts strictly after the start of 2015-07-01.
cutoff = pd.to_datetime('2015-07-01 00:00:00')
is_after_cutoff = all_datetimes > cutoff
valid = all_datetimes[is_after_cutoff]

print(valid.min(), valid.max())