In [None]:
import pandas as pd

# data = pd.read_csv('./weibo_train_data.txt', sep='\t', header=None)

# data.columns = ['uid', 'mid', 'time', 'forward_count', 'comment_count', 'like_count', 'content']

# Column layout of the tab-separated training file.
train_columns = ['uid', 'mid', 'time', 'forward_count', 'comment_count', 'like_count', 'content']

with open('./WeiboData/weibo_train_data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
print(f"文件总行数：{len(lines)}")

# Split each line into at most len(train_columns) fields. Capping the number of
# splits keeps a tab that appears inside the post text as part of `content`;
# without the cap such a row would have more fields than column names and the
# DataFrame construction below would fail.
# strip() also removes a trailing tab, so a row whose content is empty comes out
# one field short and gets a missing value in `content`.
records = [line.strip().split('\t', len(train_columns) - 1) for line in lines]

# Build the DataFrame from the parsed rows (every column is still text here).
data = pd.DataFrame(records, columns=train_columns)

total_rows = data.shape[0]  # number of parsed rows
print("\n总行数：", total_rows)


print(data.head())


文件总行数：1229618

总行数： 1229618
                                uid                               mid  \
0  d38e9bed5d98110dc2489d0d1cac3c2a  7d45833d9865727a88b960b0603c19f6   
1  fa13974743d3fe6ff40d21b872325e9e  8169f1d45051e08ef213bf1106b1225d   
2  da534fe87e7a52777bee5c30573ed5fd  68cd0258c31c2c525f94febea2d9523b   
3  e06a22b7e065e559a1f0bf7841a85c51  00b9f86b4915aedb7db943c54fd19d59   
4  f9828598f9664d4e347ef2048ce17734  c7f6f66044c0c5a3330e2c5371be6824   

                  time forward_count comment_count like_count  \
0  2015-02-23 17:41:29             0             0          0   
1  2015-02-14 12:49:58             0             0          0   
2  2015-03-31 13:58:06             0             0          0   
3  2015-06-11 20:39:57             0             4          3   
4  2015-03-10 18:02:38             0             0          0   

                                             content  
0  丽江旅游(sz002033)#股票##炒股##财经##理财##投资#推荐包赢股，盈利对半分成...  
1  #丁辰灵的红包#挣钱是一种能力，抢红包拼的是技术。我抢到了

In [5]:
# Convert the three interaction counters from text to numbers; values that cannot
# be parsed become NaN because of errors='coerce'.
for count_column in ('forward_count', 'comment_count', 'like_count'):
    data[count_column] = pd.to_numeric(data[count_column], errors='coerce')

In [9]:
# Report the row count and the dtype of every column after the numeric conversion.
total_rows = len(data.index)
print("\n总行数：", total_rows)
print(data.dtypes)


总行数： 1229618
uid              object
mid              object
time             object
forward_count     int64
comment_count     int64
like_count        int64
content          object
dtype: object


In [None]:

# Per-user feature extraction
# 1. number_in_train: number of rows each uid has in `data`
#    (rows keep the value_counts() order, i.e. most frequent uid first)
user_counts = (
    data['uid']
    .value_counts()
    .rename_axis('uid')
    .reset_index(name='number_in_train')
)

# The three interaction counters, grouped once per user and reused below.
count_cols = ['forward_count', 'comment_count', 'like_count']
per_user = data.groupby('uid')[count_cols]

# 2. forward_max, comment_max, like_max: per-user maximum of each counter
max_features = per_user.max().reset_index()
max_features.columns = ['uid', 'forward_max', 'comment_max', 'like_max']

# 3. forward_min, comment_min, like_min: per-user minimum of each counter
min_features = per_user.min().reset_index()
min_features.columns = ['uid', 'forward_min', 'comment_min', 'like_min']

# 4. forward_mean, comment_mean, like_mean: per-user mean of each counter
mean_features = per_user.mean().reset_index()
mean_features.columns = ['uid', 'forward_mean', 'comment_mean', 'like_mean']

# 5. forward_judge, comment_judge, like_judge
def calculate_judge(group):
    """Count, for each counter, the rows of `group` that exceed the group's own mean.

    Args:
        group: DataFrame with 'forward_count', 'comment_count' and 'like_count' columns.

    Returns:
        pd.Series indexed by 'forward_judge', 'comment_judge', 'like_judge', each
        holding the number of rows whose counter is strictly greater than the mean
        of that counter within `group`.
    """
    judged = {}
    for prefix in ('forward', 'comment', 'like'):
        counts = group[f'{prefix}_count']
        judged[f'{prefix}_judge'] = (counts > counts.mean()).sum()
    return pd.Series(judged)

# Number of posts per user whose counter is strictly above that user's own mean.
# Computed with vectorised group operations instead of groupby(...).apply(...):
# apply() calls a Python function once per user, which is slow on a large frame and
# triggers a deprecation warning about grouping columns on recent pandas versions.
# The resulting table (sorted by uid, integer counts) is the same.
judge_cols = ['forward_count', 'comment_count', 'like_count']
user_means = data.groupby('uid')[judge_cols].transform('mean')  # each row gets its user's mean
above_user_mean = data[judge_cols].gt(user_means).astype('int64')  # 1 where strictly above the mean
judge_features = above_user_mean.groupby(data['uid']).sum().reset_index()
judge_features.columns = ['uid', 'forward_judge', 'comment_judge', 'like_judge']

# Merge all per-user tables onto the post-count table (left joins keep its row order).
features = user_counts
for feature_table in (max_features, min_features, mean_features, judge_features):
    features = features.merge(feature_table, on='uid', how='left')

# Save the per-user features.
features.to_csv('./features/train_user_features.csv', index=False)

print("特征提取完成，结果已保存到文件中。")

特征提取完成，结果已保存到文件中。


In [None]:
# Parse the timestamp strings into datetimes.
data['time'] = pd.to_datetime(data['time'], format='%Y-%m-%d %H:%M:%S')
timestamps = data['time'].dt

# Feature extraction
# 1. time_weekday: Monday is 1, Sunday is 7
time_weekday = timestamps.dayofweek + 1

# 2. time_weekend: 1 for Saturday (6) and Sunday (7), otherwise 0
time_weekend = time_weekday.isin((6, 7)).astype('int64')

# 3. time_hour: hour of day shifted to the range 1..24
time_hour = timestamps.hour + 1

# 4. panduan
def judge_period(hour):
    """Map a 1-based hour of day to a period code.

    Returns 1 for hours 1-6 (early morning), 2 for 7-12 (morning),
    3 for 13-18 (afternoon) and 4 for anything else (evening).
    """
    hour_ranges = ((1, 6), (7, 12), (13, 18))
    for period_code, (first_hour, last_hour) in enumerate(hour_ranges, start=1):
        if first_hour <= hour <= last_hour:
            return period_code
    return 4  # evening, and any value outside the ranges above

# 4. panduan: period-of-day code for every post
panduan = time_hour.map(judge_period)

# Collect the time features next to uid/mid (mid is kept so the rows can be
# joined back to the raw data).
time_features = data[['uid', 'mid']].assign(
    time_weekday=time_weekday,
    time_weekend=time_weekend,
    time_hour=time_hour,
    panduan=panduan,
)

# Save the result.
time_features.to_csv('./features/train_time_features.csv', index=False)

print("特征提取完成，结果已保存到文件中。")

特征提取完成，结果已保存到文件中。


In [None]:
import pandas as pd
import re

# Handle missing values: replace missing content with an empty string.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data['content']: that chained in-place call acts on
# a temporary object under pandas Copy-on-Write and leaves `data` unchanged, after
# which the len()/re calls below would fail on the missing values.
data['content'] = data['content'].fillna('')
content = data['content']

# uid/mid come first so the rows can be joined back to the raw data, followed by
# the two length features.
text_features = pd.DataFrame({
    'uid': data['uid'],
    'mid': data['mid'],
    # 1. length_all: total number of characters in the post
    'length_all': content.apply(len),
    # 2. length_chinese: number of characters in the range U+4E00..U+9FFF
    'length_chinese': content.apply(lambda x: len(re.findall(r'[\u4e00-\u9fff]', x))),
})

# 0/1 features, one rule per column; dict order is the output column order.
# NOTE(review): several patterns repeat the same alternative (e.g. '投票|投票');
# they are left as they were so the feature values do not change.
binary_rules = {
    # 3. english: more than half of the characters are ASCII letters
    'english': lambda x: len(re.findall(r'[a-zA-Z]', x)) > len(x) / 2,
    # 4. non_ch: fewer than half of the characters are in U+4E00..U+9FFF
    'non_ch': lambda x: len(re.findall(r'[\u4e00-\u9fff]', x)) < len(x) / 2,
    # 5. sharing: contains a "shared from" / "reposted from" phrase
    'sharing': lambda x: re.search(r'分享自|分享自|转自', x),
    # 6. auto: contains the phrase below AND an '@' or 'http'
    # NOTE(review): '…' is a literal ellipsis character in this pattern, not a
    # wildcard — confirm that is what was intended.
    'auto': lambda x: re.search(r'我…了|我…了|我…了', x) and ('@' in x or 'http' in x),
    # 7. interaction: contains '//' but no 'http://'
    'interaction': lambda x: re.search(r'//', x) and not re.search(r'http://', x),
    # 8. book: contains a 《...》 title
    'book': lambda x: re.search(r'《[^》]*》', x),
    # 9. mention: contains '@'
    'mention': lambda x: '@' in x,
    # 10. vote: contains the word for "vote"
    'vote': lambda x: re.search(r'投票|投票', x),
    # 11. lottery: contains the word for "lottery draw"
    'lottery': lambda x: re.search(r'抽奖|抽奖', x),
    # 12. emoji: contains a character outside U+0000..U+FFFF
    'emoji': lambda x: re.search(r'[^\u0000-\uFFFF]', x),
    # 13. video: contains an http://v.weibo.com or http://t.cn link
    'video': lambda x: re.search(r'http://v\.weibo\.com|http://t\.cn', x),
}
for feature_name, rule in binary_rules.items():
    # rule=rule binds the current rule to this lambda
    text_features[feature_name] = content.apply(lambda x, rule=rule: 1 if rule(x) else 0)

# Save the result.
text_features.to_csv('./features/train_text_features.csv', index=False)

print("特征提取完成，结果已保存到文件中。")

特征提取完成，结果已保存到文件中。


In [15]:
# !pip install jieba

In [None]:
# import jieba
from collections import Counter

# Handle missing values: replace missing content with an empty string.
# Assigned back to the column rather than fillna(..., inplace=True) on
# data['content'], because that chained in-place call does not update `data`
# under pandas Copy-on-Write.
data['content'] = data['content'].fillna('')
content = data['content']

# uid/mid come first so the rows can be joined back to the raw data.
text_features = pd.DataFrame({'uid': data['uid'], 'mid': data['mid']})

# 0/1 features that are a plain regex search; dict order is the output column order.
pattern_flags = {
    # 1. http: contains an http:// or https:// link
    'http': r'http://|https://',
    # 2. stock: contains a stock-market term
    'stock': r'股票|股市|涨停|跌停|证券',
    # 3. app: contains the phrase '我在#'
    'app': r'我在#',
    # 4. title: contains a 【...】 headline
    'title': r'【[^】]*】',
    # 5. ad: contains an advertising / promotion / sponsorship / cooperation term
    'ad': r'广告|推广|赞助|合作',
}
for feature_name, pattern in pattern_flags.items():
    compiled = re.compile(pattern)
    text_features[feature_name] = content.apply(lambda x, rx=compiled: 1 if rx.search(x) else 0)

# 6. keywords: 1 if the post contains any high-frequency token as a substring.
# Tokens are whitespace-separated chunks of the posts (jieba is NOT used here).
# They are counted post by post, which gives the same counts as joining every post
# into one string first but avoids building that huge string.
word_counts = Counter(token for text in content for token in text.split())
high_freq_words = [word for word, count in word_counts.items() if count > 100]  # tokens seen more than 100 times
text_features['keywords'] = content.apply(lambda x: 1 if any(word in x for word in high_freq_words) else 0)

# Save the result.
text_features.to_csv('./features/train_text_features2.csv', index=False)

print("特征提取完成，结果已保存到文件中。")

特征提取完成，结果已保存到文件中。


In [None]:
# Read the prediction file into memory, one string per line.
with open('./WeiboData/weibo_predict_data.txt', 'r', encoding='utf-8') as file:
    lines = list(file)
print(f"文件总行数：{len(lines)}")

# Check for blank lines: collect the 0-based index of every line that is empty
# after stripping whitespace.
empty_lines = [index for index, text in enumerate(lines) if not text.strip()]
print(f"空行的行号：{empty_lines}")

文件总行数：178297
空行的行号：[]


In [18]:
# Column layout of the tab-separated prediction file.
predict_columns = ['uid', 'mid', 'time', 'content']

# Split each line into at most len(predict_columns) fields. Capping the number of
# splits keeps a tab that appears inside the post text as part of `content`;
# without the cap such a row would have more fields than column names and the
# DataFrame construction below would fail.
records = [line.strip().split('\t', len(predict_columns) - 1) for line in lines]

# Build the DataFrame from the parsed rows.
data = pd.DataFrame(records, columns=predict_columns)

total_rows = data.shape[0]  # number of parsed rows
print("\n总行数：", total_rows)



总行数： 178297


In [19]:
# Report how many rows the current frame holds.
total_rows = len(data.index)
print("\n总行数：", total_rows)


总行数： 178297


In [None]:
# Parse the timestamp strings into datetimes.
data['time'] = pd.to_datetime(data['time'], format='%Y-%m-%d %H:%M:%S')
post_time = data['time']

# Feature extraction
# 1. time_weekday: Monday is 1, Sunday is 7
time_weekday = post_time.dt.dayofweek + 1

# 2. time_weekend: 1 for Saturday (6) and Sunday (7), otherwise 0
time_weekend = time_weekday.map(lambda day: int(day in (6, 7)))

# 3. time_hour: hour of day shifted to the range 1..24
time_hour = post_time.dt.hour + 1

# 4. panduan
def judge_period(hour):
    """Return the period-of-day code for a 1-based hour.

    1 = early morning (hours 1-6), 2 = morning (7-12), 3 = afternoon (13-18),
    4 = evening (every other value).
    """
    return (
        1 if 1 <= hour <= 6
        else 2 if 7 <= hour <= 12
        else 3 if 13 <= hour <= 18
        else 4
    )

# 4. panduan: period-of-day code for every post
panduan = time_hour.map(judge_period)

# Assemble the time features side by side with uid/mid (mid is kept so the rows
# can be joined back to the raw data).
time_features = pd.concat(
    [
        data['uid'],
        data['mid'],
        time_weekday.rename('time_weekday'),
        time_weekend.rename('time_weekend'),
        time_hour.rename('time_hour'),
        panduan.rename('panduan'),
    ],
    axis=1,
)

# Save the result.
time_features.to_csv('./features/predict_time_features.csv', index=False)

print("特征提取完成，结果已保存到文件中。")

特征提取完成，结果已保存到文件中。


In [None]:
import pandas as pd
import re

# Handle missing values: replace missing content with an empty string.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data['content']: that chained in-place call acts on
# a temporary object under pandas Copy-on-Write and leaves `data` unchanged, after
# which the len()/re calls below would fail on the missing values.
data['content'] = data['content'].fillna('')
content = data['content']

# uid/mid come first so the rows can be joined back to the raw data, followed by
# the two length features.
text_features = pd.DataFrame({
    'uid': data['uid'],
    'mid': data['mid'],
    # 1. length_all: total number of characters in the post
    'length_all': content.apply(len),
    # 2. length_chinese: number of characters in the range U+4E00..U+9FFF
    'length_chinese': content.apply(lambda x: len(re.findall(r'[\u4e00-\u9fff]', x))),
})

# 0/1 features, one rule per column; dict order is the output column order.
# NOTE(review): several patterns repeat the same alternative (e.g. '投票|投票');
# they are left as they were so the feature values do not change.
binary_rules = {
    # 3. english: more than half of the characters are ASCII letters
    'english': lambda x: len(re.findall(r'[a-zA-Z]', x)) > len(x) / 2,
    # 4. non_ch: fewer than half of the characters are in U+4E00..U+9FFF
    'non_ch': lambda x: len(re.findall(r'[\u4e00-\u9fff]', x)) < len(x) / 2,
    # 5. sharing: contains a "shared from" / "reposted from" phrase
    'sharing': lambda x: re.search(r'分享自|分享自|转自', x),
    # 6. auto: contains the phrase below AND an '@' or 'http'
    # NOTE(review): '…' is a literal ellipsis character in this pattern, not a
    # wildcard — confirm that is what was intended.
    'auto': lambda x: re.search(r'我…了|我…了|我…了', x) and ('@' in x or 'http' in x),
    # 7. interaction: contains '//' but no 'http://'
    'interaction': lambda x: re.search(r'//', x) and not re.search(r'http://', x),
    # 8. book: contains a 《...》 title
    'book': lambda x: re.search(r'《[^》]*》', x),
    # 9. mention: contains '@'
    'mention': lambda x: '@' in x,
    # 10. vote: contains the word for "vote"
    'vote': lambda x: re.search(r'投票|投票', x),
    # 11. lottery: contains the word for "lottery draw"
    'lottery': lambda x: re.search(r'抽奖|抽奖', x),
    # 12. emoji: contains a character outside U+0000..U+FFFF
    'emoji': lambda x: re.search(r'[^\u0000-\uFFFF]', x),
    # 13. video: contains an http://v.weibo.com or http://t.cn link
    'video': lambda x: re.search(r'http://v\.weibo\.com|http://t\.cn', x),
}
for feature_name, rule in binary_rules.items():
    # rule=rule binds the current rule to this lambda
    text_features[feature_name] = content.apply(lambda x, rule=rule: 1 if rule(x) else 0)

# Save the result.
text_features.to_csv('./features/predict_text_features.csv', index=False)

print("特征提取完成，结果已保存到文件中。")

特征提取完成，结果已保存到文件中。


In [None]:
# import jieba
from collections import Counter

# Handle missing values: replace missing content with an empty string.
# Assigned back to the column rather than fillna(..., inplace=True) on
# data['content'], because that chained in-place call does not update `data`
# under pandas Copy-on-Write.
data['content'] = data['content'].fillna('')
content = data['content']

# uid/mid come first so the rows can be joined back to the raw data.
text_features = pd.DataFrame({'uid': data['uid'], 'mid': data['mid']})

# 0/1 features that are a plain regex search; dict order is the output column order.
pattern_flags = {
    # 1. http: contains an http:// or https:// link
    'http': r'http://|https://',
    # 2. stock: contains a stock-market term
    'stock': r'股票|股市|涨停|跌停|证券',
    # 3. app: contains the phrase '我在#'
    'app': r'我在#',
    # 4. title: contains a 【...】 headline
    'title': r'【[^】]*】',
    # 5. ad: contains an advertising / promotion / sponsorship / cooperation term
    'ad': r'广告|推广|赞助|合作',
}
for feature_name, pattern in pattern_flags.items():
    compiled = re.compile(pattern)
    text_features[feature_name] = content.apply(lambda x, rx=compiled: 1 if rx.search(x) else 0)

# 6. keywords: 1 if the post contains any high-frequency token as a substring.
# Tokens are whitespace-separated chunks of the posts (jieba is NOT used here).
# They are counted post by post, which gives the same counts as joining every post
# into one string first but avoids building that huge string.
# NOTE(review): the vocabulary is rebuilt from whatever `data` holds when this cell
# runs, with the same >100 threshold as the training cell; if `data` is the
# prediction set here, this column is based on a different word list than the
# training feature — confirm that is intended.
word_counts = Counter(token for text in content for token in text.split())
high_freq_words = [word for word, count in word_counts.items() if count > 100]  # tokens seen more than 100 times
text_features['keywords'] = content.apply(lambda x: 1 if any(word in x for word in high_freq_words) else 0)

# Save the result.
text_features.to_csv('./features/predict_text_features2.csv', index=False)

print("特征提取完成，结果已保存到文件中。")

特征提取完成，结果已保存到文件中。
