In [4]:
import pandas as pd
from pathlib import Path
import json
import pendulum
import numpy as np

In [6]:
in_dir = "/Users/kay/Papers/论文合作/于孟利/weibo_data_analysis-20210210/weibo_data_json"

In [39]:
def day_night(time_list):
    '''
    发微博的时间序列转化为4个时间段的统计
    :param in_name:
    :param out_name:
    :return:
    '''
    cnt = [0] * 24
    for t in time_list:
        hour = t.hour
        # print(hour)
        # 分为四个时间段
        # cnt[int(int(hour) / 6)] += 1
        # 分为24小时
        cnt[int(hour)] += 1
    return cnt


def weeks(time_list):
    '''
    发微博的时间序列转化为7个时间段的统计
    :param in_name:
    :param out_name:
    :return:
    '''
    cnt = [0] * 7
    for t in time_list:
        weekday = t.weekday()
        cnt[int(weekday)] += 1
    return cnt


def time_interval(time_list):
    interval = []
    for i in np.arange(1, len(time_list)):
        t2 = time_list[i-1]
        t1 = time_list[i]
        _inter = abs((t2 - t1).total_seconds())
        if _inter < 86400 * 7:
            interval.append(_inter / 3600)
    # return interval
    interval = np.array(interval)
    # print(interval)
    return interval


def life_length(time_list):
    return int((max(time_list) - min(time_list)).total_seconds() / 3600 / 24) + 1

In [41]:
# 时间分析

'''
Basic features. Basic features are selected to reﬂect the user’s demographics, preliminary statuses and elementary interactions on social media, including gender, tweeting patterns and privacy settings. 

Speciﬁcally, tweeting patterns contain log(AUW + 1) (where AUW is the age of a user on Weibo in units of days), log(NT + 1) (where NT is the total number of tweets the user posted), log(NT/(AUW + 1)) (the frequency of posting), log(NFER + 1) (where NFER is deﬁned as the number of the user’s followers), log(NFEE + 1) (where NFEE denotes the number of the user’s followees), NT/(NFER + 1), and NT/(NFEE + 1). 

With respect to the privacy settings, corresponding binary features indicate whether a user allows comments from others, whether the user allows private messages sent from others and whether the user allows Weibo to track their real-time location. In addition, we consider the length of self-description as a feature.
'''
users_features = []

for in_name in Path(in_dir).rglob("*.json"):
    # print(in_name)
    d = json.load(open(in_name))
    user = d["user"]
    u = {
        "uid": user["id"],
        "b:NT": np.log(user["weibo_num"] + 1),
        "b:NFEE": np.log(user["following"] + 1),
        "b:NEER": np.log(user["followers"] + 1),
        "b:NT/NFEE": (user["weibo_num"] + 1) / (user["following"] + 1),
        "b:NT/NFER": (user["weibo_num"] + 1) / (user["followers"] + 1),
        "b:NEEE/NFER": (user["following"] + 1) / (user["followers"] + 1),
    }

    weibos = d["weibo"]
    # 所有微博
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos]
    d_features = np.array(day_night(time_list))
    for i, _d in enumerate(d_features):
        u[f"t:h_{i}"] = _d
    u["t:h_max"] = d_features.max()
    u["t:h_argmax"] = d_features.argmax()
    u["t:h_std"] = d_features.std()
    
    w_features = np.array(weeks(time_list))
    for i, _d in enumerate(w_features):
        u[f"t:w_{i}"] = _d
    u["t:h_max"] = w_features.max()
    u["t:h_argmax"] = w_features.argmax()
    u["t:h_std"] = w_features.std()

    interval = time_interval(time_list)
    if len(interval) < 1:
        u["t:interval_mean"] = 0
        u["t:interval_std"] = 0
    else:
        u["t:interval_mean"] = interval.mean()
        u["t:interval_std"] = interval.std()

    u["t:life_length"] = life_length(time_list)
    u["t:ave_d_num"] = user["weibo_num"] / u["t:life_length"]

    # 转发微博
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos if not w["original"]]
    d_features = np.array(day_night(time_list))
    for i, _d in enumerate(d_features):
        u[f"t:h_{i}"] = _d
    u["t:ret_h_max"] = d_features.max()
    u["t:ret_h_argmax"] = d_features.argmax()
    u["t:ret_h_std"] = d_features.std()
    
    w_features = np.array(weeks(time_list))
    for i, _d in enumerate(w_features):
        u[f"t:w_{i}"] = _d
    u["t:ret_h_max"] = w_features.max()
    u["t:ret_h_argmax"] = w_features.argmax()
    u["t:ret_h_std"] = w_features.std()

    interval = time_interval(time_list)
    if len(interval) < 1:
        u["t:ret_interval_mean"] = 0
        u["t:ret_interval_std"] = 0
    else:
        u["t:ret_interval_mean"] = interval.mean()
        u["t:ret_interval_std"] = interval.std()

    users_features.append(u)
    # print(user)

len(users_features)

df = pd.DataFrame(users_features).set_index("uid")
df.to_csv("data/MengLi_users_features.csv", float_format="%.4f")
df


Unnamed: 0_level_0,b:NT,b:NFEE,b:NEER,b:NT/NFEE,b:NT/NFER,b:NEEE/NFER,t:h_0,t:h_1,t:h_2,t:h_3,...,t:w_6,t:interval_mean,t:interval_std,t:life_length,t:ave_d_num,t:ret_h_max,t:ret_h_argmax,t:ret_h_std,t:ret_interval_mean,t:ret_interval_std
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2982254865,6.063785,5.093750,5.407172,2.638037,1.928251,0.730942,9,1,0,0,...,37,42.171218,45.922626,2724,0.157489,37,6,7.284314,37.741751,49.003489
3943065337,9.169727,6.356108,7.021084,16.670139,8.573214,0.514286,1,1,0,0,...,0,1.733333,2.063783,41,234.170732,11,1,3.870348,1.980952,2.287326
3968712237,5.743003,5.860786,5.176150,0.888889,1.762712,1.983051,1,0,0,0,...,8,22.461227,29.749968,683,0.455344,25,0,5.229430,44.960063,42.473738
5013416876,6.495266,5.852202,5.402677,1.902299,2.981982,1.567568,14,5,1,0,...,19,38.791653,35.454522,2334,0.283205,36,1,6.577637,64.989949,47.067363
5230736039,4.976734,5.347108,4.820282,0.690476,1.169355,1.693548,3,0,0,0,...,5,30.069771,36.605034,1078,0.133581,19,2,6.220440,27.297083,47.639542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1278570913,5.897154,5.293305,4.094345,1.829146,6.066667,3.316667,6,8,2,0,...,9,17.561508,30.821601,2817,0.128860,58,4,16.061108,16.154988,29.744955
5236316239,7.216709,5.517453,4.189655,5.469880,20.636364,3.772727,0,1,0,0,...,65,14.999903,34.455127,1738,0.783084,118,2,22.106422,14.536835,34.818574
2678428523,6.075346,4.304065,5.676754,5.878378,1.489726,0.253425,2,4,1,3,...,83,38.036657,45.881736,2635,0.164706,83,6,13.468027,38.672378,47.057762
3078158207,5.669881,5.552960,5.468060,1.124031,1.223629,1.088608,9,2,0,0,...,20,66.797996,51.967102,2803,0.103104,20,5,5.394631,54.015079,51.716291


In [74]:
survey_data = pd.read_excel("/Users/kay/Papers/论文合作/于孟利/456ID_DVs_0610.xlsx")
survey_data["userID_num"].astype("str")
survey_data = survey_data.set_index("userID_num")
survey_data.index.name = "uid"
# survey_data

In [64]:
w_feas = pd.read_csv("data/MengLi_users_features.csv")
w_feas["uid"].astype("str")
w_feas = w_feas.set_index("uid")
cols_feas = w_feas.columns
# w_feas

In [75]:
all_data = survey_data.join(w_feas).copy()

In [63]:
all_data

Unnamed: 0_level_0,num,提交答卷时间,所用时间,nickname,userURL,userID,sex,age,Born,专业,...,t:w_6,t:interval_mean,t:interval_std,t:life_length,t:ave_d_num,t:ret_h_max,t:ret_h_argmax,t:ret_h_std,t:ret_interval_mean,t:ret_interval_std
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5822697591,10,2019/10/20 16:34:17,856秒,满天小xingx,https://weibo.com/u/5822697591,u/5822697591,2,23,1996,4,...,53,8.9384,13.1202,205,22.2829,77,2,9.9550,12.4226,17.1514
5866763968,18,2019/10/20 16:42:25,883秒,不要放蘑菇,https://weibo.com/u/5866763968,u/5866763968,1,20,1999,4,...,1,79.4000,65.5333,737,0.0163,1,5,0.4518,0.0000,0.0000
5311953749,28,2019/10/20 16:49:03,1440秒,彭老师不爱留作业,https://weibo.com/u/5311953749,u/5311953749,1,24,1995,4,...,59,32.7884,38.8448,1880,0.3346,100,2,16.0687,31.7671,37.7409
2366180120,31,2019/10/20 16:51:01,1024秒,cryptobiote,https://weibo.com/u/2366180120,u/2366180120,1,25,1994,1,...,12,48.7366,47.5292,2722,0.0577,20,5,3.6812,50.8385,50.2349
2769435005,32,2019/10/20 16:51:13,1090秒,不是很紧,https://weibo.com/u/2769435005,u/2769435005,1,22,1997,1,...,2,67.0278,45.0131,783,2.3052,3,0,1.0302,18.8000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2139304270,1450,2019/11/15 11:32:33,929秒,_怪兽回收处处长,https://weibo.com/u/2139304270,u/2139304270,1,21,1998,2,...,1,71.1806,41.7892,388,0.0747,7,3,2.0996,57.7700,40.0841
1822574395,1454,2019/11/15 11:48:23,1235秒,夷歆雯,https://weibo.com/u/1822574395,u/1822574395,2,21,1998,3,...,113,24.8451,32.0379,1970,2.2706,146,5,19.0692,29.3068,35.4219
6249948493,1461,2019/11/16 14:54:01,1002秒,·松栗奶油,https://weibo.com/u/6249948493,u/6249948493,2,20,1999,4,...,0,25.0861,26.1547,119,0.4286,0,0,0.0000,0.0000,0.0000
6069038567,1465,2019/11/17 14:58:06,1446秒,玺欢你andme,https://weibo.com/u/6069038567,u/6069038567,2,22,1997,4,...,11,13.8697,28.0563,1158,0.1287,11,6,3.1558,40.4106,34.9356


In [78]:
X = all_data[cols_feas.to_list()]
y = all_data[[c for c in survey_data.columns.to_list() if c.startswith("y_")]]

In [94]:
# from sklearn import tree
from sklearn.neural_network import MLPRegressor
# from sklearn.metrics import f1_score
from sklearn.metrics import r2_score


def train():
    # print(in_name)
    for col_name, y_i in y.iteritems():
        # print(col_name)
        y_i = np.array(y_i)
        reg = MLPRegressor()
        reg = reg.fit(X, y_i)

        y_pred = reg.predict(X)
        print(r2_score(y_i, y_pred))
        
train()

-1.9654408964607155
-1.9654408964607155
-2.9537590468036345
-2.9537590468036345
-6.888614857031046
-6.888614857031046
-0.7498276986438064
-0.7498276986438064
