In [2]:
import pandas as pd
from pathlib import Path
import json
import pendulum
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter

In [39]:
# 抽取用户特征

def day_night(time_list):
    '''
    发微博的时间序列转化为4个时间段的统计
    :param in_name:
    :param out_name:
    :return: 
    '''
    cnt = [0] * 24
    for t in time_list:
        hour = t.hour
        # print(hour)
        # 分为四个时间段
        # cnt[int(int(hour) / 6)] += 1
        # 分为24小时
        cnt[int(hour)] += 1
    return cnt


def weeks(time_list):
    '''
    发微博的时间序列转化为7个时间段的统计
    :param in_name:
    :param out_name:
    :return:
    '''
    cnt = [0] * 7
    for t in time_list:
        weekday = t.weekday()
        cnt[int(weekday)] += 1
    return cnt


def time_interval(time_list):
    interval = []
    for i in np.arange(1, len(time_list)):
        t2 = time_list[i-1]
        t1 = time_list[i]
        _inter = abs((t2 - t1).total_seconds())
        if _inter < 86400 * 7:
            interval.append(_inter / 3600)
    # return interval
    interval = np.array(interval)
    # print(interval)
    return interval


def life_length(time_list):
    return int((max(time_list) - min(time_list)).total_seconds() / 3600 / 24) + 1

In [49]:
from vocab import vocab

word_list = vocab.community_keywords \
            + vocab.financial_keywords \
            + vocab.popularity_keywords \
            + vocab.image_keywords \
            + vocab.health_keywords \
            + vocab.affiliation_keywords \
            + vocab.selfacceptance_keywords
word_list = list(set(word_list))
word_list.sort(key=lambda x: len(x), reverse=True)
# print(word_list)
len(word_list)
# with open("vocab/word_list.txt", "w") as f:
#     for w in word_list:
#         f.write(w + "\n")

3749

In [3]:
import thulac

thu = thulac.thulac(user_dict="vocab/word_list.txt", seg_only=True)  #默认模式
# segs = " ".join([w[0] for w in thu.cut("我爱北京天安门")])  #进行一句话分词
# print(segs)
def make_segs(text):
    return " ".join([w[0].strip() for w in thu.cut(text)])

Model loaded succeed


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

transformer = TfidfVectorizer(vocabulary=word_list)
    
def text_features(corpus):
    tfidf = transformer.fit_transform(corpus)
    return tfidf.toarray()

text_features(["我 今天 赚了 很多 钱",
               "我的 股票 又 升值 了！"])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
# 需要分词

in_dir = "data/Mengli/weibo_data_analysis-20210210/weibo_data_json"
vectorizer = TfidfVectorizer(vocabulary=word_list)
users_corpus = []

for i, in_name in enumerate(tqdm(Path(in_dir).rglob("*.json"))):
    # print(in_name)
    d = json.load(open(in_name))
    # print(d["weibo"])
    u_corpus = " ".join([make_segs(w['content']) for w in d["weibo"]])
    # print(u_corpus)
    users_corpus.append(u_corpus)

print(len(users_corpus))
X = text_features(users_corpus)
print(X.shape)

0it [00:00, ?it/s]

459
(459, 3749)


In [143]:
import re

word_list = ["抗疫", "疫情", "肺炎"]
keywords_pattern = "|".join(word_list)
keywords_pattern

# re.findall(keywords_pattern, "我们正在努力抗击疫情")
# re.findall(keywords_pattern, "首经贸欢迎您")
# print(re.search(keywords_pattern, "首经贸欢迎您") is None)
# print(re.search(keywords_pattern, "我们正在努力抗击疫情") is None)

def bingo_keywords(doc):
    return " ".join(re.findall(keywords_pattern, doc))

# def bingo_keywords(doc):
#     return 1 if len(re.findall(keywords_pattern, doc)) > 0 else 0

# bingo_keywords("我们正在努力抗击疫情")

1

In [52]:
# 不用分词，直接统计关键词表里面的词出现的数量
# text_features(["爱心", "财富", "4 5 6"])

in_dir = "data/Mengli/weibo_data_analysis-20210210/weibo_data_json"

vectorizer = TfidfVectorizer(vocabulary=word_list)

users_corpus = []

for i, in_name in enumerate(tqdm(Path(in_dir).rglob("*.json"))):
    # print(in_name)
    d = json.load(open(in_name))
    # print(d["weibo"])
    all_weibo = " ".join([w['content'] for w in d["weibo"]])
    u_corpus = bingo_keywords(all_weibo)
    # print(u_corpus)
    users_corpus.append(u_corpus)
    # break

print(len(users_corpus))
X = text_features(users_corpus)
print(X.shape)

0it [00:00, ?it/s]

459
(459, 3749)


In [76]:
text_df = pd.DataFrame(X, columns=["tfidf"+str(i+1) for i in range(X.shape[1])])
text_df = text_df.loc[:, (text_df != 0).any(axis=0)]
text_df.index = [in_name.name[:-5] for in_name in Path(in_dir).rglob("*.json")]
text_df.index.name = "uid"
text_df.to_csv("data/Mengli/tfidf_features.csv", float_format="%.6f")

In [147]:
'''
Basic features. Basic features are selected to reﬂect the user’s demographics, preliminary statuses and elementary interactions on social media, including gender, tweeting patterns and privacy settings. 

Speciﬁcally, tweeting patterns contain log(AUW + 1) (where AUW is the age of a user on Weibo in units of days), log(NT + 1) (where NT is the total number of tweets the user posted), log(NT/(AUW + 1)) (the frequency of posting), log(NFER + 1) (where NFER is deﬁned as the number of the user’s followers), log(NFEE + 1) (where NFEE denotes the number of the user’s followees), NT/(NFER + 1), and NT/(NFEE + 1). 

With respect to the privacy settings, corresponding binary features indicate whether a user allows comments from others, whether the user allows private messages sent from others and whether the user allows Weibo to track their real-time location. In addition, we consider the length of self-description as a feature.
'''
in_dir = "~/Papers/「论文」/合作/于孟利/weibo_data_analysis-20210210/weibo_data_json"


users_features = []

for in_name in Path(in_dir).rglob("*.json"):
    # print(in_name)
    d = json.load(open(in_name))
    user = d["user"]
    u = {
        "uid": user["id"],
        "b:gender": 1 if user["gender"]=="男" else 0,
        "b:NT": np.log(user["weibo_num"] + 1),
        "b:NFEE": np.log(user["following"] + 1),
        "b:NEER": np.log(user["followers"] + 1),
        "b:NT/NFEE": (user["weibo_num"] + 1) / (user["following"] + 1),
        "b:NT/NFER": (user["weibo_num"] + 1) / (user["followers"] + 1),
        "b:NEEE/NFER": (user["following"] + 1) / (user["followers"] + 1),
    }

    weibos = d["weibo"]
    # 所有微博
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos]
    d_features = np.array(day_night(time_list))
    for i, _d in enumerate(d_features):
        u[f"t:h_{i}"] = _d
    u["t:h_max"] = d_features.max()
    u["t:h_argmax"] = d_features.argmax()
    u["t:h_std"] = d_features.std()
    
    w_features = np.array(weeks(time_list))
    for i, _d in enumerate(w_features):
        u[f"t:w_{i}"] = _d
    u["t:h_max"] = w_features.max()
    u["t:h_argmax"] = w_features.argmax()
    u["t:h_std"] = w_features.std()

    interval = time_interval(time_list)
    if len(interval) < 1:
        u["t:interval_mean"] = 0
        u["t:interval_std"] = 0
    else:
        u["t:interval_mean"] = interval.mean()
        u["t:interval_std"] = interval.std()

    u["t:life_length"] = life_length(time_list)
    u["t:ave_d_num"] = user["weibo_num"] / u["t:life_length"]

    # 转发微博
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos if not w["original"]]
    d_features = np.array(day_night(time_list))
    for i, _d in enumerate(d_features):
        u[f"t:h_{i}"] = _d
    u["t:ret_h_max"] = d_features.max()
    u["t:ret_h_argmax"] = d_features.argmax()
    u["t:ret_h_std"] = d_features.std()
    
    w_features = np.array(weeks(time_list))
    for i, _d in enumerate(w_features):
        u[f"t:w_{i}"] = _d
    u["t:ret_h_max"] = w_features.max()
    u["t:ret_h_argmax"] = w_features.argmax()
    u["t:ret_h_std"] = w_features.std()

    interval = time_interval(time_list)
    if len(interval) < 1:
        u["t:ret_interval_mean"] = 0
        u["t:ret_interval_std"] = 0
    else:
        u["t:ret_interval_mean"] = interval.mean()
        u["t:ret_interval_std"] = interval.std()

    # 文本特征
    # 另外的文件

    users_features.append(u)
    # print(user)


# len(users_features)

df = pd.DataFrame(users_features).set_index("uid")
df.to_csv("data/MengLi_users_features.csv", float_format="%.4f")
df

Unnamed: 0_level_0,b:gender,b:NT,b:NFEE,b:NEER,b:NT/NFEE,b:NT/NFER,b:NEEE/NFER,t:h_0,t:h_1,t:h_2,...,t:w_6,t:interval_mean,t:interval_std,t:life_length,t:ave_d_num,t:ret_h_max,t:ret_h_argmax,t:ret_h_std,t:ret_interval_mean,t:ret_interval_std
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2982254865,0,6.063785,5.093750,5.407172,2.638037,1.928251,0.730942,9,1,0,...,37,42.171218,45.922626,2724,0.157489,37,6,7.284314,37.741751,49.003489
3943065337,0,9.169727,6.356108,7.021084,16.670139,8.573214,0.514286,1,1,0,...,0,1.733333,2.063783,41,234.170732,11,1,3.870348,1.980952,2.287326
3968712237,0,5.743003,5.860786,5.176150,0.888889,1.762712,1.983051,1,0,0,...,8,22.461227,29.749968,683,0.455344,25,0,5.229430,44.960063,42.473738
5013416876,1,6.495266,5.852202,5.402677,1.902299,2.981982,1.567568,14,5,1,...,19,38.791653,35.454522,2334,0.283205,36,1,6.577637,64.989949,47.067363
5230736039,0,4.976734,5.347108,4.820282,0.690476,1.169355,1.693548,3,0,0,...,5,30.069771,36.605034,1078,0.133581,19,2,6.220440,27.297083,47.639542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1278570913,0,5.897154,5.293305,4.094345,1.829146,6.066667,3.316667,6,8,2,...,9,17.561508,30.821601,2817,0.128860,58,4,16.061108,16.154988,29.744955
5236316239,0,7.216709,5.517453,4.189655,5.469880,20.636364,3.772727,0,1,0,...,65,14.999903,34.455127,1738,0.783084,118,2,22.106422,14.536835,34.818574
2678428523,0,6.075346,4.304065,5.676754,5.878378,1.489726,0.253425,2,4,1,...,83,38.036657,45.881736,2635,0.164706,83,6,13.468027,38.672378,47.057762
3078158207,0,5.669881,5.552960,5.468060,1.124031,1.223629,1.088608,9,2,0,...,20,66.797996,51.967102,2803,0.103104,20,5,5.394631,54.015079,51.716291


In [8]:
survey_data = pd.read_excel("data/Mengli/456ID_data_0611.xlsx")
survey_data["userID_num"].astype("str")
survey_data = survey_data.set_index("userID_num")
survey_data.index.name = "uid"
survey_data

Unnamed: 0_level_0,num,提交答卷时间,所用时间,nickname,userURL,userID,sex,age,Born,专业,...,DV_Community,DV_Intrinsic_goal,DV_Extrinsic_goal,DV1_社交排斥,DV2_社交排斥,DV_恶意幽默,DV_内疚诱导,DV_depression,DV_anxiety,DV_stress
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5822697591,10,2019/10/20 16:34:17,856秒,满天小xingx,https://weibo.com/u/5822697591,u/5822697591,2,23,1996,4,...,4.50,6.375000,5.083333,2.571429,3.000000,1.625,2.833333,1.428571,3.333333,2.857143
5866763968,18,2019/10/20 16:42:25,883秒,不要放蘑菇,https://weibo.com/u/5866763968,u/5866763968,1,20,1999,4,...,6.00,6.354167,5.472222,2.000000,2.000000,2.000,2.000000,2.285714,2.333333,2.285714
5311953749,28,2019/10/20 16:49:03,1440秒,彭老师不爱留作业,https://weibo.com/u/5311953749,u/5311953749,1,24,1995,4,...,5.75,6.166667,4.944444,1.000000,1.000000,1.000,1.000000,1.142857,1.000000,1.000000
2366180120,31,2019/10/20 16:51:01,1024秒,cryptobiote,https://weibo.com/u/2366180120,u/2366180120,1,25,1994,1,...,4.75,6.208333,5.194444,1.000000,1.000000,1.000,1.000000,4.571429,3.500000,4.714286
2769435005,32,2019/10/20 16:51:13,1090秒,不是很紧,https://weibo.com/u/2769435005,u/2769435005,1,22,1997,1,...,5.00,6.291667,5.472222,2.142857,3.000000,1.750,3.666667,3.571429,2.833333,3.714286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2139304270,1450,2019/11/15 11:32:33,929秒,_怪兽回收处处长,https://weibo.com/u/2139304270,u/2139304270,1,21,1998,2,...,4.75,5.770833,4.138889,2.285714,4.000000,1.875,2.166667,3.571429,2.833333,3.428571
1822574395,1454,2019/11/15 11:48:23,1235秒,夷歆雯,https://weibo.com/u/1822574395,u/1822574395,2,21,1998,3,...,5.25,6.312500,6.277778,2.571429,2.666667,1.500,1.500000,2.000000,1.166667,1.285714
6249948493,1461,2019/11/16 14:54:01,1002秒,·松栗奶油,https://weibo.com/u/6249948493,u/6249948493,2,20,1999,4,...,4.75,5.583333,4.666667,1.000000,1.333333,1.875,1.000000,1.000000,1.000000,1.142857
6069038567,1465,2019/11/17 14:58:06,1446秒,玺欢你andme,https://weibo.com/u/6069038567,u/6069038567,2,22,1997,4,...,5.25,6.000000,4.111111,3.285714,2.000000,1.375,1.833333,1.428571,2.333333,3.000000


In [9]:
survey_data[[c for c in survey_data.columns.to_list() if c.startswith("DV_")]]

Unnamed: 0_level_0,DV_economics,DV_fame,DV_image,DV_relationship,DV_selfacceptance,DV_Health,DV_Community,DV_Intrinsic_goal,DV_Extrinsic_goal,DV_恶意幽默,DV_内疚诱导,DV_depression,DV_anxiety,DV_stress
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5822697591,6.75,4.000000,4.50,7.000000,7.000000,7.00,4.50,6.375000,5.083333,1.625,2.833333,1.428571,3.333333,2.857143
5866763968,7.00,4.666667,4.75,6.000000,6.666667,6.75,6.00,6.354167,5.472222,2.000,2.000000,2.285714,2.333333,2.285714
5311953749,5.50,4.333333,5.00,5.666667,7.000000,6.25,5.75,6.166667,4.944444,1.000,1.000000,1.142857,1.000000,1.000000
2366180120,6.50,5.333333,3.75,7.000000,6.333333,6.75,4.75,6.208333,5.194444,1.000,1.000000,4.571429,3.500000,4.714286
2769435005,6.25,5.666667,4.50,6.666667,7.000000,6.50,5.00,6.291667,5.472222,1.750,3.666667,3.571429,2.833333,3.714286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2139304270,5.75,3.666667,3.00,6.333333,6.000000,6.00,4.75,5.770833,4.138889,1.875,2.166667,3.571429,2.833333,3.428571
1822574395,6.00,6.333333,6.50,6.666667,6.333333,7.00,5.25,6.312500,6.277778,1.500,1.500000,2.000000,1.166667,1.285714
6249948493,5.00,4.000000,5.00,5.333333,6.000000,6.25,4.75,5.583333,4.666667,1.875,1.000000,1.000000,1.000000,1.142857
6069038567,5.50,3.333333,3.50,6.000000,6.000000,6.75,5.25,6.000000,4.111111,1.375,1.833333,1.428571,2.333333,3.000000


In [10]:
from sklearn.cluster import KMeans

for index, c in survey_data.iteritems():
    if not index.startswith("DV"):
        continue
    c = [[_c] for _c in c]
    y_pred = KMeans(n_clusters=2, random_state=42).fit_predict(c)
    print(Counter(y_pred))
    survey_data["c_2_" + index] = y_pred
    y_pred = KMeans(n_clusters=3, random_state=42).fit_predict(c)
    print(Counter(y_pred))
    survey_data["c_3_" + index] = y_pred

Counter({0: 269, 1: 187})
Counter({0: 228, 1: 157, 2: 71})
Counter({0: 256, 1: 200})
Counter({0: 210, 1: 194, 2: 52})
Counter({1: 250, 0: 206})
Counter({0: 249, 2: 111, 1: 96})
Counter({0: 309, 1: 147})
Counter({0: 221, 1: 211, 2: 24})
Counter({1: 307, 0: 149})
Counter({1: 204, 0: 150, 2: 102})
Counter({0: 315, 1: 141})
Counter({0: 227, 1: 192, 2: 37})
Counter({1: 262, 0: 194})
Counter({2: 219, 1: 133, 0: 104})
Counter({0: 304, 1: 152})
Counter({0: 204, 2: 191, 1: 61})
Counter({1: 231, 0: 225})
Counter({1: 224, 0: 148, 2: 84})
Counter({0: 319, 1: 137})
Counter({0: 254, 1: 166, 2: 36})
Counter({0: 237, 1: 219})
Counter({0: 185, 1: 148, 2: 123})
Counter({0: 357, 1: 99})
Counter({0: 262, 2: 150, 1: 44})
Counter({0: 293, 1: 163})
Counter({1: 238, 0: 159, 2: 59})
Counter({1: 248, 0: 208})
Counter({1: 195, 2: 160, 0: 101})
Counter({1: 293, 0: 163})
Counter({0: 204, 2: 161, 1: 91})
Counter({0: 271, 1: 185})
Counter({2: 168, 1: 162, 0: 126})


In [11]:
w_feas = pd.read_csv("data/MengLi/MengLi_users_features.csv")
w_feas["uid"].astype("str")
w_feas = w_feas.set_index("uid")
cols_feas1 = w_feas.columns
# w_feas

In [12]:
text_df = pd.read_csv("data/MengLi/tfidf_features.csv", index_col="uid")
cols_feas2 = text_df.columns
cols_feas2

Index(['tfidf2', 'tfidf5', 'tfidf6', 'tfidf7', 'tfidf8', 'tfidf9', 'tfidf10',
       'tfidf13', 'tfidf14', 'tfidf15',
       ...
       'tfidf3737', 'tfidf3738', 'tfidf3739', 'tfidf3740', 'tfidf3741',
       'tfidf3743', 'tfidf3744', 'tfidf3745', 'tfidf3746', 'tfidf3747'],
      dtype='object', length=2523)

In [13]:
cols_feas = list(cols_feas1) + list(cols_feas2) + ["e_愤怒", "e_厌恶", "e_高兴", "e_低落", "e_恐惧"]
# cols_feas

In [14]:
all_data = survey_data.join(w_feas).join(text_df).copy()

In [39]:
all_data

Unnamed: 0_level_0,num,提交答卷时间,所用时间,nickname,userURL,userID,sex,age,Born,专业,...,tfidf3737,tfidf3738,tfidf3739,tfidf3740,tfidf3741,tfidf3743,tfidf3744,tfidf3745,tfidf3746,tfidf3747
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5822697591,10,2019/10/20 16:34:17,856秒,满天小xingx,https://weibo.com/u/5822697591,u/5822697591,2,23,1996,4,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5866763968,18,2019/10/20 16:42:25,883秒,不要放蘑菇,https://weibo.com/u/5866763968,u/5866763968,1,20,1999,4,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5311953749,28,2019/10/20 16:49:03,1440秒,彭老师不爱留作业,https://weibo.com/u/5311953749,u/5311953749,1,24,1995,4,...,0.0,0.0,0.0,0.049926,0.0,0.0,0.0,0.0,0.0,0.0
2366180120,31,2019/10/20 16:51:01,1024秒,cryptobiote,https://weibo.com/u/2366180120,u/2366180120,1,25,1994,1,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2769435005,32,2019/10/20 16:51:13,1090秒,不是很紧,https://weibo.com/u/2769435005,u/2769435005,1,22,1997,1,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2139304270,1450,2019/11/15 11:32:33,929秒,_怪兽回收处处长,https://weibo.com/u/2139304270,u/2139304270,1,21,1998,2,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1822574395,1454,2019/11/15 11:48:23,1235秒,夷歆雯,https://weibo.com/u/1822574395,u/1822574395,2,21,1998,3,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6249948493,1461,2019/11/16 14:54:01,1002秒,·松栗奶油,https://weibo.com/u/6249948493,u/6249948493,2,20,1999,4,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6069038567,1465,2019/11/17 14:58:06,1446秒,玺欢你andme,https://weibo.com/u/6069038567,u/6069038567,2,22,1997,4,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
X = all_data[cols_feas]
print("y cols:", [c for c in all_data.columns.to_list() if c.startswith("c_")])
y = all_data[[c for c in all_data.columns.to_list() if c.startswith("c_")]]

# X, y

y cols: ['c_2_DV_economics', 'c_3_DV_economics', 'c_2_DV_fame', 'c_3_DV_fame', 'c_2_DV_image', 'c_3_DV_image', 'c_2_DV_relationship', 'c_3_DV_relationship', 'c_2_DV_selfacceptance', 'c_3_DV_selfacceptance', 'c_2_DV_Health', 'c_3_DV_Health', 'c_2_DV_Community', 'c_3_DV_Community', 'c_2_DV_Intrinsic_goal', 'c_3_DV_Intrinsic_goal', 'c_2_DV_Extrinsic_goal', 'c_3_DV_Extrinsic_goal', 'c_2_DV1_社交排斥', 'c_3_DV1_社交排斥', 'c_2_DV2_社交排斥', 'c_3_DV2_社交排斥', 'c_2_DV_恶意幽默', 'c_3_DV_恶意幽默', 'c_2_DV_内疚诱导', 'c_3_DV_内疚诱导', 'c_2_DV_depression', 'c_3_DV_depression', 'c_2_DV_anxiety', 'c_3_DV_anxiety', 'c_2_DV_stress', 'c_3_DV_stress']


In [32]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [75]:
# from sklearn import tree
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
# from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
import numpy as np
# from sklearn.datasets import load_svmlight_file
# from sklearn.externals import joblib
from sklearn.metrics import f1_score
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

from collections import Counter

# 内外部目标
def train_goals():

    models = {
        "LR": linear_model.LogisticRegression(C=0.5, solver="newton-cg"),
        # "K-Neighbors": KNeighborsClassifier(),
        # "Naive Bayes": GaussianNB(),
        # "Random Forest": RandomForestClassifier(),
        # "SVM (C=0.5)": svm.SVC(C=0.5),
        # "SVM (C=1)": svm.SVC(C=1),
        # "SVM (C=2)": svm.SVC(C=2),
        # "SVM (C=4)": svm.SVC(C=4),
        # "SVM (C=8)": svm.SVC(C=8),
    }

    for model_name, clf in models.items():
        rsts = {}

        print(f'Model > {model_name}')
        for col_name, y_i in y.iteritems():
            if not col_name.endswith("goal") or "_2" in col_name:
                continue

            y_i = np.array(y_i)
            X_train, X_test, y_train, y_test = train_test_split(X, y_i, test_size=0.7, random_state=12)

            print(col_name)
            clf.fit(X_train, y_train)
            train_perf = clf.score(X_train, y_train)
            # print(f'训练数据表现 = {clf.score(X_train, y_train) * 100:.2f}%')
            cv_perf = cross_val_score(clf, X_train, y_train, cv=5).mean()
            # print(f'5次5折交叉检验 = {cvs * 100:.2f}%')

            y_hat = clf.predict(X_test)
            print('预测结果 =', Counter(y_hat))
            print('实际结果 =', Counter(y_test))

            f1 = f1_score(y_test, y_hat, average='macro')
            # print(f'F1 score = {f1 * 100:.2f}%')
            rsts[col_name] = {
                "train dataset": train_perf,
                "cross validation": cv_perf,
                "test dataset (f1)": f1,
            }
            print("- " * 20, "\n")
        print("- * " * 20)
    
        rsts = pd.DataFrame(rsts) * 100
        rsts = rsts.T
        display(rsts)
        rsts.to_csv(f"data/rsts-ex-in-goals-model={model_name}.csv", float_format="%.4f")

In [76]:
train_goals()

Model > LR
c_3_DV_Intrinsic_goal
预测结果 = Counter({0: 161, 2: 96, 1: 63})
实际结果 = Counter({0: 139, 2: 138, 1: 43})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({1: 145, 0: 112, 2: 63})
实际结果 = Counter({1: 160, 0: 100, 2: 60})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_3_DV_Intrinsic_goal,86.029412,36.005291,37.009407
c_3_DV_Extrinsic_goal,90.441176,38.148148,29.80433


In [64]:
from sklearn.model_selection import GridSearchCV

# param_grid = [
#     {
#         'weights':['uniform'],
#         'n_neighbors':[i for i in range(1, 11)]
#     },
#     {
#         'weights':['distance'],
#         'n_neighbors':[i for i in range(1, 11)],
#         'p':[i for i in range(1, 6)]
#     }
# ]

param_grid = [
    {
        'solver': ["newton-cg"],
        'C': [2**i for i in range(-5, 5)]
    }
]

# "c_3_DV_Intrinsic_goal": Counter({2: 141, 0: 135, 1: 44})
# "c_3_DV_Extrinsic_goal": Counter({1: 153, 0: 103, 2: 64})

# clf = svm.LinearSVC()
clf = linear_model.LogisticRegression()
grid_search = GridSearchCV(clf, param_grid, n_jobs = -1, verbose = 2)

_y = np.array(y["c_3_DV_Intrinsic_goal"])
print(_y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, _y, test_size=0.7, random_state=43)

grid_search.fit(X, _y)

(456,)
Fitting 5 folds for each of 10 candidates, totalling 50 fits


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid=[{'C': [0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8,
                                16],
                          'solver': ['newton-cg']}],
             verbose=2)

In [65]:
#返回一个 效果最好的分类器
clf = grid_search.best_params_
#正确率最高的成绩
grid_search.best_score_
#最好的参数
grid_search.best_params_

{'C': 4, 'solver': 'newton-cg'}

In [48]:
train_goals()

Model > LR
c_2_DV_Intrinsic_goal
预测结果 = Counter({0: 206, 1: 114})
实际结果 = Counter({0: 211, 1: 109})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Intrinsic_goal
预测结果 = Counter({0: 161, 2: 103, 1: 56})
实际结果 = Counter({2: 141, 0: 135, 1: 44})
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_Extrinsic_goal
预测结果 = Counter({1: 198, 0: 122})
实际结果 = Counter({0: 162, 1: 158})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({1: 149, 0: 119, 2: 52})
实际结果 = Counter({1: 153, 0: 103, 2: 64})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_2_DV_Intrinsic_goal,88.970588,55.820106,51.479175
c_3_DV_Intrinsic_goal,86.764706,44.074074,32.331266
c_2_DV_Extrinsic_goal,91.176471,58.068783,49.359076
c_3_DV_Extrinsic_goal,93.382353,34.497354,40.199985


Model > K-Neighbors
c_2_DV_Intrinsic_goal
预测结果 = Counter({0: 276, 1: 44})
实际结果 = Counter({0: 211, 1: 109})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Intrinsic_goal
预测结果 = Counter({0: 188, 2: 111, 1: 21})
实际结果 = Counter({2: 141, 0: 135, 1: 44})
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_Extrinsic_goal
预测结果 = Counter({1: 213, 0: 107})
实际结果 = Counter({0: 162, 1: 158})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({1: 185, 0: 133, 2: 2})
实际结果 = Counter({1: 153, 0: 103, 2: 64})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_2_DV_Intrinsic_goal,70.588235,61.719577,47.175585
c_3_DV_Intrinsic_goal,63.970588,41.164021,37.660718
c_2_DV_Extrinsic_goal,65.441176,45.555556,51.582681
c_3_DV_Extrinsic_goal,61.029412,49.285714,29.221409


Model > Naive Bayes
c_2_DV_Intrinsic_goal
预测结果 = Counter({1: 211, 0: 109})
实际结果 = Counter({0: 211, 1: 109})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Intrinsic_goal
预测结果 = Counter({1: 168, 2: 78, 0: 74})
实际结果 = Counter({2: 141, 0: 135, 1: 44})
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_Extrinsic_goal
预测结果 = Counter({0: 213, 1: 107})
实际结果 = Counter({0: 162, 1: 158})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({2: 166, 1: 88, 0: 66})
实际结果 = Counter({1: 153, 0: 103, 2: 64})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_2_DV_Intrinsic_goal,86.029412,47.804233,48.125
c_3_DV_Intrinsic_goal,83.823529,27.275132,31.516935
c_2_DV_Extrinsic_goal,93.382353,49.94709,47.51195
c_3_DV_Extrinsic_goal,84.558824,21.322751,34.272286


Model > Random Forest
c_2_DV_Intrinsic_goal
预测结果 = Counter({0: 317, 1: 3})
实际结果 = Counter({0: 211, 1: 109})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Intrinsic_goal
预测结果 = Counter({0: 245, 2: 67, 1: 8})
实际结果 = Counter({2: 141, 0: 135, 1: 44})
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_Extrinsic_goal
预测结果 = Counter({1: 184, 0: 136})
实际结果 = Counter({0: 162, 1: 158})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({1: 292, 0: 28})
实际结果 = Counter({1: 153, 0: 103, 2: 64})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_2_DV_Intrinsic_goal,100.0,66.904762,40.47619
c_3_DV_Intrinsic_goal,100.0,52.989418,28.967611
c_2_DV_Extrinsic_goal,100.0,57.433862,49.76255
c_3_DV_Extrinsic_goal,100.0,45.582011,26.212654


Model > SVM (C=0.5)
c_2_DV_Intrinsic_goal
预测结果 = Counter({0: 320})
实际结果 = Counter({0: 211, 1: 109})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Intrinsic_goal
预测结果 = Counter({0: 320})
实际结果 = Counter({2: 141, 0: 135, 1: 44})
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_Extrinsic_goal
预测结果 = Counter({1: 178, 0: 142})
实际结果 = Counter({0: 162, 1: 158})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({1: 320})
实际结果 = Counter({1: 153, 0: 103, 2: 64})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_2_DV_Intrinsic_goal,68.382353,68.386243,39.736347
c_3_DV_Intrinsic_goal,50.735294,50.740741,19.78022
c_2_DV_Extrinsic_goal,59.558824,58.015873,53.007519
c_3_DV_Extrinsic_goal,52.205882,52.195767,21.564482


Model > SVM (C=1)
c_2_DV_Intrinsic_goal
预测结果 = Counter({0: 320})
实际结果 = Counter({0: 211, 1: 109})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Intrinsic_goal
预测结果 = Counter({0: 320})
实际结果 = Counter({2: 141, 0: 135, 1: 44})
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_Extrinsic_goal
预测结果 = Counter({1: 185, 0: 135})
实际结果 = Counter({0: 162, 1: 158})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({1: 320})
实际结果 = Counter({1: 153, 0: 103, 2: 64})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_2_DV_Intrinsic_goal,68.382353,68.386243,39.736347
c_3_DV_Intrinsic_goal,50.735294,50.0,19.78022
c_2_DV_Extrinsic_goal,59.558824,58.042328,53.823954
c_3_DV_Extrinsic_goal,52.205882,52.195767,21.564482


Model > SVM (C=2)
c_2_DV_Intrinsic_goal
预测结果 = Counter({0: 320})
实际结果 = Counter({0: 211, 1: 109})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Intrinsic_goal
预测结果 = Counter({0: 318, 2: 2})
实际结果 = Counter({2: 141, 0: 135, 1: 44})
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_Extrinsic_goal
预测结果 = Counter({1: 191, 0: 129})
实际结果 = Counter({0: 162, 1: 158})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({1: 320})
实际结果 = Counter({1: 153, 0: 103, 2: 64})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_2_DV_Intrinsic_goal,68.382353,68.386243,39.736347
c_3_DV_Intrinsic_goal,50.735294,49.259259,19.720383
c_2_DV_Extrinsic_goal,59.558824,58.042328,54.942447
c_3_DV_Extrinsic_goal,52.205882,52.195767,21.564482


Model > SVM (C=4)
c_2_DV_Intrinsic_goal
预测结果 = Counter({0: 320})
实际结果 = Counter({0: 211, 1: 109})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Intrinsic_goal
预测结果 = Counter({0: 307, 2: 13})
实际结果 = Counter({2: 141, 0: 135, 1: 44})
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_Extrinsic_goal
预测结果 = Counter({1: 195, 0: 125})
实际结果 = Counter({0: 162, 1: 158})
- - - - - - - - - - - - - - - - - - - -  

c_3_DV_Extrinsic_goal
预测结果 = Counter({1: 320})
实际结果 = Counter({1: 153, 0: 103, 2: 64})
- - - - - - - - - - - - - - - - - - - -  

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * 


Unnamed: 0,train dataset,cross validation,test dataset (f1)
c_2_DV_Intrinsic_goal,68.382353,68.386243,39.736347
c_3_DV_Intrinsic_goal,52.205882,48.518519,23.372706
c_2_DV_Extrinsic_goal,59.558824,57.328042,54.832151
c_3_DV_Extrinsic_goal,52.205882,48.492063,21.564482


针对以下变量进行分析

DV_economics
DV_fame
DV_image
DV_relationship
DV_selfacceptance
DV_Health
DV_Community
DV_Intrinsic_goal
DV_Extrinsic_goal
DV1_社交排斥
DV2_社交排斥
DV_恶意幽默
DV_内疚诱导
DV_depression
DV_anxiety
DV_stress

In [None]:
def train():
    # print(in_name)
    C = 0.5

    for col_name, y_i in y.iteritems():
        y_i = np.array(y_i)
        X_train, X_test, y_train, y_test = train_test_split(X, y_i, test_size=0.7, random_state=23)

#         gamma = 0.5
#         clf = SVC(C=C, probability=True)
        if "_2" in col_name:
            print(col_name)
            clf = linear_model.LogisticRegression(C=C, solver="newton-cg")

    #         clf = RandomForestClassifier()
            clf.fit(X_train, y_train)
            print('训练数据上的表现 =', clf.score(X_train, y_train))
            cvs = cross_val_score(clf, X_train, y_train, cv=5).mean()
            print('5次5折交叉检验 =', cvs)
            y_hat = clf.predict(X_test)
            print('预测结果 =', Counter(y_hat))
            print('实际结果 =', Counter(y_test))
            f1 = f1_score(y_test, y_hat, average='macro')
            print("✔" if f1 > 0.52 else "✘", 'F1 score =', f1)
            print("- " * 20, "\n")

    for col_name, y_i in y.iteritems():
        y_i = np.array(y_i)
        X_train, X_test, y_train, y_test = train_test_split(X, y_i, test_size=0.7, random_state=23)
#         gamma = 0.5
#         clf = SVC(C=C, probability=True)
        if "_3" in col_name:
            print(col_name)
            clf = linear_model.LogisticRegression(C=C, solver="newton-cg")

    #         clf = RandomForestClassifier()
            clf.fit(X_train, y_train)
            print('训练数据上的表现 =', clf.score(X_train, y_train))
            cvs = cross_val_score(clf, X_train, y_train, cv=5).mean()
            print('5次5折交叉检验 =', cvs)
            y_hat = clf.predict(X_test)
            print('预测结果 =', Counter(y_hat))
            print('实际结果 =', Counter(y_test))
            f1 = f1_score(y_test, y_hat, average='macro')
            print("✔" if f1 > 0.35 else "✘", 'F1 score =', f1)
            print("- " * 20, "\n")

In [44]:
train()

c_2_DV_economics
训练数据上的表现 = 0.8455882352941176
5次5折交叉检验 = 0.5515873015873016
预测结果 = Counter({0: 182, 1: 138})
实际结果 = Counter({0: 185, 1: 135})
✔ F1 score = 0.5560479484185207
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_fame
训练数据上的表现 = 0.8823529411764706
5次5折交叉检验 = 0.5447089947089947
预测结果 = Counter({0: 165, 1: 155})
实际结果 = Counter({0: 180, 1: 140})
✘ F1 score = 0.48749692950135104
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_image
训练数据上的表现 = 0.9117647058823529
5次5折交叉检验 = 0.6187830687830688
预测结果 = Counter({1: 177, 0: 143})
实际结果 = Counter({1: 170, 0: 150})
✔ F1 score = 0.5310363820558468
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_relationship
训练数据上的表现 = 0.8676470588235294
5次5折交叉检验 = 0.5727513227513227
预测结果 = Counter({0: 210, 1: 110})
实际结果 = Counter({0: 217, 1: 103})
✘ F1 score = 0.5039086980901805
- - - - - - - - - - - - - - - - - - - -  

c_2_DV_selfacceptance
训练数据上的表现 = 0.8161764705882353
5次5折交叉检验 = 0.6317460317460317
预测结果 = Counter({1: 221, 0: 99})
实际结果 = Counter({1: 