In [None]:
import pandas as pd
from pathlib import Path
import json
import pendulum
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter

In [None]:
def day_night(time_list):
    '''
    Turn a sequence of posting timestamps into a 24-bin hour-of-day histogram.

    :param time_list: iterable of objects exposing an ``hour`` attribute
                      (expected to be in the range 0-23)
    :return: list of 24 ints; element ``h`` is how many items have ``hour == h``
    '''
    hourly = [0 for _ in range(24)]
    for stamp in time_list:
        # One bin per hour of the day.
        slot = int(stamp.hour)
        hourly[slot] += 1
    return hourly


def weeks(time_list):
    '''
    Turn a sequence of posting timestamps into a 7-bin day-of-week histogram.

    :param time_list: iterable of objects exposing a ``weekday()`` method
                      (for ``datetime.datetime`` this is 0 = Monday ... 6 = Sunday)
    :return: list of 7 ints; element ``d`` is how many items have ``weekday() == d``
    '''
    per_day = [0 for _ in range(7)]
    for stamp in time_list:
        slot = int(stamp.weekday())
        per_day[slot] += 1
    return per_day


def time_interval(time_list):
    '''
    Gaps, in hours, between consecutive timestamps.

    Each adjacent pair contributes the absolute difference between the two
    timestamps; gaps of one week or longer are discarded.

    :param time_list: sequence of timestamps whose difference exposes
                      ``total_seconds()``
    :return: 1-D ``numpy`` array of the kept gaps in hours (empty when fewer
             than two timestamps are given or every gap is >= 7 days)
    '''
    one_week_seconds = 86400 * 7
    gaps_in_hours = []
    for previous, current in zip(time_list, time_list[1:]):
        gap_seconds = abs((previous - current).total_seconds())
        if gap_seconds < one_week_seconds:
            gaps_in_hours.append(gap_seconds / 3600)
    return np.array(gaps_in_hours)


def life_length(time_list):
    '''
    Span covered by the timestamps, in whole days, counting the first day.

    :param time_list: non-empty iterable of comparable timestamps whose
                      difference exposes ``total_seconds()``
    :return: ``int`` number of full days between the earliest and latest
             timestamp, plus one (so a single timestamp yields 1)
    '''
    latest = max(time_list)
    earliest = min(time_list)
    span_days = (latest - earliest).total_seconds() / 3600 / 24
    return int(span_days) + 1

In [None]:
def _temporal_features(time_list, prefix=""):
    '''
    Posting-time features for one subset of a user's weibos.

    :param time_list: list of timestamps (may be empty)
    :param prefix: string prepended to every feature name, e.g. ``"ret_"``
    :return: dict with, in this order, the 24 hourly counts and their
             max/argmax/std, the 7 weekday counts and their max/argmax/std,
             and the mean/std of the gaps returned by ``time_interval``
    '''
    feats = {}

    hourly = np.array(day_night(time_list))
    for i, count in enumerate(hourly):
        feats[f"{prefix}h_{i}"] = count
    feats[f"{prefix}h_max"] = hourly.max()
    feats[f"{prefix}h_argmax"] = hourly.argmax()
    feats[f"{prefix}h_std"] = hourly.std()

    weekly = np.array(weeks(time_list))
    for i, count in enumerate(weekly):
        feats[f"{prefix}w_{i}"] = count
    feats[f"{prefix}w_max"] = weekly.max()
    feats[f"{prefix}w_argmax"] = weekly.argmax()
    feats[f"{prefix}w_std"] = weekly.std()

    interval = time_interval(time_list)
    if len(interval) < 1:
        # No usable gap: fall back to 0 instead of the NaN that mean()/std()
        # of an empty array would produce.
        feats[f"{prefix}interval_mean"] = 0
        feats[f"{prefix}interval_std"] = 0
    else:
        feats[f"{prefix}interval_mean"] = interval.mean()
        feats[f"{prefix}interval_std"] = interval.std()

    return feats


users_features = []

# NOTE(review): `in_dir` is not defined in any cell visible here; it has to be
# set before this cell runs (ideally in a configuration cell near the top).
for in_name in Path(in_dir).rglob("*.json"):
    # Context manager so the file handle is closed after each user.
    with open(in_name) as f:
        d = json.load(f)
    user = d["user"]
    # NOTE(review): "NEER" and "NEEE/NFER" look like typos for "NFER" and
    # "NFEE/NFER"; the names are kept because they become CSV column names.
    u = {
        "uid": user["id"],
        "gender": 1 if user["gender"]=="男" else 0,
        "have_talent": int(user["talent"] != ""),
        "have_education": int(user["education"] != ""),
        "have_work": int(user["work"] != ""),
        "NT": np.log(user["weibo_num"] + 1),
        "NFEE": np.log(user["following"] + 1),
        "NEER": np.log(user["followers"] + 1),
        "NT/NFEE": np.log((user["weibo_num"] + 1) / (user["following"] + 1)),
        "NT/NFER": np.log((user["weibo_num"] + 1) / (user["followers"] + 1)),
        "NEEE/NFER": np.log((user["following"] + 1) / (user["followers"] + 1)),
        "len_desc": len(user["description"]),
    }
    weibos = d["weibo"]
    # Parse every publish time once and reuse it for the three subsets below.
    stamped = [(pendulum.parse(w["publish_time"]), w) for w in weibos]

    # All weibos
    time_list = [t for t, _ in stamped]
    u.update(_temporal_features(time_list))

    # A user without any weibo has no life span; life_length() would raise on
    # the empty list, so treat it as 0 days.
    days_active = life_length(time_list) if time_list else 0
    u["life_length"] = np.log(days_active + 1)
    # Average posts per day: divide by the raw number of days, not by the
    # log-transformed value stored in u["life_length"].
    u["ave_d_num"] = (user["weibo_num"] / days_active) if days_active else 0

    # Reposted (non-original) weibos
    time_list = [t for t, w in stamped if not w["original"]]
    u["ret_prop"] = len(time_list) / (user["weibo_num"] + 1)
    u.update(_temporal_features(time_list, "ret_"))

    # Weibos that mention someone (contain "@")
    time_list = [t for t, w in stamped if "@" in w["content"]]
    u["men_prop"] = len(time_list) / (user["weibo_num"] + 1)
    u.update(_temporal_features(time_list, "men_"))

    # Text features are not built in this loop.
    users_features.append(u)

df = pd.DataFrame(users_features).set_index("uid")
df.to_csv("csv", float_format="%.4f")
df

In [None]:
# 文本分析
from vocab import vocab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Merge the per-category keyword lists into one de-duplicated vocabulary.
keyword_groups = (
    vocab.community_keywords,
    vocab.financial_keywords,
    vocab.popularity_keywords,
    vocab.image_keywords,
    vocab.health_keywords,
    vocab.affiliation_keywords,
    vocab.selfacceptance_keywords,
)
unique_words = {w for group in keyword_groups for w in group}

# Longest words first. The word itself is the tie-breaker so the file is
# identical on every run; a length-only sort leaves ties in the set's
# iteration order, which can differ between runs.
word_list = sorted(unique_words, key=lambda w: (-len(w), w))
print(len(word_list))

# Explicit UTF-8: the platform default encoding may not be able to represent
# non-ASCII keywords.
with open("vocab/word_list.txt", "w", encoding="utf-8") as f:
    f.writelines(w + "\n" for w in word_list)

In [None]:
def _term_frame(matrix, column_prefix, index):
    '''
    Wrap a dense document-term matrix in a DataFrame indexed by uid.

    :param matrix: 2-D array, one row per document and one column per term
    :param column_prefix: columns are named ``<column_prefix>1``, ``<column_prefix>2``, ...
    :param index: row labels, one per document
    :return: DataFrame without its all-zero columns, index named ``"uid"``
    '''
    frame = pd.DataFrame(
        matrix,
        columns=[column_prefix + str(i + 1) for i in range(matrix.shape[1])],
    )
    frame = frame.loc[:, (frame != 0).any(axis=0)]
    frame.index = index
    frame.index.name = "uid"
    return frame


# NOTE(review): `users_corpus` and `in_dir` are not defined in any cell visible
# here. The row labels below are only correct if `users_corpus` was built by
# walking the same files in the same order - confirm.
uids = [in_name.stem for in_name in Path(in_dir).rglob("*.json")]
print(len(users_corpus))
assert len(uids) == len(users_corpus), "one corpus entry is expected per *.json file"

# NOTE(review): both writes below use the same literal path "csv", so the
# second overwrites the first - confirm the intended file names.

# Raw term counts
vectorizer = CountVectorizer(min_df=10)
counts = vectorizer.fit_transform(users_corpus)
X = counts.toarray()
print(X.shape)

text_df = _term_frame(X, "wc", uids)
text_df.to_csv("csv")

# TF-IDF weights
vectorizer = TfidfVectorizer(min_df=10)
tfidf = vectorizer.fit_transform(users_corpus)
# `vocabulary_` is a dict attribute that only exists after fitting; it is not
# a method and cannot be read before fit_transform().
print(vectorizer.vocabulary_)
X = tfidf.toarray()
print(X.shape)

text_df = _term_frame(X, "tfidf", uids)
text_df.to_csv("csv", float_format="%.4f")

In [None]:
# Load the survey (questionnaire) data and index it by user id.
# astype() returns a new object, so the cast has to be part of the chain;
# calling it on its own line and discarding the result leaves the column
# unchanged.
survey_data = (
    pd.read_excel("xlsx")
    .astype({"userID_num": "str"})
    .set_index("userID_num")
    .rename_axis("uid")
)
survey_data

In [None]:
# Keep only the columns whose name does not contain "@".
# .copy() makes the result an independent frame, so adding columns to
# `survey_data` later does not raise SettingWithCopyWarning.
survey_data = survey_data.loc[:, [c for c in survey_data.columns if "@" not in c]].copy()

In [None]:
from sklearn.cluster import KMeans

# Cluster every "DV*" column on its own into 2 and into 3 groups and store the
# labels as new "C2<column>" / "C3<column>" columns.
# The column names are collected first so the frame is not modified while it
# is being iterated; this also avoids DataFrame.iteritems(), which was removed
# in pandas 2.0.
dv_columns = [name for name in survey_data.columns if name.startswith("DV")]

for name in dv_columns:
    print(name)
    # KMeans expects a 2-D array: one row per respondent, a single feature.
    values = survey_data[name].to_numpy().reshape(-1, 1)

    for n_clusters in (2, 3):
        # n_init is pinned to 10 (the long-standing default) so the labels do
        # not depend on the installed scikit-learn version.
        y_pred = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(values)
        print(Counter(y_pred))
        survey_data.loc[:, f"C{n_clusters}{name}"] = y_pred

    print("-" * 20)

In [None]:
# Write the survey table to disk as CSV.
# NOTE(review): "csv" is the same literal path used by the other to_csv calls
# in this notebook, so this write replaces their output - confirm the
# intended file name.
survey_data.to_csv("csv")