# 特征提取

### 用户筛选和基本信息提取

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from tqdm.notebook import tqdm
import pendulum
from collections import Counter

In [None]:
# 抽取用户时间特征

def day_night(time_list):
    """Count how many timestamps fall into each hour of the day.

    :param time_list: iterable of objects exposing an ``hour`` attribute
        (used as an index in the range 0-23).
    :return: list of 24 ints; element ``k`` is the number of timestamps
        whose hour equals ``k``.
    """
    hourly_counts = [0] * 24
    for stamp in time_list:
        # One bucket per hour of the day.
        hourly_counts[int(stamp.hour)] += 1
    return hourly_counts


def weeks(time_list):
    """Count how many timestamps fall on each day of the week.

    :param time_list: iterable of objects exposing a ``weekday()`` method
        whose result is used as an index in the range 0-6.
    :return: list of 7 ints; element ``k`` is the number of timestamps
        whose ``weekday()`` equals ``k``.
    """
    weekday_counts = [0] * 7
    for stamp in time_list:
        weekday_counts[int(stamp.weekday())] += 1
    return weekday_counts


def time_interval(time_list):
    """Return the gaps, in hours, between consecutive timestamps.

    Each neighbouring pair contributes the absolute difference between the
    two timestamps. Gaps of seven days or more are dropped.

    :param time_list: sequence of timestamps that support subtraction and
        yield an object with ``total_seconds()``.
    :return: 1-D ``numpy`` array of gap lengths in hours (possibly empty).
    """
    week_in_seconds = 86400 * 7
    gaps_in_hours = []
    for earlier, later in zip(time_list, time_list[1:]):
        gap_seconds = abs((earlier - later).total_seconds())
        if gap_seconds < week_in_seconds:
            gaps_in_hours.append(gap_seconds / 3600)
    return np.array(gaps_in_hours)


def life_length(time_list):
    """Return the span between the earliest and latest timestamp, in days.

    The span is truncated to whole days and then incremented by one, so a
    single timestamp yields 1.

    :param time_list: non-empty iterable of comparable timestamps that
        support subtraction.
    :return: int number of days (at least 1).
    """
    earliest = min(time_list)
    latest = max(time_list)
    elapsed_seconds = (latest - earliest).total_seconds()
    return int(elapsed_seconds / 3600 / 24) + 1

In [None]:
in_dir = "data/weibo_data_456_json"

users_n = []

# Keep users who (1) have at least 20 posts, (2) whose first-listed post was
# published in 2020 or later, and (3) whose first- and last-listed posts are
# at least 60 days apart.
# NOTE(review): this treats weibos[0] as the most recent post and weibos[-1]
# as the oldest, i.e. it assumes the list is sorted newest-first -- confirm
# against the crawler output.
for in_name in tqdm(Path(in_dir).rglob("*.json")):
    # Context manager so each file handle is closed as soon as it is read.
    with open(in_name, encoding="utf-8") as fh:
        d = json.load(fh)
    user = d["user"]
    weibos = d["weibo"]
    # Check the post count first: an empty timeline would otherwise raise
    # IndexError on time_list[0] below, and short timelines need no parsing.
    if len(weibos) < 20:
        continue
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos]
    recent = time_list[0]
    span = (time_list[0] - time_list[-1]).days
    if recent.year >= 2020 and span >= 60:
        users_n.append(user["id"])

len(users_n)  # the original author recorded 320 users meeting the criteria

In [None]:
# Extract per-user features: profile information, posting-time statistics for
# all posts / reposts / @-mention posts, and picture counts and ratios.

in_dir = "data/weibo_data_456_json"


def _add_time_features(u, prefix, time_list):
    """Write hour-of-day, weekday and posting-gap statistics into ``u``.

    Keys are inserted in a fixed order (24 hourly counts, their
    max/argmax/std, 7 weekday counts, their max/argmax/std, then the mean and
    std of the posting gaps), each prefixed with ``prefix``. The insertion
    order determines the column order of the resulting DataFrame.

    :param u: feature dict of one user; modified in place.
    :param prefix: string prepended to every key ("" for all posts).
    :param time_list: list of parsed publish times.
    """
    # Hour-of-day distribution.
    d_features = np.array(day_night(time_list))
    for hour, count in enumerate(d_features):
        u[f"{prefix}h_{hour}"] = count
    u[f"{prefix}h_max"] = d_features.max()
    u[f"{prefix}h_argmax"] = d_features.argmax()
    u[f"{prefix}h_std"] = d_features.std()

    # Day-of-week distribution.
    w_features = np.array(weeks(time_list))
    for day, count in enumerate(w_features):
        u[f"{prefix}w_{day}"] = count
    u[f"{prefix}w_max"] = w_features.max()
    u[f"{prefix}w_argmax"] = w_features.argmax()
    u[f"{prefix}w_std"] = w_features.std()

    # Gaps between consecutive posts; 0 when there is no usable gap.
    interval = time_interval(time_list)
    if len(interval) < 1:
        u[f"{prefix}interval_mean"] = 0
        u[f"{prefix}interval_std"] = 0
    else:
        u[f"{prefix}interval_mean"] = interval.mean()
        u[f"{prefix}interval_std"] = interval.std()


users_features = []
# Build the lookup set once so each membership test in the loop is O(1).
selected_ids = set(users_n)

for in_name in tqdm(Path(in_dir).rglob("*.json")):
    # The file name without ".json" is compared against the selected user ids.
    if in_name.name[:-5] not in selected_ids:
        continue
    # Context manager so each file handle is closed as soon as it is read.
    with open(in_name, encoding="utf-8") as fh:
        d = json.load(fh)
    user = d["user"]
    # NOTE(review): the keys "NEER" and "NEEE/NFER" look like typos of
    # "NFER" / "NFEE/NFER"; they are kept unchanged because they become the
    # CSV column names that later cells read back.
    u = {
        "uid": user["id"],
        "gender": 1 if user["gender"]=="男" else 0,
        "have_talent": int(user["talent"] != ""),
        "have_education": int(user["education"] != ""),
        "have_work": int(user["work"] != ""),
        "NT": np.log(user["weibo_num"] + 1),
        "NFEE": np.log(user["following"] + 1),
        "NEER": np.log(user["followers"] + 1),
        "NT/NFEE": np.log((user["weibo_num"] + 1) / (user["following"] + 1)),
        "NT/NFER": np.log((user["weibo_num"] + 1) / (user["followers"] + 1)),
        "NEEE/NFER": np.log((user["following"] + 1) / (user["followers"] + 1)),
        "len_desc": len(user["description"]),
    }
    weibos = d["weibo"]

    # All posts.
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos]
    _add_time_features(u, "", time_list)
    u["life_length"] = np.log(life_length(time_list) + 1)
    # NOTE(review): the divisor is the log-transformed life length stored on
    # the previous line, not the raw number of days -- confirm whether a
    # per-day average was intended before changing this feature.
    u["ave_d_num"] = user["weibo_num"] / u["life_length"]

    # Reposts (posts not flagged as original).
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos if not w["original"]]
    u["ret_prop"] = len(time_list) / (user["weibo_num"] + 1)
    _add_time_features(u, "ret_", time_list)

    # Posts whose content contains an "@" mention.
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos if "@" in w["content"]]
    u["men_prop"] = len(time_list) / (user["weibo_num"] + 1)
    _add_time_features(u, "men_", time_list)

    # Picture statistics: "无" marks a post without pictures; otherwise the
    # field is a comma-separated list.
    pic_num = 0
    pic_y = 0
    for w in weibos:
        if w["original_pictures"] != "无":
            pic_y += 1
            pic_num += len(w["original_pictures"].split(","))
    u["pic_num"] = np.log(pic_num + 1)
    u["pic_prop"] = pic_y / len(weibos)

    # Text features are produced by a separate section of this notebook.
    users_features.append(u)

df = pd.DataFrame(users_features).set_index("uid")
# To persist the table read back by the training section:
#df.to_csv("features_202207/users320_b+t_features_202210.csv", float_format="%.4f")
df

### 文本特征提取

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import json
import jieba
from pandas.core.frame import DataFrame
from tqdm.notebook import tqdm
import pendulum

In [None]:
# Load the pretrained word vectors into a {word: vector} dictionary.
VEC_DIM = 300  # number of vector components read for every word

wl = []    # words, in file order
vecl = []  # vectors, aligned with wl
# Context manager so the file is closed even if parsing raises.
with open("sgns.weibo.word/sgns.weibo.word", encoding='utf-8') as weibo_word:
    next(weibo_word, None)  # the first line is header information, not a vector
    for line in weibo_word:
        list_w = line.split()
        # Skip blank or truncated lines (e.g. a trailing newline) instead of
        # failing with IndexError.
        if len(list_w) <= VEC_DIM:
            continue
        wl.append(list_w[0])
        vecl.append([float(value) for value in list_w[1:VEC_DIM + 1]])


word_dict = dict(zip(wl, vecl))

In [None]:
in_dir = "data/weibo_data_456_json"

users_n = []

# Keep users who (1) have at least 20 posts, (2) whose first-listed post was
# published in 2020 or later, and (3) whose first- and last-listed posts are
# at least 60 days apart.
# NOTE(review): this treats weibos[0] as the most recent post and weibos[-1]
# as the oldest, i.e. it assumes the list is sorted newest-first -- confirm
# against the crawler output.
for in_name in tqdm(Path(in_dir).rglob("*.json")):
    # Context manager so each file handle is closed as soon as it is read.
    with open(in_name, encoding="utf-8") as fh:
        d = json.load(fh)
    user = d["user"]
    weibos = d["weibo"]
    # Check the post count first: an empty timeline would otherwise raise
    # IndexError on time_list[0] below, and short timelines need no parsing.
    if len(weibos) < 20:
        continue
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos]
    recent = time_list[0]
    span = (time_list[0] - time_list[-1]).days
    if recent.year >= 2020 and span >= 60:
        users_n.append(user["id"])

In [None]:
# Read the raw post files of the selected users and build one text per user.
in_dir = "data/weibo_data_456_json"

a = []  # user ids, in file-iteration order
b = []  # concatenated post text + profile description, aligned with a
# Build the lookup set once so each membership test in the loop is O(1).
selected_ids = set(users_n)
for in_name in Path(in_dir).rglob("*.json"):
    # The file name without ".json" is compared against the selected user ids.
    if in_name.name[:-5] not in selected_ids:
        continue
    # Context manager so each file handle is closed as soon as it is read.
    with open(in_name, encoding="utf-8") as fh:
        d = json.load(fh)
    uid = d["user"]["id"]
    all_weibo = " ".join(w['content'] for w in d["weibo"])
    description = d["user"]["description"]
    # " ".join() applied to a plain string inserts a space between every
    # character, which would break word segmentation later; only join when
    # the description is a sequence of strings.
    if isinstance(description, str):
        all_desc = description
    else:
        all_desc = " ".join(description)
    all_weibo_desc = all_weibo + " " + all_desc
    a.append(uid)
    b.append(all_weibo_desc)

In [None]:
# Pair the user ids (a) with their concatenated texts (b) in a two-column
# DataFrame, then pull the text column out as a Series.
c = dict(uid=a, content=b)
users_weibo = DataFrame(c)
content = users_weibo["content"]

In [None]:
# Tokenise every user's text with jieba; seg_list holds one token list per
# row of content, in the same order.
seg_list = [jieba.lcut(text) for text in content]

In [None]:
# For every user, sum the vectors of all tokens found in word_dict and divide
# by the user's total token count (out-of-vocabulary tokens included in the
# denominator).
word_vec1 = []
for tokens in seg_list:
    # Accumulate in a single ndarray instead of converting list <-> array on
    # every token.
    vec_sum = np.zeros(300)
    for token in tokens:
        vec = word_dict.get(token)
        if vec is not None:
            vec_sum += vec
    # An empty token list keeps the zero vector instead of producing NaN.
    if tokens:
        vec_sum /= len(tokens)
    word_vec1.append(vec_sum.tolist())

In [None]:
# Turn the list of mean vectors into a DataFrame with columns vec1..vecN.
word_v = DataFrame(word_vec1)
word_v.columns = [f"vec{k + 1}" for k in range(word_v.shape[1])]
# Label each row with the uid that was loaded together with its text, so the
# index is aligned with the vectors row by row (rather than relying on
# users_n having the same length and order).
word_v.index = users_weibo["uid"].to_numpy()
word_v.index.name = "uid"
# To persist the table read back by the training section:
#word_v.to_csv("features_202207/users320_vec_mean_features_202210.csv")

### 数据按均值标准差分类

In [None]:
import pandas as pd
import numpy as np
from numpy import *
from collections import Counter
from tqdm.notebook import tqdm
import pendulum
from pathlib import Path
import json

In [None]:
in_dir = "data/weibo_data_456_json"

users_n = []

# Keep users who (1) have at least 20 posts, (2) whose first-listed post was
# published in 2020 or later, and (3) whose first- and last-listed posts are
# at least 60 days apart.
# NOTE(review): this treats weibos[0] as the most recent post and weibos[-1]
# as the oldest, i.e. it assumes the list is sorted newest-first -- confirm
# against the crawler output.
for in_name in tqdm(Path(in_dir).rglob("*.json")):
    # Context manager so each file handle is closed as soon as it is read.
    with open(in_name, encoding="utf-8") as fh:
        d = json.load(fh)
    user = d["user"]
    weibos = d["weibo"]
    # Check the post count first: an empty timeline would otherwise raise
    # IndexError on time_list[0] below, and short timelines need no parsing.
    if len(weibos) < 20:
        continue
    time_list = [pendulum.parse(w["publish_time"]) for w in weibos]
    recent = time_list[0]
    span = (time_list[0] - time_list[-1]).days
    if recent.year >= 2020 and span >= 60:
        users_n.append(user["id"])


In [None]:
# Load the questionnaire data and index it by user id.
# The former `survey_data["userID_num"].astype("str")` statement discarded its
# result and therefore had no effect; it is removed so the code no longer
# suggests that the index holds strings.
survey_data = pd.read_excel("data/456ID_data_0611.xlsx")
survey_data = survey_data.set_index("userID_num")
survey_data.index.name = "uid"

# The ids collected in users_n are converted to int before matching them
# against the index (presumably numeric ids in the workbook -- verify).
users_nn = [int(uid) for uid in users_n]
survey_data = survey_data[survey_data.index.isin(users_nn)]
#survey_data

In [None]:
# Drop every column whose name contains "@".
kept_columns = [name for name in survey_data.columns.to_list() if "@" not in name]
survey_data = survey_data[kept_columns]
#survey_data

In [None]:
# Turn every "DV*" score column into a 3-class label column "C3<name>":
#   0 -> score below  mean - std/2
#   2 -> score above  mean + std/2
#   1 -> everything in between
y_pred = []
# Fix the list of source columns up front so adding the "C3..." columns does
# not interfere with the iteration.
dv_columns = [name for name in survey_data.columns if name.startswith("DV")]
for col_name in dv_columns:
    print(col_name)
    values = np.asarray(survey_data[col_name], dtype=float)

    # Compute the thresholds once per column (population std, ddof=0, as
    # numpy's std does by default).
    center = values.mean()
    half_band = values.std() / 2
    labels = np.select(
        [values < center - half_band, values > center + half_band],
        [0, 2],
        default=1,
    )
    # Plain Python ints; the length follows the data instead of a fixed 320.
    y_pred = labels.tolist()

    print(Counter(y_pred))
    survey_data.loc[:, "C3" + col_name] = y_pred

    print("-" * 20)

# To persist the table read back by the training section:
#survey_data.to_csv("features_202207/users320_survey_emotion_clas_202210.csv")
#survey_data.to_csv("features_202207/users320_survey_emotion_clas_202210_gbk.csv",encoding='gbk')

# 模型训练

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the three feature/label tables, all indexed by "uid", and join them.
# The former `w_feas["uid"].astype("str")` statement discarded its result and
# had no effect; the index is now set directly while reading, like the other
# two tables.
w_feas = pd.read_csv("features_202207/users320_b+t_features_202210.csv", index_col="uid")
cols_feas1 = w_feas.columns
text_df = pd.read_csv("features_202207/users320_vec_mean_features_202210.csv", index_col="uid")
cols_feas2 = text_df.columns
# Names of all model input columns: behavioural features, then text features.
cols_feas = list(cols_feas1) + list(cols_feas2)
survey_data = pd.read_csv("features_202207/users320_survey_emotion_clas_202210.csv", index_col="uid")
# Left joins on the index: one row per survey respondent.
all_data = survey_data.join(w_feas).join(text_df).copy()
all_data

In [None]:
# Model inputs: every behavioural and text feature column.
X = all_data[cols_feas]
# Targets: every column whose name starts with "C3".
label_cols = [name for name in all_data.columns.to_list() if name.startswith("C3")]
print("y cols:", label_cols)
y = all_data[label_cols]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with MinMaxScaler; X becomes a numpy array.
# NOTE(review): the scaler is fitted on all rows before any train/test split
# or cross-validation, so held-out rows influence the scaling -- confirm this
# is acceptable for the reported scores.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
from collections import Counter
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     cross_validate, train_test_split)
from sklearn.metrics import auc, classification_report, f1_score, roc_curve

from sklearn import linear_model,svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
import warnings
# Suppress every warning from here on.
# NOTE(review): this presumably also hides library warnings raised while the
# models below are fitted -- confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
def train_sjpc():
    """Train and evaluate five classifiers on the "社交排斥" label column(s).

    Reads the module-level feature matrix ``X`` and label frame ``y``. For
    every column of ``y`` whose name ends with "社交排斥", each model is

    * fitted on a 70% training split (``random_state=12``) and scored on it,
    * scored with 10-fold cross-validation on that training split,
    * scored with macro F1 on the 30% test split,
    * scored with 10-fold cross-validation on all rows for accuracy and
      one-vs-rest ROC AUC.

    One table of scores (multiplied by 100) is displayed per model.

    :return: None
    """
    models = {
        "LR": linear_model.LogisticRegression(C=1, solver="newton-cg"),
        "K-Neighbors": KNeighborsClassifier(n_neighbors= 3),
        "SVM": svm.SVC(C=6, probability=True),
        "Random Forest": RandomForestClassifier(n_estimators = 4, max_depth = 5),
        "NN": MLPClassifier((64, 32), max_iter=100, solver='lbfgs')
    }

    for model_name, clf in models.items():
        rsts = {}
        print("*" * 100)
        print(f'Model: {model_name}')

        # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
        for col_name, y_i in y.items():
            if not col_name.endswith("社交排斥"):
                continue

            y_i = np.array(y_i)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_i, test_size=0.3, random_state=12)

            print(col_name)
            clf.fit(X_train, y_train)
            train_perf = clf.score(X_train, y_train)
            cv_perf = cross_val_score(clf, X_train, y_train, cv=10).mean()

            # Hold-out evaluation on the 30% test split.
            y_hat = clf.predict(X_test)
            print('预测结果 =', Counter(y_hat))
            print('实际结果 =', Counter(y_test))
            f1 = f1_score(y_test, y_hat, average='macro')

            # 10-fold cross-validation over all rows.
            metrics = ('accuracy', 'roc_auc_ovr')
            scores = cross_validate(clf, X, y_i, cv=10, scoring=metrics)

            rsts[col_name] = {
                "train dataset": train_perf,
                "cross validation": cv_perf,
                "test dataset (f1)": f1,
                "accuracy": scores['test_accuracy'].mean(),
                "roc_auc_ovr": scores['test_roc_auc_ovr'].mean()
            }

        print("- * " * 20)

        # Scores as percentages, one row per label column.
        rsts = pd.DataFrame(rsts) * 100
        rsts = rsts.T
        display(rsts)

        # To persist the per-model table:
        #rsts.to_csv(f"result2210/rsts1-sjpc-model={model_name}.csv", float_format="%.4f", encoding="gbk")

In [None]:
# Run the experiment for the label column(s) ending with "社交排斥".
train_sjpc()

In [None]:
def train_eyym():
    """Train and evaluate five classifiers on the "恶意幽默" label column(s).

    Reads the module-level feature matrix ``X`` and label frame ``y``. For
    every column of ``y`` whose name ends with "恶意幽默", each model is

    * fitted on a 70% training split (``random_state=12``) and scored on it,
    * scored with 10-fold cross-validation on that training split,
    * scored with macro F1 on the 30% test split,
    * scored with 10-fold cross-validation on all rows for accuracy and
      one-vs-rest ROC AUC.

    One table of scores (multiplied by 100) is displayed per model.

    :return: None
    """
    models = {
        "LR": linear_model.LogisticRegression(C=1, solver="newton-cg"),
        "K-Neighbors": KNeighborsClassifier(n_neighbors= 3),
        "SVM": svm.SVC(C=6, probability=True),
        "Random Forest": RandomForestClassifier(n_estimators = 4, max_depth = 5),
        "NN": MLPClassifier((64, 32), max_iter=100, solver='lbfgs')
    }

    for model_name, clf in models.items():
        rsts = {}
        print("*" * 100)
        print(f'Model: {model_name}')

        # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
        for col_name, y_i in y.items():
            if not col_name.endswith("恶意幽默"):
                continue

            y_i = np.array(y_i)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_i, test_size=0.3, random_state=12)

            print(col_name)
            clf.fit(X_train, y_train)
            train_perf = clf.score(X_train, y_train)
            cv_perf = cross_val_score(clf, X_train, y_train, cv=10).mean()

            # Hold-out evaluation on the 30% test split.
            y_hat = clf.predict(X_test)
            print('预测结果 =', Counter(y_hat))
            print('实际结果 =', Counter(y_test))
            f1 = f1_score(y_test, y_hat, average='macro')

            # 10-fold cross-validation over all rows.
            metrics = ('accuracy', 'roc_auc_ovr')
            scores = cross_validate(clf, X, y_i, cv=10, scoring=metrics)

            rsts[col_name] = {
                "train dataset": train_perf,
                "cross validation": cv_perf,
                "test dataset (f1)": f1,
                "accuracy": scores['test_accuracy'].mean(),
                "roc_auc_ovr": scores['test_roc_auc_ovr'].mean()
            }

        print("- * " * 20)

        # Scores as percentages, one row per label column.
        rsts = pd.DataFrame(rsts) * 100
        rsts = rsts.T
        display(rsts)

        # To persist the per-model table:
        #rsts.to_csv(f"result2210/rsts1-eyym-model={model_name}.csv", float_format="%.4f", encoding="gbk")

In [None]:
# Run the experiment for the label column(s) ending with "恶意幽默".
train_eyym()

In [None]:
def train_njyd():
    """Train and evaluate five classifiers on the "内疚诱导" label column(s).

    Reads the module-level feature matrix ``X`` and label frame ``y``. For
    every column of ``y`` whose name ends with "内疚诱导", each model is

    * fitted on a 70% training split (``random_state=12``) and scored on it,
    * scored with 10-fold cross-validation on that training split,
    * scored with macro F1 on the 30% test split,
    * scored with 10-fold cross-validation on all rows for accuracy and
      one-vs-rest ROC AUC.

    One table of scores (multiplied by 100) is displayed per model.

    :return: None
    """
    models = {
        "LR": linear_model.LogisticRegression(C=1, solver="newton-cg"),
        "K-Neighbors": KNeighborsClassifier(n_neighbors= 3),
        "SVM": svm.SVC(C=6, probability=True),
        "Random Forest": RandomForestClassifier(n_estimators = 4, max_depth = 5),
        "NN": MLPClassifier((64, 32), max_iter=100, solver='lbfgs')
    }

    for model_name, clf in models.items():
        rsts = {}
        print("*" * 100)
        print(f'Model: {model_name}')

        # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
        for col_name, y_i in y.items():
            if not col_name.endswith("内疚诱导"):
                continue

            y_i = np.array(y_i)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_i, test_size=0.3, random_state=12)

            print(col_name)
            clf.fit(X_train, y_train)
            train_perf = clf.score(X_train, y_train)
            cv_perf = cross_val_score(clf, X_train, y_train, cv=10).mean()

            # Hold-out evaluation on the 30% test split.
            y_hat = clf.predict(X_test)
            print('预测结果 =', Counter(y_hat))
            print('实际结果 =', Counter(y_test))
            f1 = f1_score(y_test, y_hat, average='macro')

            # 10-fold cross-validation over all rows.
            metrics = ('accuracy', 'roc_auc_ovr')
            scores = cross_validate(clf, X, y_i, cv=10, scoring=metrics)

            rsts[col_name] = {
                "train dataset": train_perf,
                "cross validation": cv_perf,
                "test dataset (f1)": f1,
                "accuracy": scores['test_accuracy'].mean(),
                "roc_auc_ovr": scores['test_roc_auc_ovr'].mean()
            }

        print("- * " * 20)

        # Scores as percentages, one row per label column.
        rsts = pd.DataFrame(rsts) * 100
        rsts = rsts.T
        display(rsts)

        # To persist the per-model table:
        #rsts.to_csv(f"result2210/rsts1-njyd-model={model_name}.csv", float_format="%.4f", encoding="gbk")

In [None]:
# Run the experiment for the label column(s) ending with "内疚诱导".
train_njyd()