In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from collections import Counter
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     cross_validate, train_test_split)
from sklearn.metrics import auc, classification_report, f1_score, roc_curve

from sklearn import linear_model,svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

import warnings
# Globally suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used in later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

### 134个基本特征+词向量特征

In [None]:
# Load the first feature table, indexed by uid.
# uid is left in the dtype inferred by read_csv so that it matches the uid
# index of the other two files in the joins below (casting only this frame's
# uid to str would make the join keys differ in dtype).
w_feas = pd.read_csv("features_202207/users320_b+t_features_202210.csv", index_col="uid")
cols_feas1 = w_feas.columns

# Load the second feature table (file name suggests mean word vectors — confirm).
text_df = pd.read_csv("features_202207/users320_vec_mean_features_202210.csv", index_col="uid")
cols_feas2 = text_df.columns

# Names of every feature column, in the order they appear after the joins.
cols_feas = list(cols_feas1) + list(cols_feas2)

# Survey table; it is the left side of the joins, so its uids define the rows.
survey_data = pd.read_csv("features_202207/users320_survey_emotion_clas_202210.csv", index_col="uid")
all_data = survey_data.join(w_feas).join(text_df)

In [None]:
# Total number of feature columns collected from the two feature files.
len(cols_feas)

In [None]:
# Feature matrix: every column contributed by the two feature files.
X = all_data[cols_feas]

# Label frame: all columns whose name starts with "C3".
target_cols = [name for name in all_data.columns if name.startswith("C3")]
print("y cols:", target_cols)
y = all_data[target_cols]

In [None]:
# Rescale every feature column with min-max scaling (default range [0, 1]).
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so held-out rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

In [None]:
def sjpc_train():
    """Train and evaluate LR, SVM and NN classifiers on the "DV2_社交排斥" target.

    Reads the module-level feature matrix ``X`` and label frame ``y``. For each
    model and each column of ``y`` whose name ends with "DV2_社交排斥" it:

    * holds out 30% of the rows (fixed seed) as a test split,
    * fits the model on the remaining rows and records its training score,
    * records the mean 10-fold cross-validation score on the training split,
    * records the macro-averaged F1 score on the held-out test split,
    * records mean 10-fold accuracy and one-vs-rest ROC AUC over all rows.

    One table per model, with values multiplied by 100, is shown via
    ``display``.

    Returns:
        None. Results are only printed and displayed.
    """
    seed = 12
    target_suffix = "DV2_社交排斥"
    cv_metrics = ('accuracy', 'roc_auc_ovr')

    # Disabled alternatives: KNeighborsClassifier(n_neighbors=3),
    # RandomForestClassifier(n_estimators=4, max_depth=5).
    # SVC (with probability=True) and MLPClassifier are stochastic, so they are
    # seeded to make repeated runs produce the same numbers.
    models = {
        "LR": linear_model.LogisticRegression(C=1, solver="newton-cg"),
        "SVM": svm.SVC(C=8, probability=True, random_state=seed),
        "NN": MLPClassifier((256,64), max_iter=80, solver='lbfgs', random_state=seed)
    }

    for model_name, clf in models.items():
        rsts = {}
        print("*" * 100)
        print(f'Model: {model_name}')

        # DataFrame.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that works on both old and new versions.
        for col_name, y_i in y.items():
            if not col_name.endswith(target_suffix):
                continue

            y_i = np.array(y_i)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_i, test_size=0.3, random_state=seed
            )

            print(col_name)
            clf.fit(X_train, y_train)
            train_perf = clf.score(X_train, y_train)
            # cross_val_score works on clones, so the fitted clf above is kept.
            cv_perf = cross_val_score(clf, X_train, y_train, cv=10).mean()

            y_hat = clf.predict(X_test)
            f1 = f1_score(y_test, y_hat, average='macro')

            # Cross-validated metrics over all rows (train and test together).
            scores = cross_validate(clf, X, y_i, cv=10, scoring=cv_metrics)

            rsts[col_name] = {
                "train dataset": train_perf,
                "cross validation": cv_perf,
                "test dataset (f1)": f1,
                "accuracy": scores['test_accuracy'].mean(),
                "roc_auc_ovr": scores['test_roc_auc_ovr'].mean()
            }

        print("- * " * 20)

        # One row per target column, one column per metric, in percent.
        rsts = (pd.DataFrame(rsts) * 100).T
        display(rsts)

        # Optional export of the table:
        #rsts.to_csv(f"result2210/rsts1-sjpc-model={model_name}.csv", float_format="%.4f", encoding="gbk")

In [None]:
# Evaluate all models on the target column(s) ending with "DV2_社交排斥".
sjpc_train()

In [None]:
def eyym_train():
    """Train and evaluate LR, SVM and NN classifiers on the "恶意幽默" target.

    Reads the module-level feature matrix ``X`` and label frame ``y``. For each
    model and each column of ``y`` whose name ends with "恶意幽默" it:

    * holds out 30% of the rows (fixed seed) as a test split,
    * fits the model on the remaining rows and records its training score,
    * records the mean 10-fold cross-validation score on the training split,
    * records the macro-averaged F1 score on the held-out test split,
    * records mean 10-fold accuracy and one-vs-rest ROC AUC over all rows.

    One table per model, with values multiplied by 100, is shown via
    ``display``.

    Returns:
        None. Results are only printed and displayed.
    """
    seed = 12
    target_suffix = "恶意幽默"
    cv_metrics = ('accuracy', 'roc_auc_ovr')

    # Disabled alternatives: KNeighborsClassifier(n_neighbors=3),
    # RandomForestClassifier(n_estimators=4, max_depth=5).
    # SVC (with probability=True) and MLPClassifier are stochastic, so they are
    # seeded to make repeated runs produce the same numbers.
    models = {
        "LR": linear_model.LogisticRegression(C=1, solver="newton-cg"),
        "SVM": svm.SVC(C=8, probability=True, random_state=seed),
        "NN": MLPClassifier((256,64), max_iter=80, solver='lbfgs', random_state=seed)
    }

    for model_name, clf in models.items():
        rsts = {}
        print("*" * 100)
        print(f'Model: {model_name}')

        # DataFrame.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that works on both old and new versions.
        for col_name, y_i in y.items():
            if not col_name.endswith(target_suffix):
                continue

            y_i = np.array(y_i)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_i, test_size=0.3, random_state=seed
            )

            print(col_name)
            clf.fit(X_train, y_train)
            train_perf = clf.score(X_train, y_train)
            # cross_val_score works on clones, so the fitted clf above is kept.
            cv_perf = cross_val_score(clf, X_train, y_train, cv=10).mean()

            y_hat = clf.predict(X_test)
            f1 = f1_score(y_test, y_hat, average='macro')

            # Cross-validated metrics over all rows (train and test together).
            scores = cross_validate(clf, X, y_i, cv=10, scoring=cv_metrics)

            rsts[col_name] = {
                "train dataset": train_perf,
                "cross validation": cv_perf,
                "test dataset (f1)": f1,
                "accuracy": scores['test_accuracy'].mean(),
                "roc_auc_ovr": scores['test_roc_auc_ovr'].mean()
            }

        print("- * " * 20)

        # One row per target column, one column per metric, in percent.
        rsts = (pd.DataFrame(rsts) * 100).T
        display(rsts)

        # Optional export of the table (file name is specific to this target so
        # it cannot overwrite the output of the other training functions):
        #rsts.to_csv(f"result2210/rsts1-eyym-model={model_name}.csv", float_format="%.4f", encoding="gbk")

In [None]:
# Evaluate all models on the target column(s) ending with "恶意幽默".
eyym_train()

In [None]:
def njyd_train():
    """Train and evaluate LR, SVM and NN classifiers on the "内疚诱导" target.

    Reads the module-level feature matrix ``X`` and label frame ``y``. For each
    model and each column of ``y`` whose name ends with "内疚诱导" it:

    * holds out 30% of the rows (fixed seed) as a test split,
    * fits the model on the remaining rows and records its training score,
    * records the mean 10-fold cross-validation score on the training split,
    * records the macro-averaged F1 score on the held-out test split,
    * records mean 10-fold accuracy and one-vs-rest ROC AUC over all rows.

    One table per model, with values multiplied by 100, is shown via
    ``display``.

    Returns:
        None. Results are only printed and displayed.
    """
    seed = 12
    target_suffix = "内疚诱导"
    cv_metrics = ('accuracy', 'roc_auc_ovr')

    # Disabled alternatives: KNeighborsClassifier(n_neighbors=3),
    # RandomForestClassifier(n_estimators=4, max_depth=5).
    # The liblinear solver, SVC (with probability=True) and MLPClassifier all
    # accept random_state; seeding them makes repeated runs reproducible.
    models = {
        "LR": linear_model.LogisticRegression(C=0.6, solver="liblinear", random_state=seed),
        "SVM": svm.SVC(C=8, probability=True, random_state=seed),
        "NN": MLPClassifier((256,64), max_iter=100, solver='lbfgs', random_state=seed)
    }

    for model_name, clf in models.items():
        rsts = {}
        print("*" * 100)
        print(f'Model: {model_name}')

        # DataFrame.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that works on both old and new versions.
        for col_name, y_i in y.items():
            if not col_name.endswith(target_suffix):
                continue

            y_i = np.array(y_i)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y_i, test_size=0.3, random_state=seed
            )

            print(col_name)
            clf.fit(X_train, y_train)
            train_perf = clf.score(X_train, y_train)
            # cross_val_score works on clones, so the fitted clf above is kept.
            cv_perf = cross_val_score(clf, X_train, y_train, cv=10).mean()

            y_hat = clf.predict(X_test)
            f1 = f1_score(y_test, y_hat, average='macro')

            # Cross-validated metrics over all rows (train and test together).
            scores = cross_validate(clf, X, y_i, cv=10, scoring=cv_metrics)

            rsts[col_name] = {
                "train dataset": train_perf,
                "cross validation": cv_perf,
                "test dataset (f1)": f1,
                "accuracy": scores['test_accuracy'].mean(),
                "roc_auc_ovr": scores['test_roc_auc_ovr'].mean()
            }

        print("- * " * 20)

        # One row per target column, one column per metric, in percent.
        rsts = (pd.DataFrame(rsts) * 100).T
        display(rsts)

        # Optional export of the table (file name is specific to this target so
        # it cannot overwrite the output of the other training functions):
        #rsts.to_csv(f"result2210/rsts1-njyd-model={model_name}.csv", float_format="%.4f", encoding="gbk")

In [None]:
# Evaluate all models on the target column(s) ending with "内疚诱导".
njyd_train() 