In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

In [7]:
from sklearn.preprocessing import FunctionTransformer
columns = ['学号', '性别', '生源地', '总分', '幻觉、妄想症状', '自杀意图', '焦虑指标总分', '抑郁指标总分', '偏执指标总分', '自卑指标总分',
               '敏感指标总分', '社交恐惧指标总分', '躯体化指标总分', '依赖指标总分', '敌对攻击指标总分', '冲动指标总分', '强迫指标总分',
               '网络成瘾指标总分', '自伤行为指标总分', '进食问题指标总分', '睡眠困扰指标总分', '学校适应困难指标总分', '人际关系困扰指标总分',
               '学业压力指标总分', '就业压力指标总分', '恋爱困扰指标总分']
data = pd.read_csv('student_data.csv', encoding='utf-8')
data.drop(columns=columns, inplace=True)
data.head(3)

Unnamed: 0,幻觉、妄想症状指标标准分,自杀意图指标标准分,焦虑指标标准分,抑郁指标标准分,偏执指标标准分,自卑指标标准分,敏感指标标准分,社交恐惧指标标准分,躯体化指标标准分,依赖指标标准分,...,自伤行为指标标准分,进食问题指标标准分,睡眠困扰指标标准分,学校适应困难指标标准分,人际关系困扰指标标准分,学业压力指标标准分,就业压力指标标准分,恋爱困扰指标标准分,可能问题,text
0,2.79,2.38,0.5,3.98,1.48,3.88,2.56,4.05,1.5,3.52,...,3.04,1.65,2.33,1.56,3.28,2.22,1.89,-1.04,3,im gansu province zhangye city northwest china...
1,0.76,1.69,0.5,1.67,0.51,0.9,2.14,1.8,-0.7,0.37,...,0.17,1.65,-0.22,3.4,0.88,1.48,1.56,3.26,1,right im sitting behind computer screen wonder...
2,-0.59,1.69,-0.91,1.67,-0.94,-0.59,0.49,-0.91,0.4,-0.98,...,-0.55,-0.28,-1.06,3.4,0.4,0.75,0.25,0.87,3,look back year trampled shallowly always sever...


In [8]:
data.loc[data['可能问题'] != 0, '可能问题'] = 1
data['可能问题'].value_counts()

0    106
1     72
Name: 可能问题, dtype: int64

In [9]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=8)

In [10]:
train_df['可能问题'].value_counts()

0    84
1    58
Name: 可能问题, dtype: int64

In [11]:
# Create Function Transformer to use Feature Union
def get_numeric_data(x):
    return np.array(x.iloc[:, 0:-2])


def get_text_data(x):
    return x['text'].tolist()

In [12]:
def metadata_LR_fu():

    y_train = train_df['可能问题'].tolist()
    y_test = test_df['可能问题'].tolist()

    transformer_numeric = FunctionTransformer(get_numeric_data)
    transformer_text = FunctionTransformer(get_text_data)

    # Create a pipeline to concatenate Tfidf Vector and Numeric data
    # Use SVM as classifier
    pipeline = Pipeline([
        ('metadata', FeatureUnion([
            ('numeric_feature', Pipeline([
                ('selector', transformer_numeric)
            ])),
            ('text_features', Pipeline([
                ('selector', transformer_text),
                ('vec', TfidfVectorizer())
            ]))
        ])),
        ('clf', LogisticRegression(max_iter=200))
    ])

    # Grid Search Parameters for SGDClassifer
    parameters = {
        # 'clf__var_smoothing': (1e-4, 1e-6),
        'metadata__text_features__vec__ngram_range': [(1, 2), (1, 3)],
        'metadata__text_features__vec__use_idf': [True, False]
    }

    # Training config
    kfold = StratifiedKFold(n_splits=5)
    scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}
    refit = 'F1'

    gs_clf = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1, cv=kfold, scoring=scoring, refit=refit)
    gs_clf.fit(train_df, y_train)

    predicted = gs_clf.predict(test_df)

    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    print("precision: ", str(metrics.precision_score(y_test, predicted, average='macro')))
    print("accuracy: ", str(metrics.accuracy_score(y_test, predicted)))
    print("F1 score: ", str(metrics.f1_score(y_test, predicted, average='macro')))
    print("recall: ", str(metrics.recall_score(y_test, predicted, average='macro')))

In [13]:
metadata_LR_fu()

Fitting 5 folds for each of 4 candidates, totalling 20 fits
              precision    recall  f1-score   support

           0       0.78      0.82      0.80        22
           1       0.69      0.64      0.67        14

    accuracy                           0.75        36
   macro avg       0.74      0.73      0.73        36
weighted avg       0.75      0.75      0.75        36

[[18  4]
 [ 5  9]]
precision:  0.7374581939799332
accuracy:  0.75
F1 score:  0.7333333333333334
recall:  0.7305194805194806


In [14]:
def standard_LR():

    X_train = train_df['text'].tolist()
    X_test = test_df['text'].tolist()
    y_train = train_df['可能问题'].tolist()
    y_test = test_df['可能问题'].tolist()

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LogisticRegression(max_iter=200)),
                         ])
    parameters = {
        'vect__ngram_range': [(1, 2), (1, 3)],
        # 'clf__var_smoothing': (1e-4, 1e-6)
    }

    # Training config
    kfold = StratifiedKFold(n_splits=5)
    scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}
    refit = 'F1'

    gs_clf = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1, cv=kfold, scoring=scoring, refit=refit)
    gs_clf.fit(X_train, y_train)

    predicted = gs_clf.predict(X_test)

    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    print("precision: ", str(metrics.precision_score(y_test, predicted, average='macro')))
    print("accuracy: ", str(metrics.accuracy_score(y_test, predicted)))
    print("F1 score: ", str(metrics.f1_score(y_test, predicted, average='macro')))
    print("recall: ", str(metrics.recall_score(y_test, predicted, average='macro')))

In [15]:
standard_LR()



Fitting 5 folds for each of 2 candidates, totalling 10 fits
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        22
           1       0.00      0.00      0.00        14

    accuracy                           0.61        36
   macro avg       0.31      0.50      0.38        36
weighted avg       0.37      0.61      0.46        36

[[22  0]
 [14  0]]
precision:  0.3055555555555556
accuracy:  0.6111111111111112
F1 score:  0.37931034482758624
recall:  0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
