In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split


In [8]:
from sklearn.preprocessing import FunctionTransformer
columns = ['学号', '性别', '生源地', '总分', '幻觉、妄想症状', '自杀意图', '焦虑指标总分', '抑郁指标总分', '偏执指标总分', '自卑指标总分',
               '敏感指标总分', '社交恐惧指标总分', '躯体化指标总分', '依赖指标总分', '敌对攻击指标总分', '冲动指标总分', '强迫指标总分',
               '网络成瘾指标总分', '自伤行为指标总分', '进食问题指标总分', '睡眠困扰指标总分', '学校适应困难指标总分', '人际关系困扰指标总分',
               '学业压力指标总分', '就业压力指标总分', '恋爱困扰指标总分']
data = pd.read_csv('student_data.csv', encoding='utf-8')
data.drop(columns=columns, inplace=True)
# data.head(3)

In [9]:
data.loc[data['可能问题'] != 0, '可能问题'] = 1
data['可能问题'].value_counts()

0    106
1     72
Name: 可能问题, dtype: int64

In [10]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=8)

In [11]:
train_df['可能问题'].value_counts()

0    84
1    58
Name: 可能问题, dtype: int64

In [12]:
# Create Function Transformer to use Feature Union
def get_numeric_data(x):
    return np.array(x.iloc[:, 0:-2])


def get_text_data(x):
    return x['text'].tolist()

In [13]:
def metadata_svm_fu():

    y_train = train_df['可能问题'].tolist()
    y_test = test_df['可能问题'].tolist()

    transformer_numeric = FunctionTransformer(get_numeric_data)
    transformer_text = FunctionTransformer(get_text_data)

    # Create a pipeline to concatenate Tfidf Vector and Numeric data
    # Use SVM as classifier
    pipeline = Pipeline([
        ('metadata', FeatureUnion([
            ('numeric_feature', Pipeline([
                ('selector', transformer_numeric)
            ])),
            ('text_features', Pipeline([
                ('selector', transformer_text),
                ('vec', TfidfVectorizer())
            ]))
        ])),
        ('clf', SGDClassifier())
    ])

    # Grid Search Parameters for SGDClassifer
    parameters = {
        'clf__alpha': (1e-4, 1e-6),
        'metadata__text_features__vec__ngram_range': [(1, 2), (1, 3)],
        'metadata__text_features__vec__use_idf': [True, False]
    }

    # Training config
    kfold = StratifiedKFold(n_splits=5)
    scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}
    refit = 'F1'

    gs_clf = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1, cv=kfold, scoring=scoring, refit=refit)
    gs_clf.fit(train_df, y_train)

    predicted = gs_clf.predict(test_df)

    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    print("precision: ", str(metrics.precision_score(y_test, predicted, average='macro')))
    print("accuracy: ", str(metrics.accuracy_score(y_test, predicted)))
    print("F1 score: ", str(metrics.f1_score(y_test, predicted, average='macro')))
    print("recall: ", str(metrics.recall_score(y_test, predicted, average='macro')))

In [14]:
metadata_svm_fu()

Fitting 5 folds for each of 8 candidates, totalling 40 fits
              precision    recall  f1-score   support

           0       0.79      0.68      0.73        22
           1       0.59      0.71      0.65        14

    accuracy                           0.69        36
   macro avg       0.69      0.70      0.69        36
weighted avg       0.71      0.69      0.70        36

[[15  7]
 [ 4 10]]
precision:  0.6888544891640866
accuracy:  0.6944444444444444
F1 score:  0.6884343036978757
recall:  0.698051948051948


In [15]:
def standard_svm():

    X_train = train_df['text'].tolist()
    X_test = test_df['text'].tolist()
    y_train = train_df['可能问题'].tolist()
    y_test = test_df['可能问题'].tolist()

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier()),
                         ])
    parameters = {
        'vect__ngram_range': [(1, 2), (1, 3)],
        'clf__alpha': (1e-4, 1e-6)
    }

    # Training config
    kfold = StratifiedKFold(n_splits=5)
    scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}
    refit = 'F1'

    gs_clf = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1, cv=kfold, scoring=scoring, refit=refit)
    gs_clf.fit(X_train, y_train)

    predicted = gs_clf.predict(X_test)

    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    print("precision: ", str(metrics.precision_score(y_test, predicted, average='macro')))
    print("accuracy: ", str(metrics.accuracy_score(y_test, predicted)))
    print("F1 score: ", str(metrics.f1_score(y_test, predicted, average='macro')))
    print("recall: ", str(metrics.recall_score(y_test, predicted, average='macro')))

In [16]:
standard_svm()

Fitting 5 folds for each of 4 candidates, totalling 20 fits
              precision    recall  f1-score   support

           0       0.63      0.86      0.73        22
           1       0.50      0.21      0.30        14

    accuracy                           0.61        36
   macro avg       0.57      0.54      0.52        36
weighted avg       0.58      0.61      0.56        36

[[19  3]
 [11  3]]
precision:  0.5666666666666667
accuracy:  0.6111111111111112
F1 score:  0.5153846153846153
recall:  0.538961038961039
