In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

In [2]:
from sklearn.preprocessing import FunctionTransformer
columns = ['学号', '性别', '生源地', '总分', '幻觉、妄想症状', '自杀意图', '焦虑指标总分', '抑郁指标总分', '偏执指标总分', '自卑指标总分',
               '敏感指标总分', '社交恐惧指标总分', '躯体化指标总分', '依赖指标总分', '敌对攻击指标总分', '冲动指标总分', '强迫指标总分',
               '网络成瘾指标总分', '自伤行为指标总分', '进食问题指标总分', '睡眠困扰指标总分', '学校适应困难指标总分', '人际关系困扰指标总分',
               '学业压力指标总分', '就业压力指标总分', '恋爱困扰指标总分']
data = pd.read_csv('student_data.csv', encoding='utf-8')
print('dataframe shape before drop: ', data.shape)
data.drop(columns=columns, inplace=True)
print('dataframe shape after drop: ', data.shape)
train_df, test_df = train_test_split(data, test_size=0.3, random_state=8)

dataframe shape before drop:  (178, 50)
dataframe shape after drop:  (178, 24)


In [3]:
train_df['可能问题'].value_counts()

0    75
1    27
3    11
2    11
Name: 可能问题, dtype: int64

In [4]:
train_class_0 = train_df[train_df['可能问题']==0]
train_class_1 = train_df[train_df['可能问题']==1]
train_class_2 = train_df[train_df['可能问题']==2]
train_class_3 = train_df[train_df['可能问题']==3]

In [5]:
from textaugment import EDA

In [6]:
t = EDA(random_state=8)
new_text_1 = [t.random_swap(t.random_insertion(t.random_deletion(t.synonym_replacement(sent), p=0.4))) for sent in train_class_1['text']]
new_text_2 = [t.random_swap(t.random_insertion(t.random_deletion(t.synonym_replacement(sent), p=0.4))) for sent in train_class_2['text']]
new_text_3 = [t.random_swap(t.random_insertion(t.random_deletion(t.synonym_replacement(sent), p=0.4))) for sent in train_class_3['text']]

In [7]:
new_class_1 = train_class_1
new_class_1['text'] = new_text_1
train_1 = pd.concat([train_class_1, new_class_1])

new_class_2 = train_class_2
new_class_2['text'] = new_text_2
train_2 = pd.concat([train_class_2, new_class_2])

new_class_3 = train_class_3
new_class_3['text'] = new_text_3
train_3 = pd.concat([train_class_3, new_class_3])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [8]:
train = pd.concat([train_class_0, train_1, train_2, train_3])
train['可能问题'].value_counts()

0    75
1    54
3    22
2    22
Name: 可能问题, dtype: int64

In [9]:
print(train.columns)
print(train.shape)

Index(['幻觉、妄想症状指标标准分', '自杀意图指标标准分', '焦虑指标标准分', '抑郁指标标准分', '偏执指标标准分', '自卑指标标准分',
       '敏感指标标准分', '社交恐惧指标标准分', '躯体化指标标准分', '依赖指标标准分', '敌对攻击指标标准分', '冲动指标标准分',
       '强迫指标标准分', '网络成瘾指标标准分', '自伤行为指标标准分', '进食问题指标标准分', '睡眠困扰指标标准分',
       '学校适应困难指标标准分', '人际关系困扰指标标准分', '学业压力指标标准分', '就业压力指标标准分', '恋爱困扰指标标准分',
       '可能问题', 'text'],
      dtype='object')
(173, 24)


In [10]:
# Create Function Transformer to use Feature Union
def get_numeric_data(x):
    return np.array(x.iloc[:, 0:-2])


def get_text_data(x):
    return x['text'].tolist()

In [11]:
def metadata_svm_fu():

    y_train = train['可能问题'].tolist()
    y_test = test_df['可能问题'].tolist()

    transformer_numeric = FunctionTransformer(get_numeric_data)
    transformer_text = FunctionTransformer(get_text_data)

    # Create a pipeline to concatenate Tfidf Vector and Numeric data
    # Use SVM as classifier
    pipeline = Pipeline([
        ('metadata', FeatureUnion([
            ('numeric_feature', Pipeline([
                ('selector', transformer_numeric)
            ])),
            ('text_features', Pipeline([
                ('selector', transformer_text),
                ('vec', TfidfVectorizer())
            ]))
        ])),
        ('clf', SGDClassifier())
    ])

    # Grid Search Parameters for SGDClassifer
    parameters = {
        'clf__alpha': (1e-4, 1e-6),
        'metadata__text_features__vec__ngram_range': [(1, 2), (1, 3)],
        'metadata__text_features__vec__use_idf': [True, False]
    }

    # Training config
    kfold = StratifiedKFold(n_splits=5)
    scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}
    refit = 'F1'

    gs_clf = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1, cv=kfold, scoring=scoring, refit=refit)
    gs_clf.fit(train, y_train)

    predicted = gs_clf.predict(test_df)

    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    print("precision: ", str(metrics.precision_score(y_test, predicted, average='macro')))
    print("accuracy: ", str(metrics.accuracy_score(y_test, predicted)))
    print("F1 score: ", str(metrics.f1_score(y_test, predicted, average='macro')))
    print("recall: ", str(metrics.recall_score(y_test, predicted, average='macro')))

In [12]:
metadata_svm_fu()

Fitting 5 folds for each of 8 candidates, totalling 40 fits
              precision    recall  f1-score   support

           0       0.80      0.77      0.79        31
           1       0.36      0.24      0.29        17
           2       0.14      0.25      0.18         4
           3       0.17      0.50      0.25         2

    accuracy                           0.56        54
   macro avg       0.37      0.44      0.38        54
weighted avg       0.59      0.56      0.56        54

[[24  4  3  0]
 [ 6  4  2  5]
 [ 0  3  1  0]
 [ 0  0  1  1]]
precision:  0.3682900432900433
accuracy:  0.5555555555555556
F1 score:  0.3761044283585267
recall:  0.4398719165085389


In [13]:
def standard_svm():

    X_train = train['text'].tolist()
    X_test = test_df['text'].tolist()
    y_train = train['可能问题'].tolist()
    y_test = test_df['可能问题'].tolist()

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier()),
                         ])
    parameters = {
        'vect__ngram_range': [(1, 2), (1, 3)],
        'clf__alpha': (1e-4, 1e-6)
    }

    # Training config
    kfold = StratifiedKFold(n_splits=5)
    scoring = {'Accuracy': 'accuracy', 'F1': 'f1_macro'}
    refit = 'F1'

    gs_clf = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1, cv=kfold, scoring=scoring, refit=refit)
    gs_clf.fit(X_train, y_train)

    predicted = gs_clf.predict(X_test)

    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))
    print("precision: ", str(metrics.precision_score(y_test, predicted, average='macro')))
    print("accuracy: ", str(metrics.accuracy_score(y_test, predicted)))
    print("F1 score: ", str(metrics.f1_score(y_test, predicted, average='macro')))
    print("recall: ", str(metrics.recall_score(y_test, predicted, average='macro')))

In [14]:
standard_svm()

Fitting 5 folds for each of 4 candidates, totalling 20 fits
              precision    recall  f1-score   support

           0       0.57      1.00      0.73        31
           1       0.00      0.00      0.00        17
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         2

    accuracy                           0.57        54
   macro avg       0.14      0.25      0.18        54
weighted avg       0.33      0.57      0.42        54

[[31  0  0  0]
 [17  0  0  0]
 [ 4  0  0  0]
 [ 2  0  0  0]]
precision:  0.14351851851851852
accuracy:  0.5740740740740741
F1 score:  0.1823529411764706
recall:  0.25


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
