In [1]:
import pandas as pd
import numpy as np
from collections import Counter

classify_data = pd.read_csv('data/rumor_classification_data_all.csv')
Counter(classify_data['label'])

Counter({1: 1397, 0: 3513})

In [5]:
# 随机过采样
from imblearn.over_sampling import RandomOverSampler as ros

ros = ros(random_state = 20191109)
X_resample, y_resmaple = ros.fit_resample(classify_data.iloc[:,:-1], classify_data.iloc[:,-1])
Counter(y_resmaple)

Counter({1: 3513, 0: 3513})

In [4]:
data_res = pd.DataFrame(X_resample, columns=['text'])
data_res['label'] = y_resmaple
data_res.to_csv('data/resample_data.csv', index=False)
data_res.shape

(7026, 2)

In [6]:
import re
import jieba

def seg_sentence(s):
    sent = ''
    s = s.strip()  #去前后的空格
    s = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;；:-【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+", " ", s) #去标点符号
    words = jieba.cut(s)
    sent = ' '.join(words)
    return sent
data_res['text'] = data_res['text'].apply(seg_sentence)
data_res.head()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ChuAI\AppData\Local\Temp\jieba.cache
Loading model cost 0.773 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,text,label
0,人心 可谓 啊 那群 可怜 的 孩子 难道 电视台 报社 和 杂志社 的 人 都...,1
1,深圳 的 朋友 转 一下 朋友 早上 在 深圳 会展中心 捡 到 一个 钱包 里...,1
2,寻人启事 有 线索 酬金 万 帮忙 扩散 今天上午 三岁 多 小女孩 在 洛...,1
3,朋友 在 金砂 乡 东门 步行街 捡 到 一个 钱包 里面 有 身份证 徐俊 峰 的 ...,1
4,昨天 邵逸夫 先生 出殡 了 邵逸夫 去世 了 却 没有 看见 有人 悼念 我们...,1


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(data_res['text']).toarray()
y = y_resmaple
features = tfidf.get_feature_names()
TFIDF = pd.DataFrame(X, columns=features)
TFIDF['label'] = y_resmaple
TFIDF.to_csv('data/TFIDF.csv', index=False)
TFIDF.head()

Unnamed: 0,一一,一万,一万元,一万块,一万多,一万多元,一万头,一三一素,一下,一下张,...,龙津,龙港,龙潭,龙琦,龙脉,龙裔,龙须,龙鳗,龚太宏,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0572,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108999,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


# 卡方检验和互信息

In [6]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif

sk_chi2 = SelectKBest(chi2, k=100)
X_new = sk_chi2.fit_transform(X, y)

sk_info = SelectKBest(mutual_info_classif, k=100)
X_new = sk_info.fit_transform(X, y)

word_importance = pd.DataFrame(sk_chi2.scores_, index = features, columns=['chi2'])
word_importance['mutual_info_classif'] = sk_info.scores_
word_importance.head()

Unnamed: 0,chi2,mutual_info_classif
一一,0.208995,0.0
一万,2.045243,0.001019
一万元,0.349494,0.006778
一万块,0.698867,0.004225
一万多,0.158425,0.0


# AdaBoost & ET & SVM & logisticRegression 

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection

ada_est =AdaBoostClassifier(random_state=0)
ada_param_grid = {'n_estimators': [100], 'learning_rate': [0.01, 0.1]}
ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, cv=5, verbose=1)
ada_grid.fit(X, y)
word_importance['ada'] = ada_grid.best_estimator_.feature_importances_

et_est = ExtraTreesClassifier(random_state=0)
et_param_grid = {'n_estimators': [100], 'min_samples_split': [3, 4], 'max_depth': [20]}
et_grid = model_selection.GridSearchCV(et_est, et_param_grid, cv=5, verbose=1)
et_grid.fit(X, y)
word_importance['et'] = et_grid.best_estimator_.feature_importances_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 30.5min finished


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.4min finished


In [15]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state = 20191109)
classify_data_gen = kf.split(X,y)
print(classify_data_gen)

<generator object _BaseKFold.split at 0x00000243DA7432A0>


In [16]:
from sklearn.metrics import accuracy_score, classification_report, recall_score, f1_score
from sklearn.svm import SVC

i = 1
for train_idx, test_idx in classify_data_gen:
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    y_test = y[test_idx]
    svm = SVC(kernel='linear', C=0.1, random_state=20191109)
    svm.fit(X_train, y_train)
    y_predTrain = svm.predict(X_train)
    y_predTest = svm.predict(X_test)
    name = 'svm_K' + str(i)
    word_importance[name] = svm.coef_[0]
    print(accuracy_score(y_train, y_predTrain))
    print(accuracy_score(y_test, y_predTest))
    i += 1

0.7912811387900356
0.7510668563300142
0.7927415050702722
0.7330960854092526
0.8164027753068849
0.7729537366548043
0.7900729407578723
0.7430604982206406
0.788827610745419
0.7444839857651245


In [20]:
classify_data_gen_ = kf.split(X,y)

In [21]:
from sklearn.linear_model import LogisticRegression

i = 1
for train_idx, test_idx in classify_data_gen_:
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    y_test = y[test_idx]
    lr = LogisticRegression(solver='lbfgs', random_state=20191109)
    lr.fit(X_train, y_train)
    y_predTrain = lr.predict(X_train)
    y_predTest = lr.predict(X_test)
    name = 'lr_K' + str(i)
    word_importance[name] = lr.coef_[0]
    print(accuracy_score(y_train, y_predTrain))
    print(accuracy_score(y_test, y_predTest))
    i += 1

0.8793594306049822
0.813655761024182
0.8745774773172034
0.8078291814946619
0.8809820316669632
0.805693950177936
0.87439957302971
0.8185053380782918
0.8722647215797901
0.8170818505338078


In [22]:
word_importance

Unnamed: 0,chi2,mutual_info_classif,ada,et,svm_K1,svm_K2,svm_K3,svm_K4,svm_K5,lr_K1,lr_K2,lr_K3,lr_K4,lr_K5
一一,0.208995,0.000000,0.00,0.000000e+00,-0.020900,-0.020900,0.000000,-0.020900,-0.020900,-0.081510,-0.080909,0.000000,-0.087646,-0.083537
一万,2.045243,0.001019,0.00,5.509131e-04,0.214471,0.170983,0.212900,0.233737,0.218384,0.368991,0.288750,0.406595,0.446238,0.363076
一万元,0.349494,0.006778,0.00,9.009835e-06,0.091493,0.036110,0.077485,0.036433,0.124452,0.197671,0.060267,0.139971,0.072177,0.391100
一万块,0.698867,0.004225,0.00,0.000000e+00,0.052324,0.069887,0.052324,0.052506,0.052506,0.185331,0.214824,0.185927,0.166731,0.180322
一万多,0.158425,0.000000,0.00,0.000000e+00,-0.015843,0.000000,-0.015843,-0.015843,-0.015843,-0.040881,0.000000,-0.041339,-0.043804,-0.043762
一万多元,0.291813,0.006620,0.00,9.906511e-07,0.051167,0.050228,0.033054,0.033232,0.033232,0.181234,0.091112,0.068567,0.040479,0.067541
一万头,0.000553,0.005263,0.00,0.000000e+00,0.050763,0.002342,-0.048421,0.002342,0.002342,0.200217,0.029250,-0.165316,0.030445,0.039127
一三一素,0.925324,0.000100,0.00,5.813590e-08,-0.046072,-0.044785,-0.047933,-0.034122,-0.042478,-0.074405,-0.079323,-0.044509,-0.058743,-0.056678
一下,0.249098,0.045877,0.01,2.161760e-03,0.331569,0.566090,0.403325,0.559378,0.569148,0.778742,1.316866,1.108401,1.246098,0.972006
一下张,0.219639,0.000000,0.00,0.000000e+00,0.000000,-0.021964,-0.021964,-0.021964,-0.021964,0.000000,-0.086309,-0.090167,-0.092672,-0.076788


In [23]:
word_importance.to_csv('data/word_importance.csv')

In [26]:
data = pd.DataFrame(X_resample, columns=['text'])
data.head()

Unnamed: 0,text
0,人心 可谓 啊 那群 可怜 的 孩子 难道 电视台 报社 和 杂志社 的 人 都...
1,深圳 的 朋友 转 一下 朋友 早上 在 深圳 会展中心 捡 到 一个 钱包 里...
2,寻人启事 有 线索 酬金 万 帮忙 扩散 今天上午 三岁 多 小女孩 在 洛...
3,朋友 在 金砂 乡 东门 步行街 捡 到 一个 钱包 里面 有 身份证 徐俊 峰 的 ...
4,昨天 邵逸夫 先生 出殡 了 邵逸夫 去世 了 却 没有 看见 有人 悼念 我们...
