In [1]:
import random
import pandas as pd
import jieba
import time
from sklearn import svm
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib
from numpy import * # 要用到delete操作
import numpy as np
from collections import defaultdict,Counter
import math
from gensim import corpora,models
from scipy.sparse import csr_matrix
import pickle as pkl
# 不打印警告信息
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Load the data.
# Two single-column text files form the unlabeled pool; every row receives the
# placeholder label -1.
data = pd.read_csv('xu_labeled.txt', names=['lines'], sep='\t', encoding='utf-8')
data1 = pd.read_csv('wuxu_labeled.txt', names=['lines'], sep='\t', encoding='utf-8')
frames = [data, data1]
unlabeled_data = pd.concat(frames, axis=0).assign(label=-1)

# The labeled file holds "label<TAB>text" rows.
labeled_data = pd.read_csv('labeled_data.txt', names=['label', 'lines'], sep='\t', encoding='utf-8')

# Labeled rows are stacked first, then the unlabeled ones; rows containing NaN
# are dropped and the index is renumbered from 0.
frame = [labeled_data, unlabeled_data]
sum_data = pd.concat(frame, axis=0).dropna().reset_index(drop=True)

In [3]:
# Remove the spaces from every text.
def process(our_data):
    """Return a list holding each string of ``our_data`` with its ' ' characters removed."""
    return [text.replace(' ', '') for text in our_data]


# Keep only Chinese characters in a text.
def is_chinese(uchar):
    """Return True when ``uchar`` compares within the range U+4E00..U+9FA5, else False."""
    return u'\u4e00' <= uchar <= u'\u9fa5'


def format_str(content):
    """Return ``content`` reduced to the characters accepted by ``is_chinese``.

    Parameters
    ----------
    content : iterable of str
        The text to filter; it is iterated character by character.

    Returns
    -------
    str
        The accepted characters, in their original order.
    """
    # The previous loop body appended the full-width letter U+FF49 ("ｉ"), which
    # only resolved to the loop variable through Python 3's NFKC identifier
    # normalisation. A plain ASCII name and a single join avoid both that trap
    # and the repeated string concatenation.
    return ''.join(char for char in content if is_chinese(char))


# Segment the texts with jieba.
def fenci(datas):
    """Return one token list per string in ``datas``, produced by ``jieba.cut``."""
    return [list(jieba.cut(sentence)) for sentence in datas]


# Remove the stop-words from the tokenised texts.
def drop_stopwords(contents, stopwords):
    """Return a copy of ``contents`` with every token found in ``stopwords`` removed.

    Parameters
    ----------
    contents : iterable of iterable of str
        Tokenised documents, one token sequence per document.
    stopwords : iterable of str
        Tokens to discard.

    Returns
    -------
    list of list of str
        One list per input document (possibly empty), with token order preserved.
    """
    # Build the lookup once: testing membership against a list rescans the whole
    # stop-word list for every token of every document.
    stopword_set = set(stopwords)
    return [
        [word for word in line if word not in stopword_set]
        for line in contents
    ]

In [4]:
# Read the stop-word table (one entry per line; quoting=3 switches quote handling off).
stopwords = pd.read_csv('stopwords.txt', index_col=False, sep="\n", quoting=3, names=['stopword'], encoding='utf-8')

# Pre-process the data: strip spaces, then keep only Chinese characters.
train_data = process(sum_data.lines.values)
chinese_list = [format_str(line) for line in train_data]

# Segment the cleaned texts into token lists.
df_content = pd.DataFrame({'content_S': chinese_list, 'label': sum_data['label']})
content_s = fenci(df_content['content_S'].values)
data_content = pd.DataFrame({'content': content_s})

# Filter the stop-words out of every token list.
# From here on `stopwords` is a plain list instead of the DataFrame read above.
contents = data_content['content'].values.tolist()
stopwords = stopwords['stopword'].values.tolist()
contents_clean = drop_stopwords(contents, stopwords)

df_data = pd.DataFrame({'contents_clean': contents_clean, 'label': sum_data["label"]})
word_list = list(df_data['contents_clean'].values)

# Turn the token lists into the format used for TF-IDF training:
# one space-separated string per document.
words = [' '.join(tokens) for tokens in word_list]

# Re-tokenise with split() (no argument). The previous split(' ') turned an empty
# document into [''], which registered a bogus empty-string token in the
# dictionary and disagreed with the split() used when vectorising.
word_list = [document.split() for document in words]

dictionary = corpora.Dictionary(word_list)
new_corpus = [dictionary.doc2bow(text) for text in word_list]
tfidf = models.TfidfModel(new_corpus)

# TF-IDF weights per document. new_corpus already holds the bag-of-words of
# exactly these token lists, so it is reused instead of being recomputed.
tfidf_vec = [tfidf[bow] for bow in new_corpus]

lsi_model = models.LsiModel(corpus=tfidf_vec, id2word=dictionary, num_topics=30)

# Project the TF-IDF vectors, i.e. the same representation the LSI model was
# trained on. The previous code projected raw bag-of-words counts, so training
# and inference inputs were on different scales.
lsi_vec = [lsi_model[doc_tfidf] for doc_tfidf in tfidf_vec]


# Collect the (row, column, value) triplets of the LSI vectors.
# NOTE: `data` rebinds the name used for a DataFrame in the loading cell.
data = []
rows = []
cols = []
for row_index, line in enumerate(lsi_vec):
    for topic_id, weight in line:
        rows.append(row_index)
        cols.append(topic_id)
        data.append(weight)
line_count = len(lsi_vec)

# Pass the shape explicitly. Without it csr_matrix infers the dimensions from the
# largest indices seen, so documents with an empty LSI vector at the end of the
# corpus (or a topic that never appears) would silently shrink the matrix and
# break the row-to-document correspondence relied on when slicing it later.
lsi_sparse_matrix = csr_matrix((data, (rows, cols)), shape=(line_count, lsi_model.num_topics))  # sparse vectors
lsi_matrix = lsi_sparse_matrix.toarray()  # dense vectors


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zhangyh4\AppData\Local\Temp\jieba.cache
Loading model cost 0.731 seconds.
Prefix dict has been built succesfully.


In [7]:
# labeled_data is concatenated first into sum_data and dropna() keeps row order,
# so the labeled rows are the first len(labeled_data.dropna()) rows. Deriving the
# split point replaces the hard-coded 29748, which would silently mis-split the
# data if the input files changed.
n_labeled = len(labeled_data.dropna())

x_train = list(lsi_matrix[:n_labeled])  # features of the labeled rows
x_test = list(lsi_matrix[n_labeled:])  # features of the unlabeled rows
x_test_ = x_test  # NOTE: same list object as x_test, not a copy
y_train = list(sum_data[:n_labeled].label.values)  # labels of the labeled rows
y_test = list(sum_data[n_labeled:].label.values)  # placeholder labels of the unlabeled rows
x_test_content = list(sum_data[n_labeled:].lines.values)  # raw text of the unlabeled rows

In [10]:
# Split the labeled set into three contiguous, non-overlapping partitions, one
# per base classifier. The previous slices ([:9916], [9917:19832], [19833:])
# skipped the samples at indices 9916 and 19832. The cut points are derived from
# the data size; for 29748 samples they are 9916 and 19832, as before.
n_labeled_samples = len(x_train)
first_cut = n_labeled_samples // 3
second_cut = 2 * n_labeled_samples // 3

# Slicing a list copies it, so later appends to a partition do not touch x_train.
x_train1 = x_train[:first_cut]
y_train1 = y_train[:first_cut]

x_train2 = x_train[first_cut:second_cut]
y_train2 = y_train[first_cut:second_cut]

x_train3 = x_train[second_cut:]
y_train3 = y_train[second_cut:]

In [11]:
# Train a model.
def train_model(x, y, choose_model):
    """Fit the classifier named by ``choose_model``, dump it with joblib and return it.

    Parameters
    ----------
    x : array-like
        Feature matrix handed to the estimator's ``fit``.
    y : array-like
        Target labels handed to ``fit``.
    choose_model : str
        ``'svm'``, ``'xgboost'`` or ``'bayes'``.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``choose_model`` is not one of the supported names. Previously the
        function fell through and returned ``None``, which only failed later at
        the first ``predict`` call.
    """
    # Lambdas defer construction until the requested name has been validated.
    factories = {
        'svm': lambda: svm.SVC(probability=True),
        'xgboost': lambda: XGBClassifier(),
        # GaussianNB instead of MultinomialNB: the latter cannot be used once the
        # feature vectors contain negative values.
        'bayes': lambda: GaussianNB(),
    }
    if choose_model not in factories:
        raise ValueError(
            "unknown choose_model {!r}; expected one of {}".format(choose_model, sorted(factories))
        )

    clf = factories[choose_model]()
    clf.fit(x, y)

    # The file name carries a day/hour/minute stamp, so models trained within the
    # same minute overwrite each other.
    # NOTE(review): the output directory is a hard-coded absolute local path.
    mdhms = time.strftime('%d%H%M')
    file = r'C:\Users\zhangyh4\Desktop\xietong1012\model\{}.joblib'.format(choose_model) + '_' + mdhms
    joblib.dump(clf, file)

    # Return the fitted estimator directly; reloading the file that was just
    # written only added I/O.
    return clf

In [12]:
# Fit one base classifier on each of the three labeled partitions.
svm_model = train_model(x=np.array(x_train1), y=y_train1, choose_model='svm')
xgboost_model = train_model(x=np.array(x_train2), y=y_train2, choose_model='xgboost')
bayes_model = train_model(x=np.array(x_train3), y=y_train3, choose_model='bayes')

In [None]:

# Pseudo-label the unlabeled samples by majority vote of the three classifiers.
# When exactly one classifier disagrees, the sample is added to that classifier's
# training partition with the majority label and the classifier is retrained
# (one full refit per disagreement, which is expensive). Every sample on which at
# least two classifiers agree is also appended to x_train / y_train.
for i in range(len(x_test)):
    # predict() is given an explicit 2-D array holding this single sample.
    sample = np.asarray(x_test[i]).reshape(1, -1)
    y_svm_pred = svm_model.predict(sample)
    y_xgboost_pred = xgboost_model.predict(sample)
    y_bayes_pred = bayes_model.predict(sample)
    svm_label = y_svm_pred[0]
    xgboost_label = y_xgboost_pred[0]
    bayes_label = y_bayes_pred[0]

    if svm_label == xgboost_label and svm_label == bayes_label:
        # Unanimous: nothing to retrain.
        majority_label = svm_label

    elif svm_label == xgboost_label:
        # Naive Bayes is the dissenter.
        majority_label = svm_label
        x_train3.append(x_test[i])
        y_train3.append(majority_label)
        bayes_model = train_model(np.array(x_train3), y_train3, 'bayes')

    elif svm_label == bayes_label:
        # XGBoost is the dissenter.
        majority_label = svm_label
        x_train2.append(x_test[i])
        y_train2.append(majority_label)
        xgboost_model = train_model(np.array(x_train2), y_train2, 'xgboost')

    elif xgboost_label == bayes_label:
        # The SVM is the dissenter. It must be taught the label the other two
        # agree on; the previous code appended the SVM's own (minority)
        # prediction to both y_train1 and y_train.
        majority_label = xgboost_label
        x_train1.append(x_test[i])
        y_train1.append(majority_label)
        svm_model = train_model(np.array(x_train1), y_train1, 'svm')

    else:
        # All three disagree: no majority, so the sample is left unlabeled.
        continue

    x_train.append(x_test[i])
    y_train.append(majority_label)
            
        