## 1.导入数据包

In [1]:
import pandas as pd
import jieba
import time
from sklearn import svm
from sklearn.externals import joblib
from numpy import * # 要用到delete操作
import numpy as np
from collections import defaultdict,Counter
import math
from gensim import corpora,models
from svmutil import *
from svm import *



## 2.读取数据集并对数据进行预处理

### 1).读取数据

In [2]:
unlabeled_data = pd.read_csv('unlabel_data.txt',names=['lines'],sep='\t',encoding='utf-8')
unlabeled_data['label'] = -1
labeled_data = pd.read_csv('label_data.txt',names=['label','lines'],sep='\t',encoding='utf-8')
frame = [labeled_data,unlabeled_data]
sum_data = pd.concat(frame,axis = 0)
sum_data = sum_data.dropna()
sum_data = sum_data.reset_index(drop = True)

### 2).对数据进行预处理

In [7]:
# 去掉文本中的空格
def process(our_data):
    m1 = map(lambda s: s.replace(' ', ''), our_data)
    return list(m1)


# 让文本只保留汉字
def is_chinese(uchar):
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False

def format_str(content):
    content_str = ''
    for i in content:
        if is_chinese(i):
            content_str = content_str + ｉ
    return content_str


# 对文本进行jieba分词
def fenci(datas):
    cut_words = map(lambda s: list(jieba.cut(s)), datas)
    return list(cut_words)


# 去掉文本中的停用词
def drop_stopwords(contents, stopwords):
    contents_clean = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
        contents_clean.append(line_clean)
    return contents_clean

In [8]:
# 读取停用词表
stopwords = pd.read_csv('stopwords.txt', index_col=False, sep="\n", quoting=3, names=['stopword'], encoding='utf-8')

# 去掉文本中的空格
train_data = process(sum_data.lines.values)

# 让文本只保留汉字
chinese_list = []
for line in train_data:
    chinese_list.append(format_str(line))

# 对预处理好的文本进行分词
df_content = pd.DataFrame({'content_S': chinese_list, 'label': sum_data['label']})
content_s = fenci(df_content.content_S.values)
data_content = pd.DataFrame({'content': content_s})

# 去除停用词
contents = data_content.content.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean = drop_stopwords(contents, stopwords)

# 将处理好的文本做成列表格式
df_data = pd.DataFrame({'contents_clean': contents_clean, 'label': sum_data["label"]})
word_list = list(df_data.contents_clean.values)

# 将文本处理成tfidf可训练的格式
words = []
for line_index in range(len(word_list)):
    words.append(' '.join(word_list[line_index]))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zhangyh4\AppData\Local\Temp\jieba.cache
Loading model cost 0.707 seconds.
Prefix dict has been built succesfully.


In [9]:
# 训练tfidf向量可以看912new.ipynb
class TfIdf():
    def __init__(self):
        pass

    def GetTFIDF(self,corpus,IdfPath):
        # 新的一批文本
        """
        :param corpus: 需要计算的文本
        :param IdfPath: idf文件路径
        :return:
        """
        IDF = {}
        with open(IdfPath,"r",encoding="utf-8")as f2:
            for eachline in f2:
                IDF[eachline.strip().split("\t")[0]]=[int(eachline.strip().split("\t")[1]),float(eachline.strip().split("\t")[2])]
        TFIDF = []
        for eachline in corpus:
            eachlinetfidf = {}
            eachlinetfidf_l2 = {}
            l2sum = 0
            eachlinecounts = Counter(eachline)
            for key,value in eachlinecounts.items():
                if key in IDF:
                    eachlinetfidf[IDF[key][0]] = IDF[key][1] * value
                    l2sum += eachlinetfidf[IDF[key][0]] ** 2
            l2sqrt = math.sqrt(l2sum)
            for key in eachlinetfidf:
                eachlinetfidf_l2[key] = eachlinetfidf[key]/l2sqrt
            TFIDF.append(eachlinetfidf_l2)
        return TFIDF

In [10]:
# 由此可以得到文本的tfidf向量
corpus_ = [[token for token in text.split()] for text in words]
TF = TfIdf()

tfidf_list = []
gettfidf = TF.GetTFIDF(corpus=corpus_,IdfPath="idf.txt")
for each in gettfidf:
    tfidf_list.append(each)

In [11]:
# 所有涉及到的数据
# 用于初步训练的数据
x_train = tfidf_list[:300] # 已标注数据
x_test = tfidf_list[300:] # 未标注数据
y_train = list(sum_data[:300].label.values) # 已标注数据的标签

In [None]:
# 本质思想是不断地把满足条件的未标注数据加入到已标注数据中，训练模型。然后再拿更新的模型去预测未标注的数据，如此循环往复，让模型越来越强大,
# 直到最终可以让所有未标注数据满足条件
# 未标注的数据从头到尾都不减少，只是从里面取出来而已

j = 0    #  循环迭代次数
p = 0.98 # 预测概率

# 初始化样本集为空，这里的样本集主要是用来添加到训练数据中的，起的是中间过度作用
x_train_tmp = []
y_train_tmp = []

# 预测概率>p的样本集
x_train_tmp1 = []
y_train_tmp1 = []

while len(y_train_tmp) < len(x_test):
    print("第%d次迭代" % j)
    start = time.clock()
    mdhms = time.strftime('%d%H%M%S', time.localtime(time.time()))

    x_train.extend(x_train_tmp)
    y_train.extend(y_train_tmp)

    prob = svm_problem(y_train, x_train)  # 用来训练的数据
    param = svm_parameter('-t 2 -c 1 -b 1')  # 训练参数
    model = svm_train(prob, param)  # 训练模型
    svm_save_model('svm.model' + '_' + mdhms, model)  # 保存模型
    p_label, p_acc, p_val = svm_predict(list(range(len(x_test))), x_test, model, options='-b 1')  # 可以得到预测标签以及预测的类别概率

    # 做dataframe合并标签，tfidf值，以及分类概率
    y_test = pd.DataFrame(p_label, columns=['label'])  # y_test，预测标签。全程用不到，仅仅是为了保存下来，方便后续操作
    y_probability = pd.DataFrame(p_val)                # 预测概率
    ser = pd.Series(list(x_test))                      # 未标注数据
    unlabel_ser = pd.DataFrame(ser, columns=['tfidf'])
    # 合并便于处理
    df = pd.concat([unlabel_ser, y_probability, y_test], axis=1)

    for i in range(len(df[0])):
        if df[0][i] >= p or df[1][i] >= p:
            x_train_tmp1.append(df.tfidf.values[i])
            y_train_tmp1.append(df.label.values[i])

    # 如果本次预测概率>p的未标注样本集比上一次的小而且上一次的不可以是空，则让预测概率降低
    if len(y_train_tmp1) <= len(y_train_tmp) and len(y_train_tmp) != 0:
        p = p - 0.01
    else:
        y_train_tmp = y_train_tmp1
        x_train_tmp = x_train_tmp1
    
    print("满足预测概率>p的数据量:%d" % len(x_train_tmp1))
    
    # 要重置样本集为空，下次循环时要用
    # 如果不初始化为空会导致x_train重复加入数据
    # x_train_tmp（重点在这里）
    x_train_tmp1 = []
    y_train_tmp1 = []
    
    j += 1
    end = time.clock()
    print("预测概率为:%f" % p)
    print("Running time:%s Seconds" % (end - start))
    print("*" * 70)

第0次迭代
Accuracy = 0.01% (1/10000) (classification)
满足预测概率>p的数据量:760:
预测概率为0.980000:
Running time:0.8278438786177653 Seconds
**********************************************************************
第1次迭代
Accuracy = 0.01% (1/10000) (classification)
满足预测概率>p的数据量:1381:
预测概率为0.980000:
Running time:2.2959684308713695 Seconds
**********************************************************************
第2次迭代
Accuracy = 0.01% (1/10000) (classification)
满足预测概率>p的数据量:1833:
预测概率为0.980000:
Running time:10.393089850308076 Seconds
**********************************************************************
第3次迭代
Accuracy = 0.01% (1/10000) (classification)
满足预测概率>p的数据量:6907:
预测概率为0.980000:
Running time:30.3392316117232 Seconds
**********************************************************************
第4次迭代
Accuracy = 0.01% (1/10000) (classification)
满足预测概率>p的数据量:3357:
预测概率为0.970000:
Running time:71.59408775095557 Seconds
**********************************************************************
第5次迭代
Accuracy = 0.01% (1/100