In [1]:
# coding:utf-8
import gensim
import numpy as np
from sklearn.externals import joblib


class Anti_SEO_Solver:
    """Detect keyword-stuffed ("SEO-optimised") job titles.

    Two detectors are provided and can be combined:

    * LR: the title is tokenised, the word2vec vectors of its known tokens are
      summed, and a pre-trained classifier predicts 1 (SEO) or 0 (normal).
    * CM (category match): tokens are looked up in a ``word -> label`` dictionary
      and the title is flagged when the resulting score reaches a threshold.
    """

    # Fallback embedding width, used when the loaded vectors do not expose
    # ``vector_size``. 150 is the width this class historically hard-coded.
    vector_size = 150

    def __init__(self, cut_engine, word2vec_path, LR_model_path, user_dict_path, stopwords_path,
                 positions_labeled_path):
        """Load every resource the detectors need.

        Args:
            cut_engine: tokenizer object exposing ``load_userdict(path)`` and
                ``cut(text)`` (the notebook passes the ``jieba`` module).
            word2vec_path: path of a saved gensim ``Word2Vec`` model.
            LR_model_path: path of the joblib-serialised classifier.
            user_dict_path: custom dictionary handed to ``cut_engine.load_userdict``.
            stopwords_path: UTF-8 text file with one stop word per line.
            positions_labeled_path: UTF-8 text file whose lines are
                ``<word> <integer label>`` separated by a single space.

        Raises:
            IndexError / ValueError: if a non-blank line of the label file does
                not contain a word followed by an integer.
        """
        # Load the word2vec model; only its keyed vectors are kept.
        model = gensim.models.Word2Vec.load(word2vec_path)
        self.wv = model.wv
        # Take the embedding width from the vectors instead of hard-coding it;
        # fall back to the class default when the attribute is missing or None.
        self.vector_size = getattr(self.wv, "vector_size", None) or self.vector_size
        # Load the classifier.
        # NOTE(security): joblib.load unpickles the file and can execute arbitrary
        # code, so only load model files from a trusted source.
        self.classifier = joblib.load(LR_model_path)
        # Register the custom dictionary with the tokenizer (this mutates cut_engine).
        cut_engine.load_userdict(user_dict_path)
        self.cut_engine = cut_engine
        # Load the stop words into a set so each membership test is O(1).
        with open(stopwords_path, 'r', encoding='utf-8') as stop_file:
            self.stopwords = {line.strip() for line in stop_file}
        # Load the word -> label dictionary; later duplicates override earlier ones.
        positions_labeled_dict = {}
        with open(positions_labeled_path, 'r', encoding='utf-8') as label_file:
            for line in label_file:
                stripped = line.strip()
                if not stripped:
                    # A blank (e.g. trailing) line carries no entry.
                    continue
                fields = stripped.split(" ")
                positions_labeled_dict[fields[0]] = int(fields[1])
        self.positions_labeled_dict = positions_labeled_dict

    def seg_sentence(self, sen):
        """Tokenise ``sen`` (lower-cased and stripped) and drop stop words.

        Returns:
            list of the remaining tokens, in their original order.
        """
        tokens = self.cut_engine.cut(sen.lower().strip())
        return [word for word in tokens if word not in self.stopwords]

    def sentence2vec(self, sen):
        """Return the sum of the word vectors of the tokens of ``sen``.

        Tokens missing from the vocabulary are ignored, so a title with no
        known token yields an all-zero vector.

        Returns:
            1-D ``float32`` array of length ``self.vector_size``.
        """
        res = np.zeros([self.vector_size], dtype=np.float32)
        for word in self.seg_sentence(sen):
            if word in self.wv:
                res += self.wv[word]
        return res

    def predict(self, sen):
        """Classify a single title with the loaded classifier.

        Returns:
            the first element of ``classifier.predict`` for the title's vector.
        """
        # The classifier expects a 2-D batch, here a batch of one row.
        sen_vec = self.sentence2vec(sen).reshape(1, -1)
        return self.classifier.predict(sen_vec)[0]

    def _split_file_by_label(self, inputFile, outputFile1, outputFile0, classify):
        """Route each non-blank line of ``inputFile`` to one of two output files.

        Every line is decoded as UTF-8, lower-cased and stripped, then passed to
        ``classify``. Lines labelled 1 are written to ``outputFile1`` and lines
        labelled 0 to ``outputFile0``; any other label is written nowhere.
        Blank lines are skipped rather than ending the run, and all three files
        are closed even when an error is raised.
        """
        with open(outputFile1, "w", encoding="utf-8") as fout1, \
                open(outputFile0, "w", encoding="utf-8") as fout0, \
                open(inputFile, "rb") as fin:
            for raw_line in fin:
                line = raw_line.decode('utf-8').lower().strip()
                if not line:
                    continue
                res = classify(line)
                if res == 1:
                    fout1.write(line)
                    fout1.write("\n")
                elif res == 0:
                    fout0.write(line)
                    fout0.write("\n")

    def solveSingleFile_LR(self, inputFile, outputFile1, outputFile0):
        """Classify every title of a file with the LR detector.

        Args:
            inputFile: UTF-8 file with one title per line.
            outputFile1: receives the titles predicted as 1.
            outputFile0: receives the titles predicted as 0.
        """
        self._split_file_by_label(inputFile, outputFile1, outputFile0, self.predict)

    def getScore(self, sen_list, use_SEO_algorithm):
        """Score a token list against the label dictionary.

        A larger score means the title is more likely to be SEO-optimised.

        Args:
            sen_list: iterable of tokens.
            use_SEO_algorithm: when truthy, count the *distinct labels* hit by
                the tokens; otherwise count every token found in the dictionary.

        Returns:
            the integer score.
        """
        known = [word for word in sen_list if word in self.positions_labeled_dict]
        if use_SEO_algorithm:
            return len({self.positions_labeled_dict[word] for word in known})
        return len(known)

    def cat_match(self, sen, threshold=4, use_SEO_algorithm=True):
        """Category-match detector.

        Returns:
            1 when the score of ``sen`` is at least ``threshold``, else 0.
        """
        score = self.getScore(self.seg_sentence(sen), use_SEO_algorithm)
        return 1 if score >= threshold else 0

    def solveSingleFile_CM(self, inputFile, outputFile1, outputFile0, threshold=4, use_SEO_algorithm=True):
        """Classify every title of a file with the category-match detector.

        Args:
            inputFile: UTF-8 file with one title per line.
            outputFile1: receives the titles flagged as 1.
            outputFile0: receives the titles flagged as 0.
            threshold: minimum score for a title to be flagged.
            use_SEO_algorithm: scoring mode, see ``getScore``.
        """
        self._split_file_by_label(
            inputFile, outputFile1, outputFile0,
            lambda line: self.cat_match(line, threshold, use_SEO_algorithm))

    def LR_CM_predict(self, sen, threshold=4, use_SEO_algorithm=True):
        """Combine both detectors.

        Returns:
            1 only when the LR detector and the category-match detector both
            return 1, else 0.
        """
        res1 = self.predict(sen)
        res2 = self.cat_match(sen, threshold, use_SEO_algorithm)
        return 1 if (res1 == 1 and res2 == 1) else 0



In [2]:
# Load the persisted logistic-regression classifier on its own so that its fitted
# parameters can be inspected in the cells below.
# NOTE(review): joblib.load unpickles the file — only load model files from a trusted source.
LR_model_path = "./LR_model/lr_model_final2.m"
classifier = joblib.load(LR_model_path)

In [32]:
# Display the fitted intercept (bias term) of the loaded classifier.
classifier.intercept_

array([-16.19408304])

In [18]:
# Print the first row of the classifier's coefficients, one weight per line.
first_row_weights = classifier.coef_.tolist()[0]
for weight in first_row_weights:
    print(weight)

0.18914294164127704
-0.3540008685139628
0.1403591918541356
-0.4448947789847297
0.2016996988017686
-0.2785757331543525
0.24185150767915917
0.4528646159572033
0.2763148415193548
-0.1248930121543194
-0.5033307121625525
0.19105251903648385
-0.06340011237558718
0.39505040027007843
-0.17258503207182455
0.09752787173887507
0.2132629886716583
-0.3270814328225123
0.46050004543852546
0.11493430214883116
0.11431854597897452
-0.2669968051867129
-0.3789089665133073
-0.09482107432244062
-0.033408071559430805
0.08816457924743942
0.42801362590259723
0.09803807519556629
-0.03990949451879792
0.08138282622788894
-0.16293215718829598
-0.17871011331072148
-0.1954268383152429
-0.0832857324814147
-0.3006536023603757
-0.27977982936608237
0.24314156940960785
-0.15727046569501832
-0.010505341932125486
0.10588844068328154
0.5840723590802936
0.3235768436072992
0.028001652212946464
-0.39241236799888163
-0.2912241105760716
0.5964668878085455
0.44834118298999737
-0.15908220779853288
-0.12153539614383305
-0.218607672

In [20]:
# Build the solver, using the jieba module as the tokenizer (cut engine).
import jieba
# Resource locations, all relative to the notebook's working directory.
word2vec_path = "./model/job_title_word2vec.model"
LR_model_path = "./LR_model/lr_model_final2.m"
user_dict_path = "./my_dict/zhilian.dict"
stopwords_path = "./stopwords/stopwords.txt"
positions_labeled_path = "./positions_kmeans_result/positions_labeled_115_0928.txt"
# The constructor loads every file above and registers the user dictionary
# with jieba via cut_engine.load_userdict.
anti_seo_solver = Anti_SEO_Solver(jieba, word2vec_path, LR_model_path, user_dict_path, stopwords_path, positions_labeled_path)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 4.395 seconds.
Prefix dict has been built succesfully.


In [34]:
# Classify every title of the test file with the LR detector alone: titles predicted
# as 1 are written to outputFile1 and titles predicted as 0 to outputFile0.
inputFile = "./data/titles_test_1.txt"
outputFile1 = "./data/titles_test_1_1.txt"
outputFile0 = "./data/titles_test_1_0.txt"
anti_seo_solver.solveSingleFile_LR(inputFile, outputFile1, outputFile0)

In [3]:
# Spot-check one title with the combined LR + category-match rule
# (defaults: threshold=4, use_SEO_algorithm=True); 1 means both detectors flagged it.
anti_seo_solver.LR_CM_predict("物业保安班长/保安队长/安保主管 ")

1