In [1]:
#coding:utf-8
import jieba

# 创建停用词list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

# 对句子进行分词
def seg_sentence(sentence, cut_engine, stopwords):
    sentence_seged = cut_engine.cut(sentence.strip())
    seg_list = []
    for word in sentence_seged:
        if word not in stopwords:
            seg_list.append(word)
    return seg_list

# 得分函数，分数越大，属于标题SEO优化越大
def getScore(sen_list, positions_labeled_dict, use_SEO_algorithm):
    if(use_SEO_algorithm):        
        tmp = set()
        for word in sen_list:
            if word in positions_labeled_dict:
                tmp.add(positions_labeled_dict[word])
        return len(tmp)
    else:
        count = 0
        for word in sen_list:
            if word in positions_labeled_dict:
                count += 1
        return count

# 如果是SEO标题返回1，否则返回0
def anti_SEO_algorithm(sentence, threshold, cut_engine, stopwords, positions_labeled_dict, use_SEO_algorithm=True):
    # 分词
    sen_lis = seg_sentence(sentence.lower(), cut_engine, stopwords)
    score = getScore(sen_lis, positions_labeled_dict, use_SEO_algorithm)
    if(score >= threshold):
        return 1
    else:
        return 0
    
def combination_strategy_SEO_title_recognition(sentence, threshold1, threshold2, cut_engine, stopwords, positions_labeled_dict):
    # 分词
    sen_list = seg_sentence(sentence.lower(), cut_engine, stopwords)
    tmp = set()
    count2 = 0
    for word in sen_list:
        if word in positions_labeled_dict:
            count2 += 1
            tmp.add(positions_labeled_dict[word])
    count1 = len(tmp)
    if(count1 >= threshold1):
        return 1
    elif(count2 >= threshold2):
        return 1
    else:
        return 0    
    
def solveSingleFile(inputFile, outputFile1, outputFile0, cut_engine, stopwords, positions_labeled_dict, threshold, use_SEO_algorithm=True):
    fout1 = open(outputFile1, "w", encoding="utf-8")
    fout0 = open(outputFile0, "w", encoding="utf-8")
    with open(inputFile,"rb") as fin:
        line = fin.readline().decode('utf-8').lower().strip()
        while(line):
            # sentence = line.split(sep="\u0001")[1]
            res = anti_SEO_algorithm(line, threshold, cut_engine, stopwords, positions_labeled_dict, use_SEO_algorithm)
            if(res == 1):
                fout1.write(line)
                fout1.write("\n")
            if(res == 0):
                fout0.write(line)
                fout0.write("\n")
            line = fin.readline().decode('utf-8').lower().strip()

In [2]:
# 加载分词用的外部字典
zhiliandict = "./my_dict/zhilian.dict"
jieba.load_userdict(zhiliandict)
stopwords = stopwordslist('./stopwords/stopwords.txt')  # 这里加载停用词的路径

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 2.597 seconds.
Prefix dict has been built succesfully.


In [7]:
# 分词测试
sen = "前端/PHP/IOS/JAVA/UI/VC++/测试/安卓".lower()
sen_lis = seg_sentence(sen, jieba, stopwords)
sen_lis

['前端', 'php', 'ios', 'java', 'ui', 'vc++', '测试', '安卓']

In [3]:
# 逻辑回归
# 01 加载词向量字典
import gensim
model_path = "./model/job_title_word2vec.model"
model = gensim.models.Word2Vec.load(model_path)



In [20]:
import numpy as np
# sen2vec函数
def sen2vec(sen, cut_engine, stopwords, word_vec):
    sen_lis = seg_sentence(sen, cut_engine, stopwords)
    res = np.zeros([150], dtype=np.float32)
    for word in sen_lis:
        if word in word_vec:
            res += word_vec[word]
    return res

In [21]:
# 构建训练集
POS_path = './data/POS.txt'
NEG_path = './data/NEG.txt'
with open(POS_path, encoding='utf-8') as POS_in:
    POS_lis = [sen2vec(line.strip(), jieba, stopwords, model.wv) for line in POS_in]
with open(NEG_path, encoding='utf-8') as NEG_in:
    NEG_lis = [sen2vec(line.strip(), jieba, stopwords, model.wv) for line in NEG_in]
y_POS = [1] * len(POS_lis)
y_NEG = [0] * len(NEG_lis)
X_lis = POS_lis + NEG_lis
y_lis = y_POS + y_NEG
X_train = np.array(X_lis, dtype=np.float32)
y_train = np.array(y_lis, dtype=np.float32)

In [22]:
# 打乱训练集顺序
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train, random_state=0)

In [23]:
# 精确率、召回率、F1
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score
classifier = LogisticRegression(class_weight='balanced')
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [28]:
# 结果评价
precisions = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision') 
print('精确率：', np.mean(precisions), precisions)
recalls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')
print('召回率：', np.mean(recalls), recalls)
f1s = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
print('综合评价指标：', np.mean(f1s), f1s)

精确率： 0.647606625335 [ 0.64868701  0.63964912  0.65509761  0.64415215  0.65044723]
召回率： 0.977951899217 [ 0.9827957   0.98010753  0.97419355  0.97471759  0.97794513]
综合评价指标： 0.779194955631 [ 0.78153057  0.77409766  0.78339818  0.77568493  0.78126343]


In [29]:
# 保存LR训练好的模型
from sklearn.externals import joblib
joblib.dump(classifier, "./LR_model/lr_model_final2.m")

['./LR_model/lr_model_final2.m']

In [25]:
# LR 预测函数
def LR_predict(sen, lr_model, cut_engine, stopwords, word_vec):
    sen_vec = sen2vec(sen, cut_engine, stopwords, word_vec).reshape(1,150)
    return lr_model.predict(sen_vec)[0]

In [26]:
def solveSingleFile_LR(inputFile, outputFile1, outputFile0, lr_model, cut_engine, stopwords, word_vec):
    fout1 = open(outputFile1, "w", encoding="utf-8")
    fout0 = open(outputFile0, "w", encoding="utf-8")
    with open(inputFile,"rb") as fin:
        line = fin.readline().decode('utf-8').lower().strip()
        while(line):
            res = LR_predict(line, lr_model, cut_engine, stopwords, word_vec)
            if(res == 1):
                fout1.write(line)
                fout1.write("\n")
            if(res == 0):
                fout0.write(line)
                fout0.write("\n")
            line = fin.readline().decode('utf-8').lower().strip()

In [30]:
inputFile = "./data/NEG_test.txt"
outputFile1 = "./data/NEG_test_1_new.txt"
outputFile0 = "./data/NEG_test_0_new.txt"
solveSingleFile_LR(inputFile,outputFile1,outputFile0,classifier,jieba,stopwords,model.wv)

In [31]:
positions_labeled = [[line.strip().split(" ")[0], int(line.strip().split(" ")[1])] for line in open("./positions_kmeans_result/positions_labeled_115_0926.txt", 'r', encoding='utf-8').readlines()]
positions_labeled_dict = dict(positions_labeled) 

In [19]:
len(positions_labeled_dict)

2109

In [84]:
sen1 = "前端/PHP/IOS/JAVA/UI/VC++/测试/安卓"
sen2 = "高级Java开发工程师（中关村"
sen = "开发2部 Java编程实习生+五险一金+包住宿"
combination_strategy_SEO_title_recognition(sen, 3, 4, jieba, stopwords, positions_labeled_dict)

1

In [12]:
java_title_list = [line.strip() for line in open("./job_title/000001_0", 'r', encoding='utf-8').readlines()]
fout1 = open("./job_title/000001_1", "w", encoding="utf-8")
fout0 = open("./job_title/000001_0", "w", encoding="utf-8")
for line in java_title_list:
    res = combination_strategy_SEO_title_recognition(line, 3, 5, jieba, stopwords, positions_labeled_dict)
    if(res == 1):
        fout1.write(line)
        fout1.write("\n")
    if(res == 0):
        fout0.write(line)
        fout0.write("\n")

In [77]:
java_title_list

['\ufeffJAVA讲师',
 'Java开发工程师',
 '前端/PHP/IOS/JAVA/UI/VC++/测试/安卓',
 'PHP中级工程师',
 'Java工程师',
 'Java研发工程师',
 'Java开发工程师（实习生',
 '高级Java开发工程师（中关村）',
 '高级java工程师（技术中心）',
 '开发2部 Java编程实习生+五险一金+包住宿',
 'Java开发工程师包住宿',
 '研发部二 招聘Java初级工程师',
 '研发三部高薪Java开发(无经验/转行均可）',
 '高级JAVA开发工程师',
 '中级/高级Java工程师',
 'java开发工程师',
 'java开发工程师',
 'java开发工程师',
 '开发2部170 java软件开发实习生高薪定岗',
 '高级java开发工程师',
 'JAVA高级开发工程师（云平台开发）',
 'JAVA开发工程师',
 'java代码工程师（信息安全方向）',
 '高级JAVA开发工程师',
 'Java工程师',
 '高级JAVA开发工程师',
 'Java开发工程师（初中高级）',
 'JAVA软件开发实习生转正8千起薪',
 'JAVA华锐科技定岗实习生+转正五险一金',
 'java工程师（寿险核心开发）',
 'JAVA高级开发工程师',
 'Java开发工程师',
 'JAVA高级工程师',
 'Java高级工程师',
 'Java工程师',
 'Java 技术经理/技术专家',
 '高级Java工程师',
 'java开发工程师',
 '招聘软件专业 ★应届生实习 ★0基础培养JAVA★ 包住宿',
 'Java高级工程师',
 'Java高级工程师',
 'java高级工程师',
 'Web前端开发（Java基础）',
 'Cisco Software Engineer (Java)--英语流利',
 '项目组2 招java软件开发+年底双薪',
 '研发部1 8kJava软件开发工程师',
 'JAVA开发工程师',
 'JAVA开发工程师（石景山区）',
 'Java工程师',
 'java初级开发工程师',
 '转行IT（JAVA软件开发） 双休+4K-6K',
 '诚聘Java开发人员',
 'Java高级开发工程师',
 '急聘JAVA开发工程师

In [34]:
# inputFile = "./job_title/000001_0"
# outputFile1 = "./job_title/000001_0_1"
# outputFile0 = "./job_title/000001_0_0"
inputFile = "./data/NEG_test_1_new.txt"
outputFile1 = "./data/NEG_test_1_new_1_3.txt"
outputFile0 = "./data/NEG_test_1_new_0_3.txt"
solveSingleFile(inputFile, outputFile1, outputFile0, jieba, stopwords, positions_labeled_dict, 4)