In [None]:
import pandas as pd
import numpy as np
import time
from fuzzywuzzy import fuzz
import Levenshtein
import pyemd
import gensim
from gensim.models import Word2Vec
from scipy.spatial.distance import minkowski, correlation, cityblock, braycurtis, cosine, euclidean

# Input CSV with the raw query/title rows; read in chunks by process_pool_test.
train_path = "/home/kesci/temporary/train_tail_50M.csv"
# Output CSV that process_pool_test writes the extracted features to.
train_feature_path = "/home/kesci/work/train_tail_50M_features.csv"

In [None]:
# Number of worker processes passed to ProcessPoolExecutor in process_pool_test.
PROCESS_NUM = 7

# ******************** 特征处理接口 ********************
def cal_length(string_query, string_title):
    """Token-count features for a query/title pair.

    Both strings have tab characters removed and are then split on single
    spaces; the number of resulting pieces is the "length" of each text.

    Returns:
        (query_length, title_length, |query_length - title_length|,
         query_length / title_length)
    """
    n_query, n_title = (
        len(text.replace("\t", "").split(" "))
        for text in (string_query, string_title)
    )
    return n_query, n_title, abs(n_query - n_title), n_query / n_title

def longest_common_subsequence(string_query, string_title):
    """Longest-common-subsequence features over space-separated tokens.

    Fills the classic LCS dynamic-programming table (one extra zero row and
    column), then asks cal_start_and_end for the table positions where the
    subsequence starts and where it reaches its full length.

    Returns:
        (lcs_length,
         (query_end - query_start) / query_token_count,
         (title_end - title_start) / title_token_count,
         lcs_length / title_token_count)
    """
    query_tokens = string_query.replace("\t", "").split(" ")
    title_tokens = string_title.replace("\t", "").split(" ")
    n_query, n_title = len(query_tokens), len(title_tokens)

    table = np.zeros((n_query + 1, n_title + 1))
    for i, query_token in enumerate(query_tokens, start=1):
        for j, title_token in enumerate(title_tokens, start=1):
            if query_token == title_token:
                # Extend the subsequence that ended just before both tokens.
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                # Carry over the better of "skip query token" / "skip title token".
                table[i][j] = max(table[i - 1][j], table[i][j - 1])

    lcs_length = table.max()
    q_start, q_end, t_start, t_end = cal_start_and_end(table, int(lcs_length))
    return (
        lcs_length,
        (q_end - q_start) / n_query,
        (t_end - t_start) / n_title,
        lcs_length / n_title,
    )

def cal_start_and_end(value, len):
    """Locate where a DP table first becomes non-zero and where it reaches `len`.

    The table is scanned row by row (skipping row 0 and column 0). Every time
    a cell exceeds the largest value seen so far, that counts as one "step":
    the first step fixes the start position, and the `len`-th step fixes the
    end position.

    Args:
        value: 2-D array (anything with `.shape` and `[i][j]` indexing).
        len: number of steps after which the end position is recorded.
            (The name shadows the builtin; it is kept so existing callers
            are unaffected.)

    Returns:
        (query_start, query_end, title_start, title_end) as row/column
        indices into `value`. Positions that are never reached stay 0.
    """
    query_start = 0
    title_start = 0
    largest_seen = 0
    steps = 0
    n_rows, n_cols = value.shape
    for i in range(1, n_rows):
        for j in range(1, n_cols):
            cell = int(value[i][j])
            if cell > largest_seen:
                if steps == 0:
                    query_start, title_start = i, j
                steps += 1
                largest_seen = cell
                if steps == len:
                    # The original `break` only left the inner loop and then
                    # kept scanning the remaining rows without ever changing
                    # the result; returning here gives the same answer.
                    return query_start, i, title_start, j
    return query_start, 0, title_start, 0

def longest_common_substring(string_query, string_title):
    """Longest-common-substring (contiguous token run) features.

    Two tables are filled in parallel: `run_length[i][j]` is the length of the
    matching run ending at query token i / title token j, and
    `position_sum[i][j]` is the sum of the 1-based title positions along that
    run. Cells where the tokens differ stay 0.

    Returns:
        (max_run_length, start_location, mean_location, total_loc,
         den_ratio, max_run_length / query_len, max_run_length / title_len)
        where all fields after the first are 0 when there is no common token.
    """
    query_tokens = string_query.replace("\t", "").split(" ")
    title_tokens = string_title.replace("\t", "").split(" ")
    n_query, n_title = len(query_tokens), len(title_tokens)

    run_length = np.zeros((n_query + 1, n_title + 1))
    position_sum = np.zeros((n_query + 1, n_title + 1))
    for i, query_token in enumerate(query_tokens, start=1):
        for j, title_token in enumerate(title_tokens, start=1):
            if query_token == title_token:
                run_length[i][j] = run_length[i - 1][j - 1] + 1
                position_sum[i][j] = position_sum[i - 1][j - 1] + j
            # Mismatching cells keep the 0 they were initialised with.

    longest = run_length.max()
    if longest == 0:
        # No shared token at all; also avoids dividing by a zero run length.
        return longest, 0, 0, 0, 0, 0, 0

    # First cell (row-major order) at which the longest run ends.
    peak_rows, peak_cols = np.where(run_length == longest)
    row, column = int(peak_rows[0]), int(peak_cols[0])
    start_location = (column - longest + 1) / n_title
    mean_location = position_sum[row][column] / (longest * n_title)

    # Accumulate the position sums of every cell that is part of some run.
    total_loc = 0
    for r, c in zip(*np.where(run_length != 0.0)):
        total_loc += position_sum[r][c]
    den_ratio = total_loc / (n_query * n_title)

    return (
        longest,
        start_location,
        mean_location,
        total_loc,
        den_ratio,
        longest / n_query,
        longest / n_title,
    )

def common_words(string_query, string_title):
    """Shared-token statistics for a query/title pair.

    Texts are tokenised by removing tabs and splitting on single spaces.
    If no title token occurs in the query, twelve zeros are returned.

    Returns a 12-tuple:
        unique_ratio        distinct shared tokens / distinct tokens overall
        qt_length_ratio     shared occurrences in both texts / total tokens
        com_in_q, com_in_t  shared title occurrences / query (title) length
        query_location,
        title_location      (hits + sum of all positions of shared tokens)
                            / (text length * hits)
        com_set_in_q,
        com_set_in_t        distinct shared / distinct query (title) tokens
        qt_set_ratio        2 * distinct shared / (distinct query + distinct title)
        query_set_location,
        title_set_location  (distinct shared + sum of first positions)
                            / (text length * distinct shared)
        set_ratio           distinct shared / shared title occurrences
    """
    query_tokens = string_query.replace("\t", "").split(" ")
    title_tokens = string_title.replace("\t", "").split(" ")
    query_vocab, title_vocab = set(query_tokens), set(title_tokens)

    # Title occurrences that also appear in the query (duplicates kept).
    title_hits = [token for token in title_tokens if token in query_vocab]
    if not title_hits:
        return (0,) * 12

    query_hits = [token for token in query_tokens if token in title_vocab]
    shared_vocab = set(title_hits)

    n_query, n_title = len(query_tokens), len(title_tokens)
    n_hits, n_shared = len(title_hits), len(shared_vocab)

    unique_ratio = n_shared / len(query_vocab | title_vocab)
    qt_length_ratio = (len(query_hits) + n_hits) / (n_query + n_title)
    com_in_q = n_hits / n_query
    com_in_t = n_hits / n_title

    def all_positions(tokens):
        # Sum of the 0-based positions of every occurrence of a shared token.
        return sum(pos for pos, token in enumerate(tokens) if token in shared_vocab)

    def first_positions(tokens):
        # Sum of the 0-based position of the first occurrence of each shared token.
        return sum(tokens.index(token) for token in shared_vocab)

    query_location = (n_hits + all_positions(query_tokens)) / (n_query * n_hits)
    title_location = (n_hits + all_positions(title_tokens)) / (n_title * n_hits)

    com_set_in_q = n_shared / len(query_vocab)
    com_set_in_t = n_shared / len(title_vocab)
    qt_set_ratio = 2 * n_shared / (len(query_vocab) + len(title_vocab))

    query_set_location = (n_shared + first_positions(query_tokens)) / (n_query * n_shared)
    title_set_location = (n_shared + first_positions(title_tokens)) / (n_title * n_shared)
    set_ratio = n_shared / n_hits

    return (
        unique_ratio, qt_length_ratio, com_in_q, com_in_t,
        query_location, title_location,
        com_set_in_q, com_set_in_t, qt_set_ratio,
        query_set_location, title_set_location, set_ratio,
    )

def cal_fuzz_ratio(string_query, string_title):
    """fuzzywuzzy features: (token_sort_ratio, token_set_ratio) of the raw strings."""
    return (
        fuzz.token_sort_ratio(string_query, string_title),
        fuzz.token_set_ratio(string_query, string_title),
    )

def cal_levenshtein(string_query, string_title):
    """Levenshtein features of the raw strings: (jaro, distance, ratio)."""
    return (
        Levenshtein.jaro(string_query, string_title),
        Levenshtein.distance(string_query, string_title),
        Levenshtein.ratio(string_query, string_title),
    )

def jaccard_sim(string_query, string_title):
    """Jaccard similarity of the two token sets: |intersection| / |union|.

    Tokens are obtained by removing tabs and splitting on single spaces.
    """
    query_vocab = set(string_query.replace("\t", "").split(" "))
    title_vocab = set(string_title.replace("\t", "").split(" "))
    return len(query_vocab & title_vocab) / len(query_vocab | title_vocab)

def cal_words_movement(string_query, string_title):
    """Return `word_vectors.wmdistance` between the query and title token lists.

    NOTE(review): `word_vectors` is a module-level object that is not defined
    in this part of the notebook — presumably gensim word vectors loaded in
    another cell; confirm it exists before this is called.
    """
    query_tokens, title_tokens = (
        text.replace("\t", "").split(" ")
        for text in (string_query, string_title)
    )
    return word_vectors.wmdistance(query_tokens, title_tokens)
def extract_unique_words(string_query, string_title):
    """Split the pair into tokens that occur on one side only.

    Returns:
        (query_only, title_only): the query tokens that never appear in the
        title and the title tokens that never appear in the query, in their
        original order and with duplicates kept.
    """
    query_tokens = string_query.replace("\t", "").split(" ")
    title_tokens = string_title.replace("\t", "").split(" ")
    query_vocab, title_vocab = set(query_tokens), set(title_tokens)

    query_only = [token for token in query_tokens if token not in title_vocab]
    title_only = [token for token in title_tokens if token not in query_vocab]
    return query_only, title_only
def cal_nlp_sim(string_query, string_title):
    """Vector-space similarity/distance features between query and title.

    Each text is represented by the mean of the `word_vectors` entries of its
    tokens (tokens missing from `word_vectors` are skipped); a text with no
    known token is represented by a 150-dimensional float32 zero vector.

    NOTE(review): `word_vectors` is not defined in this part of the notebook,
    and the fallback assumes its vectors are 150-dimensional — confirm.

    Returns:
        (dot_product, braycurtis, cityblock, correlation, cosine, euclidean,
         minkowski with p=3) computed on the two mean vectors.
    """
    def mean_embedding(tokens):
        known = np.array([word_vectors[token] for token in tokens if token in word_vectors])
        if len(known) != 0:
            return known.sum(axis=0) / known.shape[0]
        return np.zeros((150,), dtype=np.float32)

    q_sentence_vec = mean_embedding(string_query.replace("\t", "").split(" "))
    t_sentence_vec = mean_embedding(string_title.replace("\t", "").split(" "))

    return (
        np.sum(np.multiply(q_sentence_vec, t_sentence_vec)),  # dot product
        braycurtis(q_sentence_vec, t_sentence_vec),
        cityblock(q_sentence_vec, t_sentence_vec),
        correlation(q_sentence_vec, t_sentence_vec),
        cosine(q_sentence_vec, t_sentence_vec),
        euclidean(q_sentence_vec, t_sentence_vec),
        minkowski(q_sentence_vec, t_sentence_vec, p=3),
    )

def cal_nlp_dis(string_query, string_title):
    """Vector-space features computed only on the tokens the two texts do NOT share.

    The tokens unique to each side (see extract_unique_words) are averaged
    into one vector per side using `word_vectors`; a side with no known
    unique token falls back to a 150-dimensional float32 zero vector.

    Any metric that comes out as NaN (which can happen when a side is the
    zero vector) is replaced by the sentinel value 2. The original code tried
    to do this with `value == 'nan'`, which compares a float with a string
    and is never true, so NaNs leaked into the features.

    NOTE(review): `word_vectors` is not defined in this part of the notebook,
    and the fallback assumes its vectors are 150-dimensional — confirm.

    Returns:
        (dot_product, braycurtis, cityblock, correlation, cosine, euclidean,
         minkowski with p=3), NaN-free.
    """
    def mean_embedding(tokens):
        known = np.array([word_vectors[token] for token in tokens if token in word_vectors])
        if len(known) != 0:
            return known.sum(axis=0) / known.shape[0]
        return np.zeros((150,), dtype=np.float32)

    query_only, title_only = extract_unique_words(string_query, string_title)
    q_sentence_vec = mean_embedding(query_only)
    t_sentence_vec = mean_embedding(title_only)

    metrics = (
        np.sum(np.multiply(q_sentence_vec, t_sentence_vec)),  # dot product
        braycurtis(q_sentence_vec, t_sentence_vec),
        cityblock(q_sentence_vec, t_sentence_vec),
        correlation(q_sentence_vec, t_sentence_vec),
        cosine(q_sentence_vec, t_sentence_vec),
        euclidean(q_sentence_vec, t_sentence_vec),
        minkowski(q_sentence_vec, t_sentence_vec, p=3),
    )
    # NaN never compares equal to anything, so it has to be detected with isnan.
    return tuple(2 if np.isnan(metric) else metric for metric in metrics)

In [None]:
# ******************** 特征抽取模块 ********************
def extract_features(query_title):
    """Build the per-row feature frame for one chunk of query/title pairs.

    Args:
        query_title: DataFrame with at least the columns "query_id", "query"
            and "title".

    Returns:
        DataFrame with "query_id" followed by the feature columns, in the
        order they are assigned below. process_pool_test appends these frames
        to a CSV without a header, so the column order matters.
    """
    def pairwise(feature_fn):
        # Apply feature_fn(query, title) to every row of the chunk.
        return query_title.apply(lambda row: feature_fn(row["query"], row["title"]), axis=1)

    df = pd.DataFrame()
    print("length features")
    df["query_id"] = query_title["query_id"]
    df["query_length"], df["title_length"], df["query_length_sub_title_length"], \
    df["query_length_ratio_title_length"] = zip(*pairwise(cal_length))

    print("longest common subsequence features")
    # Fix: the last key used to be `"lcsubsequence_ratio_t_length",` with a
    # trailing comma, which made the column label a 1-tuple instead of a string.
    df["longest_common_subsequence"], df["longest_common_subsequence_query_ratio"], \
    df["longest_common_subsequence_title_ratio"], df["lcsubsequence_ratio_t_length"] = zip(
        *pairwise(longest_common_subsequence))

    print("longest common substring features")
    df["longest_common_substring"], df["lcs_start_location_ratio"], df["lcs_mean_location_ratio"], \
    df["lcs_total_loc_ratio"], df["lcs_dense_ratio"], df["lcstring_ratio_q_length"], \
    df["lcstring_ratio_t_length"] = zip(*pairwise(longest_common_substring))

    print("common words features")
    df["unique_ratio"], \
    df["share_words_qt_length_ratio"], df["query_common_words_len_ratio"], df["title_common_words_len_ratio"], \
    df["query_common_words_loc_ratio"], df["title_common_words_loc_ratio"], \
    df["query_common_set_ratio"], df["title_common_set_ratio"], df["query_title_set_ratio"], \
    df["query_common_set_loc_ratio"], df["title_common_set_loc_ratio"], df["common_set_ratio"] = zip(
        *pairwise(common_words))

    print("nlp features")
    df["fuzz_token_sort_ratio"], df["fuzz_token_set_ratio"] = zip(*pairwise(cal_fuzz_ratio))
    df["levenshtein_jaro"], df["levenshtein_distance"], df["levenshtein_ratio"] = zip(
        *pairwise(cal_levenshtein))
    df["jaccard_sim"] = pairwise(jaccard_sim)
    df["words_movement_distance"] = pairwise(cal_words_movement)
    df["dot_product"], df["braycurtis"], df["cityblock"], df["correlation"], df["cosine"], \
    df["euclidean"], df["minkowski"] = zip(*pairwise(cal_nlp_sim))
    df["dis_dot_product"], df["dis_braycurtis"], df["dis_cityblock"], df["dis_correlation"], \
    df["dis_cosine"], df["dis_euclidean"], df["dis_minkowski"] = zip(*pairwise(cal_nlp_dis))
    return df

In [None]:
# ******************** 内存监控模块 ********************
import psutil
from concurrent.futures import ProcessPoolExecutor, as_completed
def getMemorystate():
    """Return a one-line summary of memory usage: percent used and used/total in MiB."""
    mem = psutil.virtual_memory()
    used_mb = str(int(mem.used / 1024 / 1024)) + "M"
    total_mb = str(int(mem.total / 1024 / 1024)) + "M"
    return "Memory: %5s%% %6s/%s" % (mem.percent, used_mb, total_mb)

# ******************** 多线程模块 ********************
def process_pool_test(data_path, feature_path):
    """Extract features for a CSV of query/title pairs using a process pool.

    The input is read in chunks of 1,000,000 rows (columns 0, 1 and 3 are
    taken as query_id, query and title). Each chunk is passed to
    extract_features in a worker process, and the resulting frames are
    appended, in order, to `feature_path` below a single header row.

    Args:
        data_path: path of the header-less input CSV.
        feature_path: path of the feature CSV to create (overwritten).

    Raises:
        ValueError: if a chunk's feature frame does not have as many columns
            as the header, which would silently misalign the CSV.
    """
    # Header for the output file. The chunk frames are appended without their
    # own header, so this list MUST follow the exact column order produced by
    # extract_features. (The previous list omitted the two query/title length
    # difference/ratio columns and listed two "nunique" columns that
    # extract_features never emits, so every later column was mislabelled.)
    names = ['query_id',

             'query_length',
             'title_length',
             'query_length_sub_title_length',
             'query_length_ratio_title_length',

             'longest_common_subsequence', 'longest_common_subsequence_query_ratio',
             'longest_common_subsequence_title_ratio', "lcsubsequence_ratio_t_length",

             'longest_common_substring', 'lcs_start_location_ratio', 'lcs_mean_location_ratio',
             'lcs_total_loc_ratio', 'lcs_dense_ratio', 'lcstring_ratio_q_length', 'lcstring_ratio_t_length',

             'unique_ratio',
             'share_words_qt_length_ratio', 'query_common_words_len_ratio', 'title_common_words_len_ratio',
             'query_common_words_loc_ratio', 'title_common_words_loc_ratio',
             'query_common_set_ratio', 'title_common_set_ratio', 'query_title_set_ratio',
             'query_common_set_loc_ratio', 'title_common_set_loc_ratio', 'common_set_ratio',

             'fuzz_token_sort_ratio', 'fuzz_token_set_ratio',
             'levenshtein_jaro', 'levenshtein_distance', 'levenshtein_ratio',
             'jaccard_sim',
             'words_movement_distance',
             'dot_product', 'braycurtis', 'cityblock', 'correlation', 'cosine', 'euclidean', 'minkowski',
             'dis_dot_product', 'dis_braycurtis', 'dis_cityblock', 'dis_correlation', 'dis_cosine',
             'dis_euclidean', 'dis_minkowski']

    # Write the header row only; data rows are appended chunk by chunk below.
    pd.DataFrame(columns=names).to_csv(feature_path, index=False, header=True)
    query_title = pd.read_csv(data_path, usecols=[0, 1, 3], chunksize=1000000, header=None,
                              names=["query_id", "query", "title"])
    with ProcessPoolExecutor(PROCESS_NUM) as executor:
        t0 = time.time()
        # map() yields results in submission order, so chunks are written in input order.
        result = executor.map(extract_features, query_title)
        print('finish extract')
        print(getMemorystate())
        for index, res in enumerate(result):
            print(index)
            if res.shape[1] != len(names):
                raise ValueError(
                    "chunk %d has %d feature columns, header has %d"
                    % (index, res.shape[1], len(names)))
            res.to_csv(feature_path, mode='a', header=False, index=False)
            print(getMemorystate())
        elapsed = time.time() - t0
        msg = '\njob finished in {:.2f}s'
        print(msg.format(elapsed))

if __name__ == '__main__':
    # Feature extraction for the training file (typo "peocess" fixed in the banner).
    print("******************* process train tail 50m *******************")
    process_pool_test(train_path, train_feature_path)

In [None]:
#单进程处理模块
# Group-level features that need the whole data set at once and therefore
# cannot be computed chunk by chunk in extract_features:
#   query_length_ratio_title_length
#   mean_title_length
#   query_length_ratio_mean_title_length / title_length_ratio_mean_title_length
#   query_length_sub_mean_title_length  / title_length_sub_mean_title_length
#   query_nunique_title / title_nunique_query
# (query_length_sub_title_length is not computed here; extract_features emits it.)
#
# NOTE(review): `df` and `query_title` are not defined at notebook level in
# the cells shown — presumably the loaded feature frame and the raw
# query/title frame; confirm they are created before this cell runs.
df["query_length_ratio_title_length"] = df["query_length"] / df["title_length"]
df["mean_title_length"] = df.groupby("query_id")["title_length"].transform("mean")
df["query_length_sub_mean_title_length"] = df["query_length"] - df["mean_title_length"]
df["title_length_sub_mean_title_length"] = df["title_length"] - df["mean_title_length"]
df["query_length_ratio_mean_title_length"] = df["query_length"] / df["mean_title_length"]
df["title_length_ratio_mean_title_length"] = df["title_length"] / df["mean_title_length"]
# Bracket selection instead of `.title` / `.query`: `query` is also the name of
# a DataFrame method, so attribute-style column access is easy to break.
df["query_nunique_title"] = query_title.groupby("query")["title"].transform("nunique")
df["title_nunique_query"] = query_title.groupby("title")["query"].transform("nunique")

In [None]:
# 所有
# Feature columns used for training (best recorded configuration).
features_name2 = [
    # length features
    'query_length', 'title_length',
    'query_length_sub_title_length', 'query_length_ratio_title_length',
    'mean_title_length', 'query_length_sub_mean_title_length',
    'title_length_sub_mean_title_length',
    'query_length_ratio_mean_title_length',
    'title_length_ratio_mean_title_length',
    'query_nunique_title', 'title_nunique_query',
    # longest common subsequence
    'longest_common_subsequence',
    'longest_common_subsequence_query_ratio',
    'longest_common_subsequence_title_ratio',
    'lcsubsequence_ratio_t_length',
    # longest common substring
    'longest_common_substring',
    'lcs_start_location_ratio', 'lcs_mean_location_ratio',
    'lcs_total_loc_ratio', 'lcs_dense_ratio',
    'lcstring_ratio_q_length', 'lcstring_ratio_t_length',
    # common words
    'unique_ratio',
    'share_words_qt_length_ratio',
    'query_common_words_len_ratio', 'title_common_words_len_ratio',
    'query_common_words_loc_ratio', 'title_common_words_loc_ratio',
    'query_common_set_ratio', 'title_common_set_ratio',
    'query_title_set_ratio',
    'query_common_set_loc_ratio', 'title_common_set_loc_ratio',
    'common_set_ratio',
    # string / embedding similarity
    'fuzz_token_sort_ratio', 'fuzz_token_set_ratio',
    'levenshtein_jaro', 'levenshtein_distance', 'levenshtein_ratio',
    'jaccard_sim', 'words_movement_distance',
    'dot_product', 'braycurtis', 'cityblock',
    'correlation', 'cosine', 'euclidean', 'minkowski',
    'dis_dot_product', 'dis_braycurtis', 'dis_cityblock',
    'dis_correlation', 'dis_cosine', 'dis_euclidean', 'dis_minkowski',
]

# All columns: identifiers and label, followed by every training feature.
features_name1 = ['query_id', 'title_id', 'label'] + features_name2

# NOTE(review): `features` is not defined in the cells shown — presumably the
# full feature frame including the 'label' column; confirm where it is loaded.
feature_frame = features[features_name2]
features2 = feature_frame.values
print(features2.shape)
label_column = features['label']
labels = label_column.values
print(labels.shape)

In [None]:
# lgb训练
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# `sklearn.externals.joblib` was removed from newer scikit-learn releases;
# prefer the standalone package and fall back to the vendored copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Hold out 10% of the rows for validation / early stopping.
X, val_x, y, val_y = train_test_split(features2, labels, test_size=0.1, random_state=1)
# Drop the un-split arrays to free memory.
del labels
del features2
print("finished")

print("train model")
t0 = time.time()
gbm = lgb.LGBMClassifier(boosting_type="gbdt", objective="binary", num_leaves=127,
                reg_alpha=5,reg_lambda=5, n_estimators=5000, feature_fraction=0.8, bagging_fraction=0.8,
                subsample_freq=1, learning_rate=0.05, random_state=8012, n_jobs=14)
gbm.fit(X, y, eval_metric=["auc"], eval_set=(val_x, val_y), early_stopping_rounds=35)
print(gbm.feature_importances_)
print(time.time()-t0)

# Fix: joblib.dump takes the object first and the file name second; the
# original call passed only the path and raised a TypeError.
# NOTE(review): this path uses ".../temp/" while the input data lives under
# ".../temporary/" — confirm the directory is intended.
joblib.dump(gbm, "/home/kesci/temp/lgb_final_model.pkl")

In [None]:
# Keep column 1 of predict_proba for every test row.
# NOTE(review): `test_features` is not defined in the cells shown — confirm it
# is built with the same feature columns, in the same order, as the training matrix.
predict_result = gbm.predict_proba(test_features)[:, 1]

In [None]:
# Sanity check: shape of the prediction array (shown as the cell output).
predict_result.shape

In [None]:
# 结果写入文件
import csv

# Write one header-less row per prediction: query_id, title_id, score.
# NOTE(review): `df_test` is not defined in the cells shown — confirm its rows
# are in the same order as `predict_result`.
with open("/home/kesci/work/score_file/result_lgb_final.csv", "w", newline="") as result_csvfile:
    score_rows = zip(df_test["query_id"], df_test["title_id"], predict_result)
    csv.writer(result_csvfile).writerows(score_rows)
    print("Finished!!!")

In [None]:
# Count the lines of the written result file as a quick sanity check.
!wc -l /home/kesci/work/score_file/result_lgb_final.csv