In [0]:
import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.preprocessing import normalize

In [0]:
# Path of the raw training sample that the loading cell reads.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is changed.
filename = 'C:/Users/Youmin/Desktop/train_data.sample'

In [0]:
# Step 1: parse the raw file into a DataFrame of basic features.
# Every non-blank line must hold exactly five comma-separated fields:
#   query_id, query, query_title_id, title, label
# query and title are split on single spaces into token lists.
data_ = []
with open(filename, 'r') as f:
    # Iterate the file object directly so the whole file is never held in
    # memory as a list of lines.
    for line_no, raw_line in enumerate(f, start=1):
        if not raw_line.strip():
            # A blank (e.g. trailing) line would otherwise crash on int('').
            continue
        fields = raw_line.split(',')
        if len(fields) != 5:
            raise ValueError(
                'line %d: expected 5 comma-separated fields, got %d'
                % (line_no, len(fields)))

        query = fields[1].split(' ')
        title = fields[3].split(' ')
        data_.append([
            int(fields[0]),   # query_id
            query,            # query tokens
            int(fields[2]),   # query_title_id
            title,            # title tokens
            int(fields[4]),   # label; int() ignores the trailing '\t\n'
            len(query),       # query_length
            len(title),       # title_length
        ])

# Six feature columns plus the label.
data = pd.DataFrame(
    data_,
    columns=['query_id', 'query', 'query_title_id', 'title', 'label',
             'query_length', 'title_length'])

In [0]:
# Statistical features derived from groupby aggregations.

# Stat 1: mean title_length for every (label, query_length) pair, reshaped so
# that rows are labels and columns are query lengths.
label_key = data['label']
grouped_length = data['title_length'].groupby([label_key, data['query_length']])
groupby_mean_mat = grouped_length.mean().unstack()

# Stat 2: per-query_id click rate. Count rows for every (label, query_id)
# pair, put labels on the rows, and treat missing combinations as 0.
grouped_click_id = data['query_id'].groupby([label_key, data['query_id']])
per_label_counts = grouped_click_id.count().unstack()
groupby_click_mat = per_label_counts.fillna(0)
clicked = groupby_click_mat.loc[1]            # rows with label == 1, per query_id
groupby_click_sum = groupby_click_mat.sum()   # all rows, per query_id
prob_click_id = (clicked / groupby_click_sum).round(3)  # keep 3 decimals


def click_prob(x):
    """Return the rounded click rate stored for query_id ``x``."""
    return prob_click_id[x]


def group_sum(x):
    """Return the total row count stored for query_id ``x``."""
    return groupby_click_sum[x]


data['click_prob'] = data['query_id'].map(click_prob)
data['group_sum'] = data['query_id'].map(group_sum)

# Raw (un-normalised) values: the mean title length observed for this row's
# query_length among label-0 rows and among label-1 rows.
data['0_prob_titlen'] = data['query_length'].map(lambda x: groupby_mean_mat.loc[0, x])
data['1_prob_titlen'] = data['query_length'].map(lambda x: groupby_mean_mat.loc[1, x])

# groupby_mean_mat has the following layout (column-wise normalisation could
# be considered):
#    query_length        1          2          3    ...   55    198        300
#    label                                          ...
#    0             12.868735  13.179517  13.449843  ...  17.0  11.0  14.300000
#    1             13.296875  13.525405  13.538568  ...  12.0   4.0  13.666667
# End of the statistical features.

NameError: ignored

In [0]:
'''
#基于embedding的距离特征
'''

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from scipy.linalg import norm
import gensim

In [0]:
# 1. Jaccard coefficient
def jaccard_similarity(s1,s2):
    """Count-based Jaccard coefficient of two sequences.

    Each argument is turned into one space-separated document with
    ``' '.join(list(s))`` (a plain string is therefore split into characters,
    a list of tokens is joined as-is). Both documents are vectorised with a
    whitespace-tokenising ``CountVectorizer``.

    Returns the sum of the element-wise minima of the two count vectors
    divided by the sum of their element-wise maxima.
    """
    documents = [' '.join(list(seq)) for seq in (s1, s2)]
    vectorizer = CountVectorizer(tokenizer=lambda s: s.split())
    counts = vectorizer.fit_transform(documents).toarray()
    shared = np.sum(np.min(counts, axis = 0))
    combined = np.sum(np.max(counts, axis = 0))
    return 1.0 * shared / combined

In [0]:
# 2. Term-frequency (TF) cosine similarity
def tf_similarity(s1, s2):
    """Cosine similarity between the token-count vectors of two sequences.

    Each argument is turned into one space-separated document with
    ``' '.join(list(s))`` and both documents are vectorised with a
    whitespace-tokenising ``CountVectorizer``.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either count vector
        is all zeros (the cosine is undefined there; previously this produced
        NaN together with a RuntimeWarning).
    """
    def add_space(s):
        return ' '.join(list(s))

    s1, s2 = add_space(s1), add_space(s2)
    cv = CountVectorizer(tokenizer = lambda s: s.split())
    vectors = cv.fit_transform([s1, s2]).toarray()
    denominator = norm(vectors[0]) * norm(vectors[1])
    if denominator == 0:
        # One side has no tokens: define the similarity as 0 instead of 0/0.
        return 0.0
    return np.dot(vectors[0], vectors[1]) / denominator

In [0]:
# 3. TF-IDF cosine similarity
def tfidf_similarity(s1, s2):
    """Cosine similarity between the TF-IDF vectors of two sequences.

    Each argument is turned into one space-separated document with
    ``' '.join(list(s))``; a whitespace-tokenising ``TfidfVectorizer`` is
    fitted on just these two documents.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector is all
        zeros (the cosine is undefined there; previously this produced NaN
        together with a RuntimeWarning).
    """
    def add_space(s):
        return ' '.join(list(s))

    s1, s2 = add_space(s1), add_space(s2)
    cv = TfidfVectorizer(tokenizer=lambda s: s.split())
    vectors = cv.fit_transform([s1, s2]).toarray()
    denominator = norm(vectors[0]) * norm(vectors[1])
    if denominator == 0:
        # One side has no tokens: define the similarity as 0 instead of 0/0.
        return 0.0
    return np.dot(vectors[0], vectors[1]) / denominator

In [0]:
# 4. word2vec similarity (disabled: everything in this cell is commented out)
# Load the model
#model_file = 'C:/Users/Youmin/Desktop/w2vmodel_title'
#model = gensim.models.Word2Vec.load(model_file)
#
#def vector_similariy(s1,s2):
#    def sentence_vector(s):
#        words = 原来用的结巴.lcut(s)
#        v = np.zeros(64)
#        for word in words:
#            v += model[word]
#        v /= len(words)
#        return v
#    v1, v2 = sentence_vector(s1), sentence_vector(s2)
#    return np.dot(v1, v2) / (norm(v1)*norm(v2))

In [0]:
def f(x):
    """Mean similarity between one title and the clicked titles of query ``m``.

    Reads two module-level names: ``data`` (the feature DataFrame) and ``m``
    (the query_id currently being processed; the calling loop must set it
    before mapping this function over titles).

    Parameters
    ----------
    x
        A title in the form stored in ``data['title']``.

    Returns
    -------
    tuple
        ``(jaccard, tf_, tfidf)``: the means of ``jaccard_similarity``,
        ``tf_similarity`` and ``tfidf_similarity`` between ``x`` and every
        title of query ``m`` whose label is 1. All three are NaN when the
        query has no such title.
    """
    # Titles of the current query that were clicked (label == 1).
    # NOTE(review): this mask does not exclude the row ``x`` came from, so a
    # clicked title is also compared with itself -- confirm this is intended.
    clicked_mask = (data['query_id'] == m) & (data['label'] == 1)
    clicked_titles = data.loc[clicked_mask, 'title']

    if clicked_titles.empty:
        # Same value np.mean([]) would give, without its RuntimeWarning.
        return np.nan, np.nan, np.nan

    jaccard_dist = [jaccard_similarity(x, title) for title in clicked_titles]
    tf_dist = [tf_similarity(x, title) for title in clicked_titles]
    tfidf_dist = [tfidf_similarity(x, title) for title in clicked_titles]

    # Return the three mean similarities.
    return np.mean(jaccard_dist), np.mean(tf_dist), np.mean(tfidf_dist)
    
# f() reads the module-level ``m`` to know which query's clicked titles to
# compare against, so ``m`` must be set to each query_id in turn.
# Iterate over the query ids that actually occur instead of assuming they are
# exactly 1..N; with the old range-based loop any other id silently ended up
# with NaN features.
dist_parts = []
for m in data['query_id'].unique():
    dist_seq = data.loc[data['query_id'] == m, 'title'].map(f)
    dist_parts.append(dist_seq)

# Concatenate once at the end rather than growing a Series inside the loop.
temp = pd.concat(dist_parts) if dist_parts else pd.Series(dtype=object)

# Each element of temp is the (jaccard, tf, tfidf) tuple returned by f();
# assignment aligns on the index, so row order in temp does not matter.
data['jaccard_dist'] = temp.map(lambda x: x[0])
data['tf_dist'] = temp.map(lambda x: x[1])
data['tfidf_dist'] = temp.map(lambda x: x[2])

In [0]:
# Shrink the memory footprint of numeric columns by downcasting their dtypes.
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place and return ``df``.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are touched. An integer column is cast to the first of int8,
    int16, int32, int64 whose range strictly contains the column's min and
    max; if none does, it is left unchanged. A float column is cast to
    float16 or float32 under the same strict test, otherwise to float64.

    When ``verbose`` is true, the resulting memory usage and the percentage
    reduction are printed.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First integer type whose limits lie strictly outside [min, max].
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when neither smaller float type fits.
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)
    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


In [0]:
# Corpus features: a TF-IDF model and per-title LDA topic probabilities.

from gensim import corpora
from gensim import models

# Texts the dictionary / corpus are built from. Restrict this (e.g. slice it)
# to control how much material the corpus is extracted from.
texts = data['title']
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf_corpus = models.TfidfModel(corpus)
# tfidf_corpus is the fitted TF-IDF model; index it with a bag-of-words,
# e.g. tfidf_corpus[dictionary.doc2bow(tokens)].
# Disabled example of a per-title TF-IDF feature:
#def corpus_map(x):
#    l = dictionary.doc2bow(x)
#    return tfidf_corpus[l]
#data['title_corpus'] = data['title'].map(corpus_map)


# LDA topic features
# Number of topics to fit.
num_topic = 100
from gensim.models.ldamodel import LdaModel
lda = LdaModel(corpus,num_topics=num_topic)
def lda_map(x):
    """Return the LDA topics of one tokenised title as (topic_id, prob) pairs."""
    l = dictionary.doc2bow(x)
    return lda[l]
temp_ = data['title'].map(lda_map) # mapped over the whole data set

# Build a dense (rows x num_topic) matrix and place every probability in the
# column of its own topic id. The model returns only some of the topics for a
# document; topics that are not returned stay at 0. Filling columns
# positionally would mislabel the topics and would not always yield
# num_topic columns. This also avoids Series.iteritems(), which was removed
# in pandas 2.0.
topic_probs = np.zeros((len(temp_), num_topic))
for row_pos, doc_topics in enumerate(temp_):
    for topic_id, prob in doc_topics:
        topic_probs[row_pos, topic_id] = prob

l_ = list(range(num_topic))
l = [str(x) for x in l_]
temp_ = pd.DataFrame(topic_probs, index=data.index, columns=l)

# NOTE(review): re-running this cell appends the topic columns again.
data = pd.concat([data,temp_],axis=1)

'''


  至此所有数据特征构建完毕



'''

In [0]:
'''

记得划分数据集用来提炼语料库，大致为1000万文本提100个关键词

其次是训练过程中，将1亿样本分批抽样训练10组，每组1000+100

'''

In [0]:
# Model training and prediction output
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Target vector and feature matrix: every column except the label and the
# two raw token-list columns.
target = data['label'].values
data_input = data.drop(['label','query','title'],axis=1).values

# Hold out 20% of the rows as a dev set.
X_train,X_dev,y_train,y_dev = train_test_split(data_input,target,test_size=0.2)


# LightGBM binary classifier settings
param = {
    'num_leaves':150,
    'objective':'binary',
    'max_depth':7,
    'learning_rate':.05,
    'max_bin':200,
    'metric':['auc', 'binary_logloss'],
}

train_data = lgb.Dataset(X_train,label=y_train)

num_round = 50
lgbm = lgb.train(param,train_data,num_boost_round=num_round)


ypred2 = lgbm.predict(X_dev)

# The first two columns of the feature matrix are taken as query_id and
# query_title_id; they identify each prediction in the output file.
output = pd.DataFrame({
    'query_id': X_dev[:,0],
    'query_title_id': X_dev[:,1],
    'prediction': ypred2,
})
output = output.sort_values(by=['query_id','query_title_id']).reset_index(drop=True)

output.to_csv('.test.csv',index=False,header=False,sep=",")