## * 数据准备

## * 特征工程
#### - 提取字特征
#### - 提取词特征
#### - 提取图特征
## * 模型建立

# 数据准备

## 将txt文件转换为csv文件

In [1]:
import numpy as np
import pandas as pd
import csv
import gc
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Convert the whitespace-separated embedding dumps into CSV files, one
# embedding row per line. `data1` / `data2` keep the raw lines, as before.
with open('./datasets/char_embed.txt') as reader:
    data1 = reader.readlines()
# newline='' is what the csv module expects for files it writes; without it
# blank rows can appear between records on some platforms.
with open("./datasets/char_embed.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)  # one writer per file, not one per row
    writer.writerows(row.split() for row in data1)

with open('./datasets/word_embed.txt') as reader:
    data2 = reader.readlines()
with open("./datasets/word_embed.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(row.split() for row in data2)

## 读取csv文件，查看内容

In [3]:
# Load the pair lists, the question table and the two embedding tables.
# The embedding CSVs are read with header=None, so their columns are
# labelled 0, 1, 2, ... and column 0 holds the token id.
train_master = pd.read_csv('./datasets/train.csv')
test_master = pd.read_csv('./datasets/test.csv')
question = pd.read_csv('./datasets/question.csv')
char_embed = pd.read_csv('./datasets/char_embed.csv',header=None)
word_embed = pd.read_csv('./datasets/word_embed.csv',header=None)

In [4]:
train_master.head(3)

Unnamed: 0,label,q1,q2
0,1,Q397345,Q538594
1,0,Q193805,Q699273
2,0,Q085471,Q676160


In [5]:
test_master.head(3)

Unnamed: 0,q1,q2
0,Q017571,Q006012
1,Q728241,Q542572
2,Q166997,Q118270


In [6]:
question.head(3)

Unnamed: 0,qid,words,chars
0,Q000000,W05733 W05284 W09158 W14968 W07863,L1128 L1861 L2218 L1796 L1055 L0847 L2927
1,Q000001,W17378 W17534 W03249 W01490 W18802,L2214 L1980 L0156 L1554 L2218 L1861 L3019 L010...
2,Q000002,W17378 W08158 W20171 W11246 W14759,L2214 L2350 L2568 L1969 L2168 L0694 L3012 L256...


In [7]:
char_embed.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,L0000,-0.54646,2.285091,-3.084309,1.064661,-2.09088,0.651496,-2.429877,-2.262385,-1.981884,...,0.207397,1.476373,0.863744,-0.341826,0.433234,-0.730324,0.215955,-0.528452,-0.340528,-2.018747
1,L0001,-9.016356,-3.801084,-7.210567,3.0529,-1.340958,1.395385,-5.482922,-7.407759,-3.611857,...,-0.818142,4.968217,-4.254042,-0.709047,1.288105,-1.222849,-5.521402,-2.653049,1.868731,2.147064
2,L0002,-0.138824,0.219148,-0.890053,0.106301,-0.494364,0.550745,-0.279467,-0.048898,-0.021813,...,0.247383,0.576698,1.261507,0.446992,-0.418965,-0.278471,1.426156,-0.579678,-0.322354,-0.661302


In [8]:
word_embed.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,W00000,0.169316,-0.063898,0.115286,-0.077671,0.067184,0.019339,0.039596,-0.026229,-0.160078,...,0.061151,0.044519,-0.194827,0.122456,0.122785,-0.154153,-0.116578,-0.127786,0.110593,-0.171084
1,W00001,1.548212,-1.052776,1.192632,0.760363,1.594398,1.478917,-1.555349,0.401968,1.588316,...,-1.898932,0.129864,-2.062325,0.068316,0.540282,-1.68262,-0.81629,-1.464458,-0.361792,0.943322
2,W00002,0.934084,0.106135,-0.391749,-0.209661,-0.558696,-0.942362,-0.274353,-0.232077,-1.024267,...,-0.357264,-0.451105,-0.724659,0.525233,0.290343,0.357838,-0.04275,1.315442,-0.167775,-0.393665


# 特征工程

## 提取train集合字特征

### 数据预处理

In [269]:
# Resolve both question ids of every training pair to their `words` string.
# The second join collides with the columns added by the first one, so pandas
# suffixes them: *_x belongs to q1, *_y to q2.
train = (
    pd.DataFrame(train_master)
    .merge(question, how='left', left_on=['q1'], right_on=['qid'])
    .merge(question, how='left', left_on=['q2'], right_on=['qid'])
)
train = train[['label', 'words_x', 'words_y']]
train.columns = ['label', 'q1', 'q2']

In [270]:
# Index the embedding table by token id and build both lookup directions
# between a token and its row position in the table.
word_embed.index = word_embed[0]
word = word_embed.index.values
word_to_index = {token: pos for pos, token in enumerate(word)}
index_to_word = dict(enumerate(word))

In [271]:
# Raw token strings of both sides plus the sizes used by the index matrices.
train_q1, train_q2 = train.q1.values, train.q2.values
train_shape_q1, train_shape_q2 = train_q1.shape[0], train_q2.shape[0]
max_len, embed_dim = 20, 300

In [272]:
# Encode every q1 sentence as a fixed-width row of embedding indices:
# longer sentences are cut at max_len, shorter ones keep trailing zeros.
train_q1_indices = np.zeros((train_shape_q1, max_len))
for row, sentence in zip(range(train_shape_q1), train_q1):
    for col, token in enumerate(sentence.split(' ')[:max_len]):
        train_q1_indices[row, col] = word_to_index[token]

In [273]:
# Same fixed-width index encoding as for q1, applied to the q2 sentences.
train_q2_indices = np.zeros((train_shape_q2, max_len))
for row, sentence in zip(range(train_shape_q2), train_q2):
    for col, token in enumerate(sentence.split(' ')[:max_len]):
        train_q2_indices[row, col] = word_to_index[token]

### 计算q1，q2长度差异

In [274]:
# Relative length difference per pair: |len(q1) - len(q2)| / max(len(q1), len(q2)),
# where the length is the number of space-separated tokens.
merge = train[['q1', 'q2']]
q1_len = merge['q1'].map(lambda text: len(text.split(' '))).values
q2_len = merge['q2'].map(lambda text: len(text.split(' '))).values
len_diff = np.abs(q1_len - q2_len) / np.maximum(q1_len, q2_len)

[0.         0.         0.28571429 ... 0.14285714 0.3        0.5       ]


### 计算q1，q2中相同的字的个数

In [276]:
# Number of distinct tokens that occur on both sides of each pair.
q1_word_set = merge['q1'].map(lambda text: set(text.split(' '))).values
q2_word_set = merge['q2'].map(lambda text: set(text.split(' '))).values
result = pd.DataFrame([len(left & right) for left, right in zip(q1_word_set, q2_word_set)])
result.columns = ['num_common_words']

### 计算共现字比例

In [277]:
# Share of distinct common tokens relative to the longer question of the pair.
ratio = [
    len(left & right) / max(n_left, n_right)
    for left, right, n_left, n_right in zip(q1_word_set, q2_word_set, q1_len, q2_len)
]
ratio = pd.DataFrame(ratio)
# Name the ratio frame itself; the previous code renamed `result`, which
# clobbered its 'num_common_words' label and left `ratio` unnamed.
ratio.columns = ['common_word_ratio']

### 计算tf-idf字向量

In [278]:
# Fit TF-IDF on every question's token string, then project both sides of the
# training pairs. (The former bare `list(...)` / `sorted(...)` statements were
# dropped: their results were discarded.)
vectorizer = TfidfVectorizer().fit(question.words.values)
q1_tfidf = vectorizer.transform(merge.q1.values)
q2_tfidf = vectorizer.transform(merge.q2.values)

### 根据tf-idf系数调整共现字比例

In [285]:
# TF-IDF-weighted share of the tokens common to q1 and q2, one value per pair:
#   (weight of shared tokens in q1 + weight of shared tokens in q2)
#   / (weight of all tokens in q1 + weight of all tokens in q2)
# where a token's weight is its in-sentence count times its TF-IDF value.
#
# The TF-IDF column of a token is looked up in the fitted vectorizer's own
# vocabulary. The earlier code indexed the matrix with `word_to_index` (row
# positions in the embedding table), which is a different numbering and needed
# a hard-coded `!= 20890` guard on the q1 side only.
adjusted_common_word_ratio = []
tfidf_vocab = vectorizer.vocabulary_
fold_case = vectorizer.lowercase  # vocabulary keys are lower-cased when this is set
for i in range(0, q1_tfidf.shape[0]):
    q1words = {}
    q2words = {}
    for tok in merge.loc[i, 'q1'].split():
        q1words[tok] = q1words.get(tok, 0) + 1
    for tok in merge.loc[i, 'q2'].split():
        q2words[tok] = q2words.get(tok, 0) + 1

    q1_row = q1_tfidf[i]
    q2_row = q2_tfidf[i]

    sum1 = 0.0
    sum_shared_word_in_q1 = 0.0
    for w, n in q1words.items():
        col = tfidf_vocab.get(w.lower() if fold_case else w)
        if col is None:
            continue  # token unknown to the vectorizer: contributes no weight
        weighted = n * q1_row[0, col]
        sum1 += weighted
        if w in q2words:
            sum_shared_word_in_q1 += weighted

    sum2 = 0.0
    sum_shared_word_in_q2 = 0.0
    for w, n in q2words.items():
        col = tfidf_vocab.get(w.lower() if fold_case else w)
        if col is None:
            continue
        weighted = n * q2_row[0, col]
        sum2 += weighted
        if w in q1words:
            sum_shared_word_in_q2 += weighted

    sum_tol = sum1 + sum2
    if sum_tol < 1e-6:
        # No usable weight on either side: define the ratio as 0.
        adjusted_common_word_ratio.append(0.0)
    else:
        adjusted_common_word_ratio.append(1.0 * (sum_shared_word_in_q1 + sum_shared_word_in_q2) / sum_tol)

### 计算字影响力

In [286]:
# Per-token statistics over the labelled training pairs. Final slot meaning:
#   [0] number of pairs containing the token
#   [1] slot 0 divided by the total number of pairs
#   [2] fraction of its pairs that are (one side only, label 0) or (both sides, label 1)
#   [3] number of pairs where it is on exactly one side (left as a raw count)
#   [4] fraction of the one-sided pairs that have label 0
#   [5] fraction of its pairs where it is on both sides
#   [6] fraction of the both-sided pairs that have label 1
words_power = {}
for i in train.index:
    label = int(train.loc[i, 'label'])
    q1_words = train.loc[i, 'q1'].lower().split()
    q2_words = train.loc[i, 'q2'].lower().split()
    all_words = set(q1_words + q2_words)
    q1_words = set(q1_words)
    q2_words = set(q2_words)
    for word in all_words:
        if word not in words_power:
            words_power[word] = [0.] * 7
        stats = words_power[word]
        stats[0] += 1.
        stats[1] += 1.
        in_q1 = word in q1_words
        in_q2 = word in q2_words
        if in_q1 != in_q2:
            stats[3] += 1.
            if label == 0:
                stats[2] += 1.
                stats[4] += 1.
        if in_q1 and in_q2:
            stats[5] += 1.
            if label == 1:
                stats[2] += 1.
                stats[6] += 1.
for word in words_power:
    stats = words_power[word]
    stats[1] /= train.shape[0]
    stats[2] /= stats[0]
    if stats[3] > 1e-6:
        stats[4] /= stats[3]
    # Slot 6 is consumed as a rate (threshold 0.3, `1.0 - rate`), so it has to
    # be normalised too; this must happen while slot 5 is still a count.
    if stats[5] > 1e-6:
        stats[6] /= stats[5]
    stats[5] /= stats[0]
sorted_words_power = sorted(words_power.items(), key=lambda d: d[1][0], reverse=True)

### 预测有双侧影响力的字

In [287]:
# Select "double-side" tokens: slot0 * slot5 of the statistics must reach
# thresh_num and slot 6 must reach thresh_rate. Then flag, for every pair,
# which of those tokens occur in both questions.
thresh_num, thresh_rate = 7, 0.3
pword = [entry for entry in sorted_words_power if entry[1][0] * entry[1][5] >= thresh_num]
pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True)
pword_dside = [token for token, stats in pword_sort if stats[6] >= thresh_rate]
merge = train[['q1', 'q2']]
pword_dside_tags = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    shared = q1_words & q2_words
    pword_dside_tags.append([1.0 if token in shared else 0.0 for token in pword_dside])

### 预测有单侧影响力的字

In [290]:
# Select "one-side" tokens and flag, for every pair, which of them occur in
# exactly one of the two questions.
thresh_num, thresh_rate = 7, 0.3
# Support is measured on the one-sided pairs themselves: slot 3 of the
# statistics is the raw count of pairs where the token is on exactly one side
# (it is never divided in the statistics cell). The previous filter used the
# both-sides slot, which says nothing about how reliable slot 4 is.
pword = [entry for entry in sorted_words_power if entry[1][3] >= thresh_num]
pword_oside = [token for token, stats in pword if stats[4] > thresh_rate]
merge = train[['q1', 'q2']]
pword_oside_tags = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    # Symmetric difference: "one side" covers q1-only and q2-only alike,
    # matching how the statistics count one-sided pairs.
    one_sided = q1_words ^ q2_words
    pword_oside_tags.append([1.0 if token in one_sided else 0.0 for token in pword_oside])

### 计算单字的双侧影响力

In [291]:
# Per pair: 1 - prod(1 - slot6) over the selected double-side tokens that the
# two questions share (0 when they share none of them).
num_least = 300  # not read anywhere in the visible code
merge = train[['q1', 'q2']]
words_power = dict(sorted_words_power)
dside_lookup = set(pword_dside)  # O(1) membership instead of scanning the list per token
pword_dside_rate = []
for i in merge.index:
    rate = 1.0
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    for word in q1_words & q2_words:
        if word in dside_lookup:
            rate *= (1.0 - words_power[word][6])
    pword_dside_rate.append(1 - rate)

### 计算单字的单侧影响力

In [292]:
# Per pair: 1 - prod(1 - slot4) over the selected one-side tokens that occur
# in exactly one of the two questions (0 when there are none).
num_least = 300  # not read anywhere in the visible code
merge = train[['q1', 'q2']]
words_power = dict(sorted_words_power)
oside_lookup = set(pword_oside)  # O(1) membership instead of scanning the list per token
pword_oside_rate = []
for i in merge.index:
    rate = 1.0
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    all_diff = q1_words ^ q2_words  # tokens on exactly one side
    for word in all_diff:
        if word in oside_lookup:
            rate *= (1.0 - words_power[word][4])
    pword_oside_rate.append(1 - rate)

### 计算可编辑距离

In [293]:
def edit_distance(q1, q2, swap_cost=0):
    """Return the token-level edit distance between two sentences.

    Both sentences are split on single spaces. Insertions, deletions and
    substitutions cost 1 each. Two adjacent tokens that appear swapped in the
    other sentence cost ``swap_cost``.

    The swap check used to sit outside the inner loop, so it was evaluated
    only for the last column of every row; it is now applied to every cell.

    Args:
        q1: First sentence, tokens separated by single spaces.
        q2: Second sentence, tokens separated by single spaces.
        swap_cost: Cost of one adjacent transposition. The default of 0 keeps
            the value the original code hard-coded; pass 1 to count a swap
            as a single edit.

    Returns:
        The minimal total cost of turning the tokens of ``q1`` into ``q2``.
    """
    str1 = q1.split(' ')
    str2 = q2.split(' ')
    rows, cols = len(str1), len(str2)
    # Row 0 and column 0 hold the cost of building a prefix from nothing;
    # the interior values are overwritten below.
    matrix = [[i + j for j in range(cols + 1)] for i in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            d = 0 if str1[i - 1] == str2[j - 1] else 1
            best = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d)
            if i > 1 and j > 1 and str1[i - 1] == str2[j - 2] and str1[i - 2] == str2[j - 1]:
                best = min(best, matrix[i - 2][j - 2] + swap_cost)
            matrix[i][j] = best
    return matrix[rows][cols]

In [294]:
# Edit distance of every pair, normalised by the token count of the longer
# question. The per-row maximum is computed once; it used to be rebuilt from
# the two full length arrays for every single row.
q1_len = merge['q1'].apply(lambda x: len(x.split(' '))).values
q2_len = merge['q2'].apply(lambda x: len(x.split(' '))).values
longest_len = np.maximum(q1_len, q2_len)
dist = [edit_distance(merge.loc[i, 'q1'], merge.loc[i, 'q2']) / longest_len[i] for i in merge.index]

### 合并提取的字特征

In [295]:
# Wrap each per-pair feature sequence in a single-column DataFrame so the
# following cell can combine them on the row index.
adjusted_common_word_ratio = pd.DataFrame(adjusted_common_word_ratio)
# NOTE(review): this rebinds the name `edit_distance` from the function to a
# DataFrame; the function cannot be called again until it is redefined.
edit_distance = pd.DataFrame(dist)
len_diff = pd.DataFrame(len_diff)
pword_dside_rate = pd.DataFrame(pword_dside_rate)
pword_oside_rate = pd.DataFrame(pword_oside_rate)

In [296]:
# Put the five single-column feature frames side by side, aligned on the row
# index and keeping only rows present in all of them (what the chained inner
# merges did). Every frame's only column is labelled 0, so the chained
# pd.merge calls produced the suffixed label '0_x' twice, which recent pandas
# versions reject; concat has no such restriction. The columns are renamed
# in the next cell.
train_features = pd.concat(
    [adjusted_common_word_ratio, edit_distance, len_diff, pword_dside_rate, pword_oside_rate],
    axis=1,
    join='inner',
)

In [297]:
# Label the five feature columns in the order they were combined.
train_features.columns = ['adjusted_common_word_ratio','edit_distance','len_diff','pword_dside_rate','pword_oside_rate']

In [298]:
# Persist the features of this section; the row index is written as the first
# CSV column and read back later with index_col=0.
train_features.to_csv('output/train_feature_words.csv')

In [299]:
# Drop this section's intermediates and run a garbage-collection pass.
# Note that `edit_distance` is among the deleted names.
del train, merge, adjusted_common_word_ratio, edit_distance, len_diff, pword_dside_rate, pword_oside_rate
gc.collect()

98

## 提取词特征

### 数据预处理

In [12]:
# Resolve both question ids of every training pair to their `chars` string
# (*_x is the q1 join, *_y the q2 join).
train = (
    pd.DataFrame(train_master)
    .merge(question, how='left', left_on=['q1'], right_on=['qid'])
    .merge(question, how='left', left_on=['q2'], right_on=['qid'])
)
train = train[['label', 'chars_x', 'chars_y']]
train.columns = ['label', 'q1', 'q2']

In [13]:
# Index the char embedding table by token id and build both lookup
# directions between a token and its row position.
char_embed.index = char_embed[0]
char = char_embed.index.values
char_to_index = {token: pos for pos, token in enumerate(char)}
index_to_char = dict(enumerate(char))

In [14]:
# Raw char-token strings of both sides plus the sizes used by the index matrices.
train_q1, train_q2 = train.q1.values, train.q2.values
train_shape_q1, train_shape_q2 = train_q1.shape[0], train_q2.shape[0]
max_len, embed_dim = 20, 300

In [15]:
# Fixed-width matrix of char embedding indices for q1: sentences are cut at
# max_len tokens, shorter ones keep trailing zeros.
train_q1_indices = np.zeros((train_shape_q1, max_len))
for row, sentence in zip(range(train_shape_q1), train_q1):
    for col, token in enumerate(sentence.split(' ')[:max_len]):
        train_q1_indices[row, col] = char_to_index[token]

In [13]:
# Fixed-width matrix of char embedding indices for q2. The matrix is sized
# with the q2 row count, the same bound the loop uses (it was previously
# allocated with the q1 count).
train_q2_indices = np.zeros((train_shape_q2, max_len))
for i in range(train_shape_q2):
    sentence_chars_q2 = train_q2[i].split(' ')
    for j, w in enumerate(sentence_chars_q2[:max_len]):
        train_q2_indices[i, j] = char_to_index[w]

### 计算q1，q2长度差异

In [14]:
# Relative token-count difference per pair, scaled by the longer question.
merge = train[['q1', 'q2']]
q1_len = merge['q1'].map(lambda text: len(text.split(' '))).values
q2_len = merge['q2'].map(lambda text: len(text.split(' '))).values
len_diff = np.abs(q1_len - q2_len) / np.maximum(q1_len, q2_len)

### 计算q1，q2中相同的词的个数

In [15]:
# Number of distinct tokens shared by the two questions of each pair.
q1_word_set = merge['q1'].map(lambda text: set(text.split(' '))).values
q2_word_set = merge['q2'].map(lambda text: set(text.split(' '))).values
result = pd.DataFrame([len(left & right) for left, right in zip(q1_word_set, q2_word_set)])
result.columns = ['num_common_words']

### 计算共现词比例

In [16]:
# Share of distinct common tokens relative to the longer question of the pair.
ratio = [
    len(left & right) / max(n_left, n_right)
    for left, right, n_left, n_right in zip(q1_word_set, q2_word_set, q1_len, q2_len)
]
ratio = pd.DataFrame(ratio)
# Name the ratio frame itself rather than overwriting the label of `result`.
ratio.columns = ['common_word_ratio']

### 计算tf-idf词向量

In [17]:
# Fit TF-IDF on the char strings: `merge` holds the `chars` columns in this
# section, so a vectorizer fitted on `question.words` would know none of the
# tokens it is asked to transform and return all-zero rows.
# (The former bare `list(...)` / `sorted(...)` statements were dropped: their
# results were discarded.)
vectorizer = TfidfVectorizer().fit(question.chars.values)
q1_tfidf = vectorizer.transform(merge.q1.values)
q2_tfidf = vectorizer.transform(merge.q2.values)

### 根据tf-idf系数调整共现词比例

In [18]:
# TF-IDF-weighted share of the tokens common to q1 and q2, one value per pair
# (shared weight on both sides divided by total weight on both sides; a
# token's weight is its in-sentence count times its TF-IDF value).
#
# Columns are resolved through the fitted vectorizer's vocabulary instead of
# `char_to_index`, which numbers rows of the embedding table, not TF-IDF
# columns.
adjusted_common_word_ratio = []
tfidf_vocab = vectorizer.vocabulary_
fold_case = vectorizer.lowercase  # vocabulary keys are lower-cased when this is set
for i in range(0, q1_tfidf.shape[0]):
    q1words = {}
    q2words = {}
    for tok in merge.loc[i, 'q1'].split():
        q1words[tok] = q1words.get(tok, 0) + 1
    for tok in merge.loc[i, 'q2'].split():
        q2words[tok] = q2words.get(tok, 0) + 1

    q1_row = q1_tfidf[i]
    q2_row = q2_tfidf[i]

    sum1 = 0.0
    sum_shared_word_in_q1 = 0.0
    for w, n in q1words.items():
        col = tfidf_vocab.get(w.lower() if fold_case else w)
        if col is None:
            continue  # token unknown to the vectorizer: contributes no weight
        weighted = n * q1_row[0, col]
        sum1 += weighted
        if w in q2words:
            sum_shared_word_in_q1 += weighted

    sum2 = 0.0
    sum_shared_word_in_q2 = 0.0
    for w, n in q2words.items():
        col = tfidf_vocab.get(w.lower() if fold_case else w)
        if col is None:
            continue
        weighted = n * q2_row[0, col]
        sum2 += weighted
        if w in q1words:
            sum_shared_word_in_q2 += weighted

    sum_tol = sum1 + sum2
    if sum_tol < 1e-6:
        adjusted_common_word_ratio.append(0.0)
    else:
        adjusted_common_word_ratio.append(1.0 * (sum_shared_word_in_q1 + sum_shared_word_in_q2) / sum_tol)

### 计算词影响力

In [19]:
# Per-token statistics over the labelled training pairs. Final slot meaning:
#   [0] number of pairs containing the token
#   [1] slot 0 divided by the total number of pairs
#   [2] fraction of its pairs that are (one side only, label 0) or (both sides, label 1)
#   [3] number of pairs where it is on exactly one side (left as a raw count)
#   [4] fraction of the one-sided pairs that have label 0
#   [5] fraction of its pairs where it is on both sides
#   [6] fraction of the both-sided pairs that have label 1
words_power = {}
for i in train.index:
    label = int(train.loc[i, 'label'])
    q1_words = train.loc[i, 'q1'].lower().split()
    q2_words = train.loc[i, 'q2'].lower().split()
    all_words = set(q1_words + q2_words)
    q1_words = set(q1_words)
    q2_words = set(q2_words)
    for word in all_words:
        if word not in words_power:
            words_power[word] = [0.] * 7
        stats = words_power[word]
        stats[0] += 1.
        stats[1] += 1.
        in_q1 = word in q1_words
        in_q2 = word in q2_words
        if in_q1 != in_q2:
            stats[3] += 1.
            if label == 0:
                stats[2] += 1.
                stats[4] += 1.
        if in_q1 and in_q2:
            stats[5] += 1.
            if label == 1:
                stats[2] += 1.
                stats[6] += 1.
for word in words_power:
    stats = words_power[word]
    stats[1] /= train.shape[0]
    stats[2] /= stats[0]
    if stats[3] > 1e-6:
        stats[4] /= stats[3]
    # Slot 6 is consumed as a rate downstream, so normalise it while slot 5
    # is still a count.
    if stats[5] > 1e-6:
        stats[6] /= stats[5]
    stats[5] /= stats[0]
sorted_words_power = sorted(words_power.items(), key=lambda d: d[1][0], reverse=True)

### 预测有双侧影响力的词

In [20]:
# Pick the double-side tokens (slot0 * slot5 >= thresh_num and
# slot6 >= thresh_rate), ordered by slot 6 descending, then mark per pair
# which of them appear in both questions.
thresh_num, thresh_rate = 7, 0.3
pword = [entry for entry in sorted_words_power if entry[1][0] * entry[1][5] >= thresh_num]
pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True)
pword_dside = [token for token, stats in pword_sort if stats[6] >= thresh_rate]
merge = train[['q1', 'q2']]
pword_dside_tags = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    shared = q1_words & q2_words
    pword_dside_tags.append([1.0 if token in shared else 0.0 for token in pword_dside])

### 预测有单侧影响力的词

In [21]:
# Select "one-side" tokens and flag, for every pair, which of them occur in
# exactly one of the two questions.
thresh_num, thresh_rate = 7, 0.3
# Support is measured on the one-sided pairs: slot 3 is the raw count of
# pairs where the token is on exactly one side (never divided in the
# statistics cell). The previous filter used the both-sides slot.
pword = [entry for entry in sorted_words_power if entry[1][3] >= thresh_num]
pword_oside = [token for token, stats in pword if stats[4] > thresh_rate]
merge = train[['q1', 'q2']]
pword_oside_tags = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    # Symmetric difference: q1-only and q2-only both count as "one side".
    one_sided = q1_words ^ q2_words
    pword_oside_tags.append([1.0 if token in one_sided else 0.0 for token in pword_oside])

### 计算词的双侧影响力

In [22]:
# Per pair: 1 - prod(1 - slot6) over the selected double-side tokens shared
# by both questions (0 when there are none).
num_least = 300  # not read anywhere in the visible code
merge = train[['q1', 'q2']]
words_power = dict(sorted_words_power)
dside_lookup = set(pword_dside)  # O(1) membership instead of a list scan per token
pword_dside_rate = []
for i in merge.index:
    rate = 1.0
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    for word in q1_words & q2_words:
        if word in dside_lookup:
            rate *= (1.0 - words_power[word][6])
    pword_dside_rate.append(1 - rate)

### 预测词的单侧影响力

In [23]:
# Per pair: 1 - prod(1 - slot4) over the selected one-side tokens found in
# exactly one of the two questions (0 when there are none).
num_least = 300  # not read anywhere in the visible code
merge = train[['q1', 'q2']]
words_power = dict(sorted_words_power)
oside_lookup = set(pword_oside)  # O(1) membership instead of a list scan per token
pword_oside_rate = []
for i in merge.index:
    rate = 1.0
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    all_diff = q1_words ^ q2_words  # tokens on exactly one side
    for word in all_diff:
        if word in oside_lookup:
            rate *= (1.0 - words_power[word][4])
    pword_oside_rate.append(1 - rate)

### 合并提取的词特征

In [24]:
# Wrap this section's three per-pair feature lists in single-column
# DataFrames under char-specific names (the lists themselves still carry the
# `*_word_*` / `pword_*` names reused from the previous section).
adjusted_common_char_ratio = pd.DataFrame(adjusted_common_word_ratio)
pchar_dside_rate = pd.DataFrame(pword_dside_rate)
pchar_oside_rate = pd.DataFrame(pword_oside_rate)

In [35]:
# Reload the features saved by the previous section and append this section's
# three columns, joining on the row index one frame at a time.
train_features = pd.DataFrame(pd.read_csv('output/train_feature_words.csv', index_col=0))
for extra in (adjusted_common_char_ratio, pchar_dside_rate, pchar_oside_rate):
    train_features = train_features.merge(extra, left_index=True, right_index=True)

In [37]:
# Label all eight columns: the five reloaded ones followed by the three
# appended in the previous cell, in that order.
train_features.columns = ['adjusted_common_word_ratio','edit_distance','len_diff',
                          'pword_dside_rate','pword_oside_rate','adjusted_common_char_ratio','pchar_dside_rate','pchar_oside_rate']

In [38]:
# Persist the combined training feature table (row index included).
train_features.to_csv('output/train_feature.csv')

## 提取test集合字特征

### 数据预处理

In [4]:
# Resolve both question ids of every test pair to their `words` string
# (*_x is the q1 join, *_y the q2 join). Test pairs carry no label.
test = (
    pd.DataFrame(test_master)
    .merge(question, how='left', left_on=['q1'], right_on=['qid'])
    .merge(question, how='left', left_on=['q2'], right_on=['qid'])
)
test = test[['words_x', 'words_y']]
test.columns = ['q1', 'q2']

In [7]:
# Raw token strings of both sides plus the sizes used by the index matrices.
test_q1, test_q2 = test.q1.values, test.q2.values
test_shape_q1, test_shape_q2 = test_q1.shape[0], test_q2.shape[0]
max_len, embed_dim = 20, 300

In [16]:
# Index the word embedding table by token id and build both lookup
# directions between a token and its row position.
word_embed.index = word_embed[0]
word = word_embed.index.values
word_to_index = {token: pos for pos, token in enumerate(word)}
index_to_word = dict(enumerate(word))

In [17]:
# Fixed-width matrix of word embedding indices for the q1 side of the test
# pairs: cut at max_len tokens, trailing zeros for shorter sentences.
test_q1_indices = np.zeros((test_shape_q1, max_len))
for row, sentence in zip(range(test_shape_q1), test_q1):
    for col, token in enumerate(sentence.split(' ')[:max_len]):
        test_q1_indices[row, col] = word_to_index[token]

In [18]:
# Same fixed-width index encoding for the q2 side of the test pairs.
test_q2_indices = np.zeros((test_shape_q2, max_len))
for row, sentence in zip(range(test_shape_q2), test_q2):
    for col, token in enumerate(sentence.split(' ')[:max_len]):
        test_q2_indices[row, col] = word_to_index[token]

### 计算q1，q2长度差异

In [19]:
# Relative token-count difference per test pair, scaled by the longer question.
merge = test[['q1', 'q2']]
q1_len = merge['q1'].map(lambda text: len(text.split(' '))).values
q2_len = merge['q2'].map(lambda text: len(text.split(' '))).values
len_diff = np.abs(q1_len - q2_len) / np.maximum(q1_len, q2_len)

### 计算q1，q2中相同的字的个数

In [20]:
# Number of distinct tokens shared by the two questions of each test pair.
q1_word_set = merge['q1'].map(lambda text: set(text.split(' '))).values
q2_word_set = merge['q2'].map(lambda text: set(text.split(' '))).values
result = pd.DataFrame([len(left & right) for left, right in zip(q1_word_set, q2_word_set)])
result.columns = ['num_common_words']

### 计算共现字比例

In [21]:
# Share of distinct common tokens relative to the longer question of the pair.
ratio = [
    len(left & right) / max(n_left, n_right)
    for left, right, n_left, n_right in zip(q1_word_set, q2_word_set, q1_len, q2_len)
]
ratio = pd.DataFrame(ratio)
# Name the ratio frame itself rather than overwriting the label of `result`.
ratio.columns = ['common_word_ratio']

### 计算tf-idf字向量

In [22]:
# Fit TF-IDF on every question's token string, then project both sides of the
# test pairs. (The former bare `list(...)` / `sorted(...)` statements were
# dropped: their results were discarded.)
vectorizer = TfidfVectorizer().fit(question.words.values)
q1_tfidf = vectorizer.transform(merge.q1.values)
q2_tfidf = vectorizer.transform(merge.q2.values)

### 根据tf-idf系数调整共现词比例

In [23]:
# TF-IDF-weighted share of the tokens common to q1 and q2, one value per test
# pair (shared weight on both sides divided by total weight on both sides; a
# token's weight is its in-sentence count times its TF-IDF value).
#
# Columns are resolved through the fitted vectorizer's vocabulary instead of
# `word_to_index`, which numbers rows of the embedding table, not TF-IDF
# columns.
adjusted_common_word_ratio = []
count = 0  # kept from the original cell; not read in the visible code
tfidf_vocab = vectorizer.vocabulary_
fold_case = vectorizer.lowercase  # vocabulary keys are lower-cased when this is set
for i in range(0, q1_tfidf.shape[0]):
    q1words = {}
    q2words = {}
    for tok in merge.loc[i, 'q1'].split():
        q1words[tok] = q1words.get(tok, 0) + 1
    for tok in merge.loc[i, 'q2'].split():
        q2words[tok] = q2words.get(tok, 0) + 1

    q1_row = q1_tfidf[i]
    q2_row = q2_tfidf[i]

    sum1 = 0.0
    sum_shared_word_in_q1 = 0.0
    for w, n in q1words.items():
        col = tfidf_vocab.get(w.lower() if fold_case else w)
        if col is None:
            continue  # token unknown to the vectorizer: contributes no weight
        weighted = n * q1_row[0, col]
        sum1 += weighted
        if w in q2words:
            sum_shared_word_in_q1 += weighted

    sum2 = 0.0
    sum_shared_word_in_q2 = 0.0
    for w, n in q2words.items():
        col = tfidf_vocab.get(w.lower() if fold_case else w)
        if col is None:
            continue
        weighted = n * q2_row[0, col]
        sum2 += weighted
        if w in q1words:
            sum_shared_word_in_q2 += weighted

    sum_tol = sum1 + sum2
    if sum_tol < 1e-6:
        adjusted_common_word_ratio.append(0.0)
    else:
        adjusted_common_word_ratio.append(1.0 * (sum_shared_word_in_q1 + sum_shared_word_in_q2) / sum_tol)

### 计算字影响力

In [24]:
# Per-token statistics, learned from the labelled `train` pairs (test pairs
# have no label). Final slot meaning:
#   [0] number of pairs containing the token
#   [1] slot 0 divided by the total number of pairs
#   [2] fraction of its pairs that are (one side only, label 0) or (both sides, label 1)
#   [3] number of pairs where it is on exactly one side (left as a raw count)
#   [4] fraction of the one-sided pairs that have label 0
#   [5] fraction of its pairs where it is on both sides
#   [6] fraction of the both-sided pairs that have label 1
# NOTE(review): this reads whatever `train` currently holds. If the
# char-level section ran last, `train` contains char tokens while `test`
# contains word tokens - confirm `train` was rebuilt from the `words_*`
# columns before running this cell.
words_power = {}
for i in train.index:
    label = int(train.loc[i, 'label'])
    q1_words = train.loc[i, 'q1'].lower().split()
    q2_words = train.loc[i, 'q2'].lower().split()
    all_words = set(q1_words + q2_words)
    q1_words = set(q1_words)
    q2_words = set(q2_words)
    for word in all_words:
        if word not in words_power:
            words_power[word] = [0.] * 7
        stats = words_power[word]
        stats[0] += 1.
        stats[1] += 1.
        in_q1 = word in q1_words
        in_q2 = word in q2_words
        if in_q1 != in_q2:
            stats[3] += 1.
            if label == 0:
                stats[2] += 1.
                stats[4] += 1.
        if in_q1 and in_q2:
            stats[5] += 1.
            if label == 1:
                stats[2] += 1.
                stats[6] += 1.
for word in words_power:
    stats = words_power[word]
    stats[1] /= train.shape[0]
    stats[2] /= stats[0]
    if stats[3] > 1e-6:
        stats[4] /= stats[3]
    # Slot 6 is consumed as a rate downstream, so normalise it while slot 5
    # is still a count.
    if stats[5] > 1e-6:
        stats[6] /= stats[5]
    stats[5] /= stats[0]
sorted_words_power = sorted(words_power.items(), key=lambda d: d[1][0], reverse=True)

### 预测具有双侧影响力的字

In [25]:
# Pick the double-side tokens (slot0 * slot5 >= thresh_num and
# slot6 >= thresh_rate), then mark per TEST pair which of them appear in both
# questions. This section builds test features, so the pairs come from
# `test`; the cell previously iterated over `train`.
thresh_num, thresh_rate = 7, 0.3
pword = [entry for entry in sorted_words_power if entry[1][0] * entry[1][5] >= thresh_num]
pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True)
pword_dside = [token for token, stats in pword_sort if stats[6] >= thresh_rate]
merge = test[['q1', 'q2']]
pword_dside_tags = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    shared = q1_words & q2_words
    pword_dside_tags.append([1.0 if token in shared else 0.0 for token in pword_dside])

### 预测具有单侧影响力的字

In [27]:
# Select "one-side" tokens and flag, for every TEST pair, which of them occur
# in exactly one of the two questions. The pairs come from `test`; the cell
# previously iterated over `train`.
thresh_num, thresh_rate = 7, 0.3
# Support is measured on the one-sided pairs: slot 3 is the raw count of
# pairs where the token is on exactly one side (never divided in the
# statistics cell). The previous filter used the both-sides slot.
pword = [entry for entry in sorted_words_power if entry[1][3] >= thresh_num]
pword_oside = [token for token, stats in pword if stats[4] > thresh_rate]
merge = test[['q1', 'q2']]
pword_oside_tags = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    # Symmetric difference: q1-only and q2-only both count as "one side".
    one_sided = q1_words ^ q2_words
    pword_oside_tags.append([1.0 if token in one_sided else 0.0 for token in pword_oside])

### 计算字的双侧影响力

In [28]:
# Per TEST pair: 1 - prod(1 - slot6) over the selected double-side tokens
# shared by both questions (0 when there are none). The pairs come from
# `test`; iterating `train` here produced training-row values that were then
# written into the test feature file.
num_least = 300  # not read anywhere in the visible code
merge = test[['q1', 'q2']]
words_power = dict(sorted_words_power)
dside_lookup = set(pword_dside)  # O(1) membership instead of a list scan per token
pword_dside_rate = []
for i in merge.index:
    rate = 1.0
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    for word in q1_words & q2_words:
        if word in dside_lookup:
            rate *= (1.0 - words_power[word][6])
    pword_dside_rate.append(1 - rate)

### 计算字的单侧影响力

In [29]:
# Per TEST pair: 1 - prod(1 - slot4) over the selected one-side tokens found
# in exactly one of the two questions (0 when there are none). The pairs come
# from `test`; the cell previously iterated over `train`.
num_least = 300  # not read anywhere in the visible code
merge = test[['q1', 'q2']]
words_power = dict(sorted_words_power)
oside_lookup = set(pword_oside)  # O(1) membership instead of a list scan per token
pword_oside_rate = []
for i in merge.index:
    rate = 1.0
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    all_diff = q1_words ^ q2_words  # tokens on exactly one side
    for word in all_diff:
        if word in oside_lookup:
            rate *= (1.0 - words_power[word][4])
    pword_oside_rate.append(1 - rate)

### 计算可编辑距离

In [31]:
def edit_distance(q1, q2):
    """Return the token-level edit distance between two strings.

    Both inputs are split on single spaces and compared token by token.
    Insertions, deletions and substitutions cost 1 each.  Swapping two
    adjacent tokens is checked for every cell of the table, not just the
    last column of each row.

    NOTE(review): an adjacent swap is charged 0, as in the original code;
    confirm that a free transposition (rather than cost 1) is intended.

    Args:
        q1: first space-separated token string.
        q2: second space-separated token string.

    Returns:
        The integer distance between the two token sequences.
    """
    str1 = q1.split(' ')
    str2 = q2.split(' ')
    rows, cols = len(str1), len(str2)
    # Row 0 and column 0 hold the cost of building a prefix from nothing;
    # every interior cell is overwritten below.
    matrix = [[i + j for j in range(cols + 1)] for i in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            d = 0 if str1[i - 1] == str2[j - 1] else 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1,      # deletion
                matrix[i][j - 1] + 1,      # insertion
                matrix[i - 1][j - 1] + d,  # match / substitution
            )
            swapped = (
                i > 1 and j > 1
                and str1[i - 1] == str2[j - 2]
                and str1[i - 2] == str2[j - 1]
            )
            if swapped:
                matrix[i][j] = min(matrix[i][j], matrix[i - 2][j - 2])
    return matrix[rows][cols]

In [32]:
# Edit distance of every pair, normalised by the token count of the longer
# question.
q1_len = merge['q1'].apply(lambda x: len(x.split(' '))).values
q2_len = merge['q2'].apply(lambda x: len(x.split(' '))).values
# Computed once; recomputing the element-wise maximum inside the
# comprehension made the cell quadratic in the number of rows.
longer_len = np.maximum(q1_len, q2_len)
dist = [edit_distance(merge.loc[i, 'q1'], merge.loc[i, 'q2']) / longer_len[i]
        for i in merge.index]

### 合并提取的字特征

In [33]:
# Wrap each per-pair feature sequence in a single-column DataFrame so the
# frames can be joined on their row index.
adjusted_common_word_ratio, len_diff, pword_dside_rate, pword_oside_rate = (
    pd.DataFrame(values)
    for values in (adjusted_common_word_ratio, len_diff, pword_dside_rate, pword_oside_rate)
)
# NOTE: this rebinds the name `edit_distance` from the function to a
# DataFrame, so the function is no longer callable after this cell runs.
edit_distance = pd.DataFrame(dist)

In [34]:
# Align the five single-column feature frames on their row index.  An inner
# concat keeps the rows common to all frames, as the chained index merges did.
# It avoids the '_x'/'_y' suffix renaming, which yields duplicate labels when
# several frames share the column label 0.
test_features = pd.concat(
    [adjusted_common_word_ratio, edit_distance, len_diff, pword_dside_rate, pword_oside_rate],
    axis=1,
    join='inner',
)

In [35]:
# Label the five feature columns positionally.
test_features.columns = ['adjusted_common_word_ratio','edit_distance','len_diff','pword_dside_rate','pword_oside_rate']

In [36]:
# Persist the feature table to CSV; a later cell reads it back with index_col=0.
test_features.to_csv('output/test_feature_words.csv')

## 提取test集合字特征

### 数据预处理

In [37]:
# Attach the question table twice: once keyed by q1 and once keyed by q2.
# Columns that clash in the second join get the '_x' / '_y' suffixes.
test = (
    pd.DataFrame(test_master)
    .merge(question, left_on=['q1'], right_on=['qid'], how='left')
    .merge(question, left_on=['q2'], right_on=['qid'], how='left')
)

In [38]:
# Keep only the two character-sequence columns and call them q1 / q2.
test = test[['chars_x', 'chars_y']].rename(columns={'chars_x': 'q1', 'chars_y': 'q2'})

In [39]:
# Raw numpy views of both question columns plus their row counts.
test_q1, test_q2 = test['q1'].values, test['q2'].values
test_shape_q1, test_shape_q2 = len(test_q1), len(test_q2)

In [40]:
# Encode each q1 as a row of char_to_index values, zero-padded on the right
# and cut off after max_len tokens.
test_q1_indices = np.zeros((test_shape_q1, max_len))
for i in range(test_shape_q1):
    sentence_chars_q1 = test_q1[i].split(' ')
    for j, w in enumerate(sentence_chars_q1[:max_len]):
        test_q1_indices[i, j] = char_to_index[w]

In [41]:
# Encode each q2 as a row of char_to_index values, zero-padded on the right
# and cut off after max_len tokens.
test_q2_indices = np.zeros((test_shape_q2, max_len))
for i in range(test_shape_q2):
    sentence_chars_q2 = test_q2[i].split(' ')
    # Iterate over this row's q2 tokens, not the q1 list left over from the
    # previous cell.
    for j, w in enumerate(sentence_chars_q2):
        if j >= max_len:
            break
        test_q2_indices[i, j] = char_to_index[w]

### 计算q1，q2长度差异

In [42]:
# Relative length gap: |len(q1) - len(q2)| divided by the longer length,
# with lengths counted in space-separated tokens.
merge = test[['q1', 'q2']]
q1_len = merge['q1'].apply(lambda text: len(text.split(' '))).values
q2_len = merge['q2'].apply(lambda text: len(text.split(' '))).values
len_diff = np.abs(q1_len - q2_len) / np.maximum(q1_len, q2_len)

### 计算q1，q2中相同的词的个数

In [43]:
# Number of distinct tokens the two questions of each pair have in common.
q1_word_set = merge['q1'].apply(lambda text: set(text.split(' '))).values
q2_word_set = merge['q2'].apply(lambda text: set(text.split(' '))).values
result = pd.DataFrame(
    [len(left & right) for left, right in zip(q1_word_set, q2_word_set)],
    columns=['num_common_words'],
)

### 计算共现词比例

In [44]:
# Share of common distinct tokens relative to the longer question.
ratio = [len(q1_word_set[i] & q2_word_set[i]) / max(q1_len[i], q2_len[i])
         for i in range(len(q1_word_set))]
ratio = pd.DataFrame(ratio)
# Name the column of `ratio` itself; the original renamed `result` by mistake.
ratio.columns = ['common_word_ratio']

### 计算tf-idf词向量

In [45]:
# Fit tf-idf weights on the question corpus and project both question columns.
# NOTE(review): the vectorizer is fitted on `question.words`, while `merge`
# holds the `chars` columns in this section; confirm the fit corpus.
vectorizer = TfidfVectorizer().fit(question.words.values)
q1_tfidf = vectorizer.transform(merge.q1.values)
q2_tfidf = vectorizer.transform(merge.q2.values)

### 根据tf-idf系数调整共现词比例

In [46]:
# tf-idf-weighted overlap per pair: the weighted mass of tokens present in
# both questions divided by the weighted mass of all tokens (0.0 when the
# total mass is below 1e-6).
# NOTE(review): tf-idf columns are addressed through char_to_index rather
# than vectorizer.vocabulary_; confirm the two mappings agree.
adjusted_common_word_ratio = []
for i in range(q1_tfidf.shape[0]):
    q1words, q2words = {}, {}
    for token in merge.loc[i, 'q1'].split():
        q1words[token] = q1words.get(token, 0) + 1
    for token in merge.loc[i, 'q2'].split():
        q2words[token] = q2words.get(token, 0) + 1

    # Occurrence count times tf-idf weight, per distinct token.
    q1_mass = {w: n * q1_tfidf[i, char_to_index[w]] for w, n in q1words.items()}
    q2_mass = {w: n * q2_tfidf[i, char_to_index[w]] for w, n in q2words.items()}

    sum_shared_word_in_q1 = sum([q1_mass[w] for w in q1words if w in q2words])
    sum_shared_word_in_q2 = sum([q2_mass[w] for w in q2words if w in q1words])
    sum_tol = sum(q1_mass[w] for w in q1words) + sum(q2_mass[w] for w in q2words)

    shared_share = (
        0.0 if sum_tol < 1e-6
        else 1.0 * (sum_shared_word_in_q1 + sum_shared_word_in_q2) / sum_tol
    )
    adjusted_common_word_ratio.append(shared_share)

### 计算词影响力

In [48]:
# Collect per-token statistics over the training pairs.  After the
# normalisation loop each token maps to a 7-slot list:
#   [0] number of pairs that contain the token
#   [1] share of all pairs that contain the token
#   [2] share of the token's pairs that are one-sided with label 0 or
#       two-sided with label 1
#   [3] number of pairs where the token is in exactly one question
#   [4] share of the one-sided pairs that have label 0
#   [5] share of the token's pairs where it is in both questions
#   [6] share of the two-sided pairs that have label 1
words_power = {}
for i in train.index:
    label = int(train.loc[i, 'label'])
    q1_words = train.loc[i, 'q1'].lower().split()
    q2_words = train.loc[i, 'q2'].lower().split()
    all_words = set(q1_words + q2_words)
    q1_words = set(q1_words)
    q2_words = set(q2_words)
    for word in all_words:
        stats = words_power.setdefault(word, [0.0] * 7)
        stats[0] += 1.
        stats[1] += 1.
        in_q1, in_q2 = word in q1_words, word in q2_words
        if in_q1 != in_q2:
            stats[3] += 1.
            if label == 0:
                stats[2] += 1.
                stats[4] += 1.
        if in_q1 and in_q2:
            stats[5] += 1.
            if label == 1:
                stats[2] += 1.
                stats[6] += 1.

n_pairs = train.shape[0]
for stats in words_power.values():
    stats[1] /= n_pairs
    stats[2] /= stats[0]
    if stats[3] > 1e-6:
        stats[4] /= stats[3]
    # Slot 6 is later compared with a rate threshold and used as a
    # probability, so it must become a rate.  Divide by the raw two-sided
    # count before slot 5 is turned into a share.
    if stats[5] > 1e-6:
        stats[6] /= stats[5]
    stats[5] /= stats[0]
sorted_words_power = sorted(words_power.items(), key=lambda d: d[1][0], reverse=True)

### 预测具有双侧影响力的词

In [49]:
# Select the tokens with "two-sided" influence (ordered by slot 6, highest
# first) and build, for every training pair, a 0/1 indicator vector marking
# which of them occur in both questions.
thresh_num, thresh_rate = 7, 0.3
pword = filter(lambda entry: entry[1][0] * entry[1][5] >= thresh_num, sorted_words_power)
pword_sort = sorted(pword, key=lambda entry: entry[1][6], reverse=True)
pword_dside = [token for token, stats in pword_sort if stats[6] >= thresh_rate]
merge = train[['q1', 'q2']]
pword_dside_tags = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    in_both = q1_words & q2_words
    pword_dside_tags.append([1.0 if token in in_both else 0.0 for token in pword_dside])

### 预测具有单侧影响力的词

In [50]:
# Select the tokens with "one-sided" influence and build, for every training
# pair, a 0/1 indicator vector over those tokens.
thresh_num, thresh_rate = 7, 0.3
# NOTE(review): this frequency filter reads slot 5, the same slot the
# two-sided selection uses; confirm that slot 3 (the one-sided occurrence
# count) was not intended here.
pword = filter(lambda item: item[1][0] * item[1][5] >= thresh_num, sorted_words_power)
pword_oside = [token for token, stats in pword if stats[4] > thresh_rate]
merge = train[['q1', 'q2']]
pword_oside_tags = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    # One-sided means the token occurs in exactly one of the two questions,
    # whichever side that is.
    tags = [1.0 if (word in q1_words) != (word in q2_words) else 0.0
            for word in pword_oside]
    pword_oside_tags.append(tags)

### 计算词的双侧影响力

In [51]:
# Two-sided influence per pair: 1 - prod(1 - p) over the selected tokens
# shared by both questions, where p is slot 6 of the token statistics.
# NOTE(review): this iterates the `train` pairs, yet the result is merged into
# `test_features` further down; confirm `test` was not intended.
num_least = 300  # not read in this cell
merge = train[['q1', 'q2']]
words_power = dict(sorted_words_power)
# Build the lookup once; testing membership against the list rescans it for
# every shared token of every row.
dside_tokens = set(pword_dside)
pword_dside_rate = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    rate = 1.0
    for word in q1_words & q2_words:
        if word in dside_tokens:
            rate *= 1.0 - words_power[word][6]
    pword_dside_rate.append(1 - rate)

### 计算词的单侧影响力

In [52]:
# One-sided influence per pair: 1 - prod(1 - p) over the selected tokens that
# occur in exactly one question, where p is slot 4 of the token statistics.
# NOTE(review): this iterates the `train` pairs, yet the result is merged into
# `test_features` further down; confirm `test` was not intended.
num_least = 300  # not read in this cell
merge = train[['q1', 'q2']]
words_power = dict(sorted_words_power)
# Build the lookup once; testing membership against the list rescans it for
# every differing token of every row.
oside_tokens = set(pword_oside)
pword_oside_rate = []
for i in merge.index:
    q1_words = set(merge.loc[i, 'q1'].lower().split())
    q2_words = set(merge.loc[i, 'q2'].lower().split())
    rate = 1.0
    # Symmetric difference: tokens present on one side only.
    for word in q1_words ^ q2_words:
        if word in oside_tokens:
            rate *= 1.0 - words_power[word][4]
    pword_oside_rate.append(1 - rate)

### 合并提取的词特征

In [53]:
# Wrap the three feature sequences of this section in single-column
# DataFrames under their final names.
adjusted_common_char_ratio, pchar_dside_rate, pchar_oside_rate = (
    pd.DataFrame(values)
    for values in (adjusted_common_word_ratio, pword_dside_rate, pword_oside_rate)
)

In [54]:
# Reload the previously saved feature table and append the three new feature
# frames, joining each one on the row index.
test_features = pd.DataFrame(pd.read_csv('output/test_feature_words.csv', index_col=0))
for extra in (adjusted_common_char_ratio, pchar_dside_rate, pchar_oside_rate):
    test_features = test_features.merge(extra, left_index=True, right_index=True)

In [55]:
# Label all eight feature columns positionally: the five loaded from CSV
# followed by the three merged in above.
test_features.columns = ['adjusted_common_word_ratio','edit_distance','len_diff',
                          'pword_dside_rate','pword_oside_rate','adjusted_common_char_ratio','pchar_dside_rate','pchar_oside_rate']

In [56]:
# Persist the combined feature table to CSV.
test_features.to_csv('output/test_feature.csv')

In [57]:
# Display the (rows, columns) size of the combined feature table.
test_features.shape

(172956, 8)