In [3]:
import pandas as pd
import numpy as np
import Levenshtein
import distance
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# Load the tab-separated training file. It is read with header=None, so no
# line is consumed as a header and the four columns are named explicitly.
df = pd.read_csv(
    './data/atec_nlp_sim_train_all.csv',
    sep='\t',
    header=None,
    encoding="utf-8",
)
df.columns = ["id", "sentence1", "sentence2", "label"]
# Echo the column labels as the cell output to confirm the renaming.
df.columns.values


array(['id', 'sentence1', 'sentence2', 'label'], dtype=object)

In [13]:
# Preview the first five rows of the freshly loaded frame (head() defaults to n=5).
df.head()

Unnamed: 0,id,sentence1,sentence2,label
0,1,怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


In [14]:
# Peek at the first five values of the sentence1 column (head() defaults to n=5).
df["sentence1"].head()

0         怎么更改花呗手机号码
1    也开不了花呗，就这样了？完事了
2        花呗冻结以后还能开通吗
3           如何得知关闭借呗
4             花呗扫码付钱
Name: sentence1, dtype: object

In [15]:
def edit_distance(s1, s2):
    """Return ``Levenshtein.distance(s1, s2)`` for the two strings.

    Thin wrapper around the python-Levenshtein package; per that package's
    documentation the result is the edit (Levenshtein) distance between the
    two strings.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        Whatever ``Levenshtein.distance`` returns for ``(s1, s2)``.
    """
    n_edits = Levenshtein.distance(s1, s2)
    return n_edits

In [16]:
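# NOTE: disabled on purpose or by accident -- with this line commented out,
# edit_distance() is defined but no "edit_distance" column is ever added to df.
# df["edit_distance"] = df.apply(lambda row: edit_distance(row["sentence1"], row["sentence2"]), axis=1)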

In [17]:
# Re-display the first five rows; no feature column has been added at this point.
df.head()

Unnamed: 0,id,sentence1,sentence2,label
0,1,怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
3,4,如何得知关闭借呗,想永久关闭借呗,0
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0


In [18]:
def jaro_distance(s1, s2):
    """Return ``Levenshtein.jaro(s1, s2)`` for the two strings.

    NOTE(review): despite the "distance" in this wrapper's name, the
    python-Levenshtein documentation describes ``jaro`` as a *similarity*
    (higher means more alike) -- confirm against the installed version before
    interpreting the feature.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        Whatever ``Levenshtein.jaro`` returns for ``(s1, s2)``.
    """
    jaro_score = Levenshtein.jaro(s1, s2)
    return jaro_score

In [19]:
def jaro_winkler_distance(s1, s2):
    """Return ``Levenshtein.jaro_winkler(s1, s2)`` for the two strings.

    NOTE(review): despite the "distance" in this wrapper's name, the
    python-Levenshtein documentation describes ``jaro_winkler`` as a
    *similarity* (higher means more alike) -- confirm against the installed
    version before interpreting the feature.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        Whatever ``Levenshtein.jaro_winkler`` returns for ``(s1, s2)``.
    """
    jw_score = Levenshtein.jaro_winkler(s1, s2)
    return jw_score

In [20]:
# Jaro score for every (sentence1, sentence2) pair, stored as a new column.
# The two columns are iterated in lockstep instead of using
# DataFrame.apply(axis=1), which materialises a Series object for every row
# before the function is even called.
df["jaro_distance"] = [
    jaro_distance(s1, s2)
    for s1, s2 in zip(df["sentence1"], df["sentence2"])
]

In [21]:
# Check the first five rows now that the jaro_distance column has been added.
df.head()

Unnamed: 0,id,sentence1,sentence2,label,jaro_distance
0,1,怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1,0.6
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0,0.333333
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0,0.55711
3,4,如何得知关闭借呗,想永久关闭借呗,0,0.690476
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0,0.338384


In [22]:
# Jaro-Winkler score for every (sentence1, sentence2) pair, stored as a new
# column. The two columns are iterated in lockstep instead of using
# DataFrame.apply(axis=1), which materialises a Series object for every row
# before the function is even called.
df["jaro_winkler_distance"] = [
    jaro_winkler_distance(s1, s2)
    for s1, s2 in zip(df["sentence1"], df["sentence2"])
]

In [24]:
# Check the first five rows now that the jaro_winkler_distance column has been added.
df.head()

Unnamed: 0,id,sentence1,sentence2,label,jaro_distance,jaro_winkler_distance
0,1,怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1,0.6,0.6
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0,0.333333,0.333333
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0,0.55711,0.55711
3,4,如何得知关闭借呗,想永久关闭借呗,0,0.690476,0.690476
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0,0.338384,0.338384


In [66]:
def nlevenshtein_distance(s1, s2, method=1):
    """Return ``distance.nlevenshtein(s1, s2, method=method)``.

    Thin wrapper around the ``distance`` package's normalized Levenshtein
    function. ``method`` is forwarded unchanged as a keyword argument; per
    that package's documentation it selects the normalization factor
    (1: shortest alignment, 2: longest alignment).

    Args:
        s1: First sequence (a string in this notebook).
        s2: Second sequence (a string in this notebook).
        method: Normalization variant passed through to the library.

    Returns:
        Whatever ``distance.nlevenshtein`` returns for these arguments.
    """
    normalized = distance.nlevenshtein(s1, s2, method=method)
    return normalized

In [67]:
# Normalized Levenshtein features, one column per normalization method.
# The sentence columns are iterated in lockstep instead of using
# DataFrame.apply(axis=1), which materialises a Series object for every row
# before the function is even called.
df["nlevenshtein_distance_1"] = [
    nlevenshtein_distance(s1, s2, method=1)
    for s1, s2 in zip(df["sentence1"], df["sentence2"])
]
df["nlevenshtein_distance_2"] = [
    nlevenshtein_distance(s1, s2, method=2)
    for s1, s2 in zip(df["sentence1"], df["sentence2"])
]

In [68]:
# Check the first five rows now that both nlevenshtein_distance_* columns exist.
df.head()

Unnamed: 0,id,sentence1,sentence2,label,jaro_distance,jaro_winkler_distance,nlevenshtein_distance_1,nlevenshtein_distance_2
0,1,怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1,0.6,0.6,0.8,0.774194
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0,0.333333,0.333333,0.933333,0.933333
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0,0.55711,0.55711,0.846154,0.846154
3,4,如何得知关闭借呗,想永久关闭借呗,0,0.690476,0.690476,0.5,0.5
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0,0.338384,0.338384,0.909091,0.909091


In [25]:
def jaccard_distance(s1, s2):
    """Return ``distance.jaccard(s1, s2)`` for the two sequences.

    Thin wrapper around the ``distance`` package. The arguments are passed
    through as-is; in this notebook they are raw sentence strings, so the
    library sees them as sequences of characters.

    Args:
        s1: First sequence (a string in this notebook).
        s2: Second sequence (a string in this notebook).

    Returns:
        Whatever ``distance.jaccard`` returns for ``(s1, s2)``.
    """
    jaccard_value = distance.jaccard(s1, s2)
    return jaccard_value

In [29]:
# Jaccard feature for every (sentence1, sentence2) pair, stored as a new
# column. The two columns are iterated in lockstep instead of using
# DataFrame.apply(axis=1), which materialises a Series object for every row
# before the function is even called.
df["jaccard_distance"] = [
    jaccard_distance(s1, s2)
    for s1, s2 in zip(df["sentence1"], df["sentence2"])
]

In [30]:
# Check the first five rows now that the jaccard_distance column has been added.
# NOTE(review): the saved output of this cell has no nlevenshtein_distance_*
# columns and its execution count is lower than the cells above it, so the
# notebook was not last run top to bottom -- re-run with Restart & Run All.
df.head()


Unnamed: 0,id,sentence1,sentence2,label,jaro_distance,jaro_winkler_distance,jaccard_distance
0,1,怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1,0.6,0.6,0.545455
1,2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0,0.333333,0.333333,0.789474
2,3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0,0.55711,0.55711,0.666667
3,4,如何得知关闭借呗,想永久关闭借呗,0,0.690476,0.690476,0.636364
4,5,花呗扫码付钱,二维码扫描可以用花呗吗,0,0.338384,0.338384,0.692308


In [74]:
# Persist the original columns plus every string-distance feature as a
# tab-separated file. The file is written without a header row, so readers
# identify features purely by position. The column order is therefore pinned
# explicitly (top-to-bottom order of the cells above) instead of relying on
# whatever order the cells happened to be executed in; a feature cell that was
# skipped now raises a KeyError here rather than silently producing a
# narrower file. Extend this list when a new feature column is added.
df.to_csv(
    "./data/train_with_string_distance.csv",
    sep='\t',
    index=False,
    header=False,
    encoding="utf-8",
    columns=[
        "id",
        "sentence1",
        "sentence2",
        "label",
        "jaro_distance",
        "jaro_winkler_distance",
        "nlevenshtein_distance_1",
        "nlevenshtein_distance_2",
        "jaccard_distance",
    ],
)