# Extracting new features to measure similarities
The model_1 notebook tests some simple measures of similarity. <br>
This notebook focuses on a broad range of similarity measures between sentences. <br>
My reference is here: https://github.com/qqgeogor/kaggle_quora_benchmark

## preprocessing

In model_1, we only substituted the special characters. Here we add stemming to the preprocessing, so that different forms of a word are reduced to their stem; for example, "Hits" and "Hitting" both become "hit".

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
import re
import difflib
from nltk.corpus import stopwords
import nltk
import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
import pickle
from scipy.spatial.distance import cosine

# Alternative kept for reference: the full NLTK English list,
# stops = set(stopwords.words("english"))
# The short hand-picked list below is what the cleaning functions filter on.
stops = [
    "a", "and", "of", "the", "to",
    "on", "in", "at", "is",
]

# Fixed seed for NumPy's global random generator.
seed = 1024
np.random.seed(seed)

In [2]:
# Module-level Porter stemmer instance.
# NOTE(review): nothing in the visible cells references `porter`;
# txt_clean_stemming builds its own default instance - confirm whether this is still needed.
porter = PorterStemmer()

In [14]:
def txt_clean_no_stemming(sentence, stop_words=None):
    """
    Clean a sentence without stemming: strip symbols, lower-case, drop stop words.

    Every character outside [a-zA-Z0-9] is replaced by a space, the text is
    lower-cased and split into words, stop words are removed and the remaining
    words are joined with single spaces.

    sentence: any object; it is converted with str() first, so a missing value
        such as NaN is processed as the literal text "nan".
    stop_words: optional container of lower-case words to drop. Defaults to the
        module-level ``stops`` list.
    Returns the cleaned sentence as a string ("" when no word survives).
    """
    if stop_words is None:
        stop_words = stops
    s = re.sub("[^a-zA-Z0-9]", " ", str(sentence))
    # split() without an argument collapses whitespace runs. split(" ") produced
    # an empty-string token for every replaced character, which left stray
    # spaces in the output and showed up as a fake "word" in the features that
    # later tokenise with split(" ").
    s_list = [w for w in s.lower().split() if w not in stop_words]
    return " ".join(s_list)

def txt_clean_stemming(sentence, st=PorterStemmer(), stop_words=None):
    """
    Clean a sentence and reduce every remaining word to its stem.

    Every character outside [a-zA-Z0-9] is replaced by a space, the text is
    lower-cased and split into words, stop words are removed, each remaining
    word is passed through ``st.stem`` and the stems are joined with single
    spaces.

    sentence: any object; it is converted with str() first.
    st: object exposing ``stem(word)``. The default instance is created once,
        when the function is defined, and reused by every call.
    stop_words: optional container of lower-case words to drop. Defaults to the
        module-level ``stops`` list.
    Returns the cleaned, stemmed sentence as a string.
    """
    if stop_words is None:
        stop_words = stops
    s = re.sub("[^a-zA-Z0-9]", " ", str(sentence))
    # split() without an argument collapses whitespace runs. split(" ") produced
    # empty-string tokens for every replaced character, which were then sent to
    # the stemmer and kept as stray spaces in the output.
    s_list = [st.stem(w) for w in s.lower().split() if w not in stop_words]
    return " ".join(s_list)

In [3]:
# Load the raw train and test sets (paths are relative to the working directory).
# Later cells read the question1/question2 columns of both frames and
# the is_duplicate column of train.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [15]:
# Stemmed copies come first: they must be derived while question1/question2
# still hold the original text, because those columns are overwritten below.
for frame in (train, test):
    for raw_col, stem_col in (("question1", "q1_stem"), ("question2", "q2_stem")):
        frame[stem_col] = frame.apply(
            lambda row: txt_clean_stemming(row[raw_col]), axis=1)

# Then replace the original question columns by their cleaned, unstemmed form.
for frame in (train, test):
    for raw_col in ("question1", "question2"):
        frame[raw_col] = frame.apply(
            lambda row: txt_clean_no_stemming(row[raw_col]), axis=1)

In [17]:
# Store the frames with the cleaned question columns and the q1_stem/q2_stem
# columns added above, so the cleaning step does not have to be repeated.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra leading column - confirm readers expect that.
train.to_csv("../data/train_stem.csv")
test.to_csv("../data/test_stem.csv")

In [7]:
# delete train and test
# del train
# del test

In [9]:
# train= pd.read_csv("../data/train_stem.csv")
# test = pd.read_csv("../data/test_stem.csv")

## Similarity

In [11]:
def get_common_ratio(text_a,text_b):
    """
    Word-sequence similarity of two texts, as reported by difflib.

    Both inputs are converted with str() and split on whitespace; the result
    is SequenceMatcher.ratio() over the two word lists, i.e. 2*M/T where M is
    the number of matched words and T the total number of words in both lists.
    """
    tokens_a, tokens_b = (str(text).split() for text in (text_a, text_b))
    return difflib.SequenceMatcher(None, tokens_a, tokens_b).ratio()

def get_interaction_ratio(text_a,text_b):
    """
    Share of distinct words the two texts have in common.

    Both inputs are converted with str() and split on whitespace into sets;
    the result is len(intersection) / len(union), or 0.0 when both sets are
    empty.
    """
    words_a = set(str(text_a).split())
    words_b = set(str(text_b).split())
    all_words = words_a | words_b
    if not all_words:
        # Nothing to compare: same value the division-by-zero fallback gave.
        return 0.0
    return float(len(words_a & words_b)) / len(all_words)

def get_common_bigrams_ratio(text_a,text_b):
    """
    Bigram-sequence similarity of two texts, as reported by difflib.

    Each input is converted with str(), split on whitespace and turned into a
    list of space-joined word pairs via nltk.ngrams(..., 2); the result is
    SequenceMatcher.ratio() over the two bigram lists.
    """
    def bigrams(text):
        return [" ".join(pair) for pair in nltk.ngrams(str(text).split(), 2)]

    matcher = difflib.SequenceMatcher(None, bigrams(text_a), bigrams(text_b))
    return matcher.ratio()

In [12]:
# Similarity features: each measure is computed row by row, once on the cleaned
# questions and once on their stemmed variants, for both train and test.
def _pair_feature(frame, func, col_a, col_b):
    """Apply func(row[col_a], row[col_b]) to every row of frame."""
    return frame.apply(lambda row: func(row[col_a], row[col_b]), axis=1)


train_common_ratio = _pair_feature(train, get_common_ratio, "question1", "question2")
train_common_ratio_stem = _pair_feature(train, get_common_ratio, "q1_stem", "q2_stem")
train_interaction_ratio = _pair_feature(train, get_interaction_ratio, "question1", "question2")
train_interaction_ratio_stem = _pair_feature(train, get_interaction_ratio, "q1_stem", "q2_stem")
train_common_bigrams_ratio = _pair_feature(train, get_common_bigrams_ratio, "question1", "question2")
train_common_bigrams_ratio_stem = _pair_feature(train, get_common_bigrams_ratio, "q1_stem", "q2_stem")

test_common_ratio = _pair_feature(test, get_common_ratio, "question1", "question2")
test_common_ratio_stem = _pair_feature(test, get_common_ratio, "q1_stem", "q2_stem")
test_interaction_ratio = _pair_feature(test, get_interaction_ratio, "question1", "question2")
test_interaction_ratio_stem = _pair_feature(test, get_interaction_ratio, "q1_stem", "q2_stem")
test_common_bigrams_ratio = _pair_feature(test, get_common_bigrams_ratio, "question1", "question2")
test_common_bigrams_ratio_stem = _pair_feature(test, get_common_bigrams_ratio, "q1_stem", "q2_stem")

In [20]:
# Names of the module-level train similarity features; each one is pickled to
# ../data/<name>.pickle.
train_sim_feat = [
    "train_common_ratio",
    "train_common_ratio_stem",
    "train_interaction_ratio",
    "train_interaction_ratio_stem",
    "train_common_bigrams_ratio",
    "train_common_bigrams_ratio_stem",
]

for feat_name in train_sim_feat:
    target = "../data/{}.pickle".format(feat_name)
    # The feature is looked up by name among the module globals.
    feature = globals()[feat_name]
    with open(target, "wb") as out:
        pickle.dump(feature, out, protocol=2)

In [21]:
# Names of the module-level test similarity features; each one is pickled to
# ../data/<name>.pickle.
test_sim_feat = [
    "test_common_ratio",
    "test_common_ratio_stem",
    "test_interaction_ratio",
    "test_interaction_ratio_stem",
    "test_common_bigrams_ratio",
    "test_common_bigrams_ratio_stem",
]

for feat_name in test_sim_feat:
    target = "../data/{}.pickle".format(feat_name)
    # The feature is looked up by name among the module globals.
    feature = globals()[feat_name]
    with open(target, "wb") as out:
        pickle.dump(feature, out, protocol=2)

## Distance

In [22]:
def str_jaccard(str1, str2):
    """
    Jaccard distance between the word lists of two strings.

    Both strings are split on single spaces and the two token lists are handed
    to distance.jaccard; its result is returned unchanged.
    """
    tokens_a, tokens_b = str1.split(" "), str2.split(" ")
    return distance.jaccard(tokens_a, tokens_b)

def str_nlevenshtein_1(str1, str2):
    """
    Word-level normalised Levenshtein distance, normalisation method 1.

    Both strings are split on single spaces and the two token lists are handed
    to distance.nlevenshtein with method=1; its result is returned unchanged.
    """
    tokens_a, tokens_b = str1.split(" "), str2.split(" ")
    return distance.nlevenshtein(tokens_a, tokens_b, method=1)

def str_nlevenshtein_2(str1, str2):
    """
    Word-level normalised Levenshtein distance, normalisation method 2.

    Both strings are split on single spaces and the two token lists are handed
    to distance.nlevenshtein with method=2; its result is returned unchanged.
    """
    tokens_a, tokens_b = str1.split(" "), str2.split(" ")
    return distance.nlevenshtein(tokens_a, tokens_b, method=2)

def str_sorensen(str1, str2):
    """
    Sorensen distance between the word lists of two strings.

    Both strings are split on single spaces and the two token lists are handed
    to distance.sorensen; its result is returned unchanged.
    """
    tokens_a, tokens_b = str1.split(" "), str2.split(" ")
    return distance.sorensen(tokens_a, tokens_b)

In [23]:
# Distance features: each measure is computed row by row, once on the cleaned
# questions and once on their stemmed variants, for both train and test.
def _pair_feature(frame, func, col_a, col_b):
    """Apply func(row[col_a], row[col_b]) to every row of frame."""
    return frame.apply(lambda row: func(row[col_a], row[col_b]), axis=1)


train_jaccard = _pair_feature(train, str_jaccard, "question1", "question2")
train_jaccard_stem = _pair_feature(train, str_jaccard, "q1_stem", "q2_stem")
test_jaccard = _pair_feature(test, str_jaccard, "question1", "question2")
test_jaccard_stem = _pair_feature(test, str_jaccard, "q1_stem", "q2_stem")

train_nlevenshtein_1 = _pair_feature(train, str_nlevenshtein_1, "question1", "question2")
train_nlevenshtein_1_stem = _pair_feature(train, str_nlevenshtein_1, "q1_stem", "q2_stem")
test_nlevenshtein_1 = _pair_feature(test, str_nlevenshtein_1, "question1", "question2")
test_nlevenshtein_1_stem = _pair_feature(test, str_nlevenshtein_1, "q1_stem", "q2_stem")

train_nlevenshtein_2 = _pair_feature(train, str_nlevenshtein_2, "question1", "question2")
train_nlevenshtein_2_stem = _pair_feature(train, str_nlevenshtein_2, "q1_stem", "q2_stem")
test_nlevenshtein_2 = _pair_feature(test, str_nlevenshtein_2, "question1", "question2")
test_nlevenshtein_2_stem = _pair_feature(test, str_nlevenshtein_2, "q1_stem", "q2_stem")

train_sorensen = _pair_feature(train, str_sorensen, "question1", "question2")
train_sorensen_stem = _pair_feature(train, str_sorensen, "q1_stem", "q2_stem")
test_sorensen = _pair_feature(test, str_sorensen, "question1", "question2")
test_sorensen_stem = _pair_feature(test, str_sorensen, "q1_stem", "q2_stem")

In [25]:
# Names of the module-level train distance features; each one is pickled to
# ../data/<name>.pickle.
train_dist_feat = [
    "train_jaccard",
    "train_jaccard_stem",
    "train_nlevenshtein_1",
    "train_nlevenshtein_1_stem",
    "train_nlevenshtein_2",
    "train_nlevenshtein_2_stem",
    "train_sorensen",
    "train_sorensen_stem",
]

for feat_name in train_dist_feat:
    target = "../data/{}.pickle".format(feat_name)
    # The feature is looked up by name among the module globals.
    feature = globals()[feat_name]
    with open(target, "wb") as out:
        pickle.dump(feature, out, protocol=2)

In [26]:
# Names of the module-level test distance features; each one is pickled to
# ../data/<name>.pickle.
test_dist_feat = [
    "test_jaccard",
    "test_jaccard_stem",
    "test_nlevenshtein_1",
    "test_nlevenshtein_1_stem",
    "test_nlevenshtein_2",
    "test_nlevenshtein_2_stem",
    "test_sorensen",
    "test_sorensen_stem",
]

for feat_name in test_dist_feat:
    target = "../data/{}.pickle".format(feat_name)
    # The feature is looked up by name among the module globals.
    feature = globals()[feat_name]
    with open(target, "wb") as out:
        pickle.dump(feature, out, protocol=2)

## Length

In [27]:
def word_len(string):
    """Count the whitespace-separated words of str(string)."""
    words = str(string).split()
    return len(words)

def char_len(string):
    """Count the characters of str(string), leaving out space characters."""
    return sum(1 for ch in str(string) if ch != " ")

def word_len_diff(text_a, text_b):
    """Absolute difference between the word counts (word_len) of two texts."""
    count_a = word_len(text_a)
    count_b = word_len(text_b)
    return abs(count_a - count_b)

def char_len_diff(text_a, text_b):
    """Absolute difference between the character counts (char_len) of two texts."""
    count_a = char_len(text_a)
    count_b = char_len(text_b)
    return abs(count_a - count_b)

def word_match_share(text_a,text_b, stop_words=None):
    """
    Share of distinct non-stop words that two texts have in common.

    Both inputs are converted with str(), lower-cased and split on whitespace;
    stop words are dropped and duplicates ignored. The result is
    2 * |shared| / (|words_a| + |words_b|).

    text_a, text_b: any objects; converted with str() first.
    stop_words: optional container of lower-case words to ignore. Defaults to
        the module-level ``stops`` list.
    Returns a float in [0, 1]; 0.0 when either text has no non-stop word.
    """
    if stop_words is None:
        stop_words = stops
    q1words = {w for w in str(text_a).lower().split() if w not in stop_words}
    q2words = {w for w in str(text_b).lower().split() if w not in stop_words}
    if not q1words or not q2words:
        return 0.0
    # A shared word is counted once per side, hence the factor 2. The float
    # literal forces true division; int / int would truncate the share to
    # 0 or 1 under Python 2 division rules.
    shared = len(q1words & q2words)
    return 2.0 * shared / (len(q1words) + len(q2words))

In [28]:
# Length features: every function named in len_feats is evaluated on the
# cleaned and on the stemmed question pair, stored as a new column of
# train/test and then pickled.
len_feats = ["word_len_diff","char_len_diff","word_match_share"]

for feat in len_feats:
    # Resolve the feature function by name once per feature.
    feat_func = globals()[feat]
    stem_col = feat + "_stem"
    for frame in (train, test):
        frame[feat] = frame.apply(
            lambda row: feat_func(row["question1"], row["question2"]), axis=1)
        frame[stem_col] = frame.apply(
            lambda row: feat_func(row["q1_stem"], row["q2_stem"]), axis=1)

for feat in len_feats:
    stem_col = feat + "_stem"
    outputs = (
        ("../data/train_{}.pickle", train[feat]),
        ("../data/train_{}_stem.pickle", train[stem_col]),
        ("../data/test_{}.pickle", test[feat]),
        ("../data/test_{}_stem.pickle", test[stem_col]),
    )
    for template, column in outputs:
        with open(template.format(feat), "wb") as out:
            pickle.dump(column, out, protocol=2)

In [None]:
# Show the first two rows of train to check the columns added so far.
train.head(2)

## tf-idf

In [18]:
# Text columns that feed the tf-idf features: the cleaned questions and their
# stemmed variants.
ft = ['question1','question2','q1_stem','q2_stem']
# Restrict both frames to those text columns.
train_input = train[ft]
test_input = test[ft]

# print('Generate tfidf')
# # len_train = train.shape[0]
# data_all = pd.concat([train_input["question1"],train_input["question2"],
#                      test_input["question1"],test_input["question2"]])

# vect_orig = TfidfVectorizer(max_features=None,ngram_range=(1,1), min_df=3)

# corpus = data_all.astype(str).values

# vect_orig.fit(corpus)

# tfidfs = vect_orig.transform(data_all)
# train_tfidf = tfidfs[:train_input.shape[0]*2]
# test_tfidf = tfidfs[2*train_input.shape[0]:]

# train_tfidf_q1 = train_tfidf[:train_input.shape[0]]
# train_tfidf_q2 = train_tfidf[train_input.shape[0]:]
# test_tfidf_q1 = test_tfidf[:train_input.shape[0]]
# test_tfidf_q2 = test_tfidf[train_input.shape[0]:]

# ft = ['question1','question2','q1_stem','q2_stem']
# train_input = train[ft]
# test_input = test[ft]


# print('Generate tfidf stemming')
# # len_train = train.shape[0]
# data_all = pd.concat([train_input['q1_stem'],train_input['q2_stem'],
#                      test_input['q1_stem'],test_input['q2_stem']])

# vect_orig = TfidfVectorizer(max_features=None,ngram_range=(1,1), min_df=3)

# corpus = data_all.astype(str).values

# vect_orig.fit(corpus)

# tfidfs = vect_orig.transform(data_all)
# train_tfidf = tfidfs[:train_input.shape[0]*2]
# test_tfidf = tfidfs[2*train_input.shape[0]:]

# train_tfidf_q1_stem = train_tfidf[:train_input.shape[0]]
# train_tfidf_q2_stem = train_tfidf[train_input.shape[0]:]
# test_tfidf_q1_stem = test_tfidf[:train_input.shape[0]]
# test_tfidf_q2_stem = test_tfidf[train_input.shape[0]:]

In [19]:
def generate_tiidf(q_train,q_test):
    """
    Fit one tf-idf vocabulary on all questions and vectorise each question column.

    The four question columns (train q1, train q2, test q1, test q2) are stacked
    in that order, converted to strings, used to fit a unigram TfidfVectorizer
    (min_df=3) and then transformed; the resulting matrix is cut back into the
    four original pieces.

    q_train: DataFrame whose first two columns are the train question pair.
    q_test: DataFrame whose first two columns are the test question pair.
    Returns a tuple (train_q1, train_q2, test_q1, test_q2) of csr_matrix, each
    with one row per row of the corresponding input frame.
    """
    # Row counts come from the arguments, not from the module-level
    # train_input/test_input frames, so the function works for any input.
    n_train = q_train.shape[0]
    n_test = q_test.shape[0]

    # .iloc is the positional indexer; .ix is deprecated and removed in
    # current pandas.
    data_all = pd.concat([q_train.iloc[:, 0], q_train.iloc[:, 1],
                          q_test.iloc[:, 0], q_test.iloc[:, 1]])
    corpus = data_all.astype(str).values

    vect_orig = TfidfVectorizer(max_features=None,ngram_range=(1,1), min_df=3)
    vect_orig.fit(corpus)
    # Transform the same string-coerced corpus that was used for fitting, so a
    # non-string cell cannot break transform after fit succeeded.
    tfidfs = vect_orig.transform(corpus)

    train_tfidf = tfidfs[:2 * n_train]
    test_tfidf = tfidfs[2 * n_train:]

    train_tfidf_q1 = train_tfidf[:n_train]
    train_tfidf_q2 = train_tfidf[n_train:]
    test_tfidf_q1 = test_tfidf[:n_test]
    test_tfidf_q2 = test_tfidf[n_test:]

    return csr_matrix(train_tfidf_q1), csr_matrix(train_tfidf_q2), \
            csr_matrix(test_tfidf_q1), csr_matrix(test_tfidf_q2)

In [20]:
# tf-idf matrices for the cleaned questions ...
raw_cols = ["question1","question2"]
(train_tfidf_q1, train_tfidf_q2,
 test_tfidf_q1, test_tfidf_q2) = generate_tiidf(train_input[raw_cols], test_input[raw_cols])

# ... and for their stemmed variants (a separate vocabulary is fitted).
stem_cols = ["q1_stem","q2_stem"]
(train_tfidf_q1_stem, train_tfidf_q2_stem,
 test_tfidf_q1_stem, test_tfidf_q2_stem) = generate_tiidf(train_input[stem_cols], test_input[stem_cols])

In [23]:
# Pickle the tf-idf matrices of the cleaned questions, one file per matrix.
for path, matrix in (
    ("../data/train_tfidf_q1.pickle", train_tfidf_q1),
    ("../data/train_tfidf_q2.pickle", train_tfidf_q2),
    ("../data/test_tfidf_q1.pickle", test_tfidf_q1),
    ("../data/test_tfidf_q2.pickle", test_tfidf_q2),
):
    with open(path, "wb") as out:
        pickle.dump(matrix, out, protocol=2)

In [24]:
# Pickle the tf-idf matrices of the stemmed questions, one file per matrix.
for path, matrix in (
    ("../data/train_tfidf_q1_stem.pickle", train_tfidf_q1_stem),
    ("../data/train_tfidf_q2_stem.pickle", train_tfidf_q2_stem),
    ("../data/test_tfidf_q1_stem.pickle", test_tfidf_q1_stem),
    ("../data/test_tfidf_q2_stem.pickle", test_tfidf_q2_stem),
):
    with open(path, "wb") as out:
        pickle.dump(matrix, out, protocol=2)

In [4]:
# Pickle the is_duplicate column of train on its own.
labels = train["is_duplicate"]
with open("../data/is_duplicate.pickle", "wb") as out:
    pickle.dump(labels, out, protocol=2)

### reload tfidf

cosine distance

In [3]:
# Restore the pickled train tf-idf matrices as module-level variables named
# after their files.
train_tfidf = ["train_tfidf_q1","train_tfidf_q2","train_tfidf_q1_stem","train_tfidf_q2_stem"]
for feat in train_tfidf:
    with open("../data/{}.pickle".format(feat), "rb") as handler:
        # Bind the name through globals() rather than exec on a formatted string.
        # NOTE: pickle.load can run arbitrary code; only load files this
        # notebook wrote itself.
        globals()[feat] = pickle.load(handler)


def _row_cosine(matrix_a, matrix_b, i):
    """Return the cosine distance between row i of two sparse matrices."""
    # toarray() on a row slice has shape (1, n_features); ravel() provides the
    # 1-D vectors that scipy's cosine() is documented to take.
    return cosine(matrix_a[i, :].toarray().ravel(),
                  matrix_b[i, :].toarray().ravel())


# Row-wise cosine distance between the question-1 and question-2 vectors.
# NOTE(review): a question with no in-vocabulary term is presumably an all-zero
# row, for which the cosine distance is undefined - confirm downstream handling.
train_cos_dist = [_row_cosine(train_tfidf_q1, train_tfidf_q2, i)
                  for i in range(train_tfidf_q1.shape[0])]

train_cos_dist_stem = [_row_cosine(train_tfidf_q1_stem, train_tfidf_q2_stem, i)
                       for i in range(train_tfidf_q1_stem.shape[0])]

with open("../data/train_cos_dist.pickle" ,"wb") as handler:
    pickle.dump(train_cos_dist, handler, protocol=2)

with open("../data/train_cos_dist_stem.pickle" ,"wb") as handler:
    pickle.dump(train_cos_dist_stem , handler, protocol=2)

In [12]:
# Restore the pickled test tf-idf matrices as module-level variables named
# after their files.
test_tfidf = ["test_tfidf_q1","test_tfidf_q2","test_tfidf_q1_stem","test_tfidf_q2_stem"]
for feat in test_tfidf:
    with open("../data/{}.pickle".format(feat), "rb") as handler:
        # Bind the name through globals() rather than exec on a formatted string.
        # NOTE: pickle.load can run arbitrary code; only load files this
        # notebook wrote itself.
        globals()[feat] = pickle.load(handler)


def _row_cosine(matrix_a, matrix_b, i):
    """Return the cosine distance between row i of two sparse matrices."""
    # toarray() on a row slice has shape (1, n_features); ravel() provides the
    # 1-D vectors that scipy's cosine() is documented to take.
    return cosine(matrix_a[i, :].toarray().ravel(),
                  matrix_b[i, :].toarray().ravel())


# Row-wise cosine distance between the question-1 and question-2 vectors.
# NOTE(review): a question with no in-vocabulary term is presumably an all-zero
# row, for which the cosine distance is undefined - confirm downstream handling.
test_cos_dist = [_row_cosine(test_tfidf_q1, test_tfidf_q2, i)
                 for i in range(test_tfidf_q1.shape[0])]

test_cos_dist_stem = [_row_cosine(test_tfidf_q1_stem, test_tfidf_q2_stem, i)
                      for i in range(test_tfidf_q1_stem.shape[0])]

with open("../data/test_cos_dist.pickle" ,"wb") as handler:
    pickle.dump(test_cos_dist, handler, protocol=2)

with open("../data/test_cos_dist_stem.pickle" ,"wb") as handler:
    pickle.dump(test_cos_dist_stem , handler, protocol=2)

sum diff and mean diff

In [4]:
def _tfidf_row_sums(matrix):
    """Return the sum of every row of a sparse matrix as a flat float array."""
    # Summing on the sparse matrix avoids building a dense copy of every row.
    return np.asarray(matrix.sum(axis=1)).ravel()


# Absolute difference between the tf-idf weight totals of question 1 and
# question 2, one value per row; kept as a list like the other features.
train_tfidf_sum_dist = list(np.abs(
    _tfidf_row_sums(train_tfidf_q1) - _tfidf_row_sums(train_tfidf_q2)))

train_tfidf_sum_dist_stem = list(np.abs(
    _tfidf_row_sums(train_tfidf_q1_stem) - _tfidf_row_sums(train_tfidf_q2_stem)))

In [None]:
# Pickle the train tf-idf sum differences (cleaned and stemmed variants).
for path, values in (
    ("../data/train_tfidf_sum_dist.pickle", train_tfidf_sum_dist),
    ("../data/train_tfidf_sum_dist_stem.pickle", train_tfidf_sum_dist_stem),
):
    with open(path, "wb") as out:
        pickle.dump(values, out, protocol=2)

In [13]:
def _tfidf_row_sums(matrix):
    """Return the sum of every row of a sparse matrix as a flat float array."""
    # Summing on the sparse matrix avoids building a dense copy of every row.
    return np.asarray(matrix.sum(axis=1)).ravel()


# Absolute difference between the tf-idf weight totals of question 1 and
# question 2, one value per row; kept as a list like the other features.
test_tfidf_sum_dist = list(np.abs(
    _tfidf_row_sums(test_tfidf_q1) - _tfidf_row_sums(test_tfidf_q2)))

test_tfidf_sum_dist_stem = list(np.abs(
    _tfidf_row_sums(test_tfidf_q1_stem) - _tfidf_row_sums(test_tfidf_q2_stem)))

In [14]:
# Pickle the test tf-idf sum differences (cleaned and stemmed variants).
for path, values in (
    ("../data/test_tfidf_sum_dist.pickle", test_tfidf_sum_dist),
    ("../data/test_tfidf_sum_dist_stem.pickle", test_tfidf_sum_dist_stem),
):
    with open(path, "wb") as out:
        pickle.dump(values, out, protocol=2)

sum and mean

In [8]:
def _tfidf_row_sums(matrix):
    """Return the sum of every row of a sparse matrix as a flat float array."""
    # Summing on the sparse matrix avoids building a dense copy of every row.
    return np.asarray(matrix.sum(axis=1)).ravel()


# Total tf-idf weight of every train question 1 (cleaned and stemmed),
# kept as lists like the other features.
train_tfidf_sum_q1 = list(_tfidf_row_sums(train_tfidf_q1))
train_tfidf_sum_q1_stem = list(_tfidf_row_sums(train_tfidf_q1_stem))

with open("../data/train_tfidf_sum_q1.pickle" ,"wb") as handler:
    pickle.dump(train_tfidf_sum_q1, handler, protocol=2)

with open("../data/train_tfidf_sum_q1_stem.pickle" ,"wb") as handler:
    pickle.dump(train_tfidf_sum_q1_stem , handler, protocol=2)

# Same totals for every train question 2.
train_tfidf_sum_q2 = list(_tfidf_row_sums(train_tfidf_q2))
train_tfidf_sum_q2_stem = list(_tfidf_row_sums(train_tfidf_q2_stem))

with open("../data/train_tfidf_sum_q2.pickle" ,"wb") as handler:
    pickle.dump(train_tfidf_sum_q2, handler, protocol=2)

with open("../data/train_tfidf_sum_q2_stem.pickle" ,"wb") as handler:
    pickle.dump(train_tfidf_sum_q2_stem , handler, protocol=2)

In [16]:
def _tfidf_row_sums(matrix):
    """Return the sum of every row of a sparse matrix as a flat float array."""
    # Summing on the sparse matrix avoids building a dense copy of every row.
    return np.asarray(matrix.sum(axis=1)).ravel()


# Total tf-idf weight of every test question 1 (cleaned and stemmed),
# kept as lists like the other features.
test_tfidf_sum_q1 = list(_tfidf_row_sums(test_tfidf_q1))
test_tfidf_sum_q1_stem = list(_tfidf_row_sums(test_tfidf_q1_stem))

with open("../data/test_tfidf_sum_q1.pickle" ,"wb") as handler:
    pickle.dump(test_tfidf_sum_q1, handler, protocol=2)

with open("../data/test_tfidf_sum_q1_stem.pickle" ,"wb") as handler:
    pickle.dump(test_tfidf_sum_q1_stem , handler, protocol=2)

# Same totals for every test question 2.
test_tfidf_sum_q2 = list(_tfidf_row_sums(test_tfidf_q2))
test_tfidf_sum_q2_stem = list(_tfidf_row_sums(test_tfidf_q2_stem))

with open("../data/test_tfidf_sum_q2.pickle" ,"wb") as handler:
    pickle.dump(test_tfidf_sum_q2, handler, protocol=2)

with open("../data/test_tfidf_sum_q2_stem.pickle" ,"wb") as handler:
    pickle.dump(test_tfidf_sum_q2_stem , handler, protocol=2)