In [1]:
import pandas as pd
import numpy as np
import gensim
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

# Load the Colab Drive helper and mount Google Drive at /content/drive.
# NOTE(review): later cells read 'drive/My Drive/...' as a *relative* path, so
# they only find this mount when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [17]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources the feature functions rely on: the stop-word list,
# the WordNet data used by the lemmatizer, and the punkt tokenizer models.
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Single shared lemmatizer instance, referenced as a global by `wmd`.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Word Mover's Distance with WordNet lemmatization and stop-word removal

In [0]:
# Build the English stop-word set once. The original evaluated
# stopwords.words('english') again for every single token and then scanned the
# result linearly, which dominates the runtime when applied to every row.
_WMD_STOPWORDS = frozenset(stopwords.words('english'))


def wmd(s1, s2):
    """Return the Word Mover's Distance between two sentences.

    Each input is coerced with ``str()``, lower-cased and split on whitespace;
    stop words are dropped and the remaining tokens are lemmatized with the
    module-level ``lemmatizer`` before being passed to ``model.wmdistance``.

    Parameters
    ----------
    s1, s2 : object
        The two sentences. Non-string values (e.g. NaN) are stringified first.

    Returns
    -------
    Whatever ``model.wmdistance`` returns for the two token lists.

    Notes
    -----
    Reads the global ``model`` at call time, so the result depends on the
    state of the embeddings when the function is invoked (the notebook calls
    it both before and after ``model.init_sims(replace=True)``).
    """
    tokens1 = [lemmatizer.lemmatize(w)
               for w in str(s1).lower().split()
               if w not in _WMD_STOPWORDS]
    tokens2 = [lemmatizer.lemmatize(w)
               for w in str(s2).lower().split()
               if w not in _WMD_STOPWORDS]
    return model.wmdistance(tokens1, tokens2)

### Sentence to Vector

In [0]:
# Build the English stop-word set once instead of re-evaluating
# stopwords.words('english') for every token inside the function.
_SENT2VEC_STOPWORDS = frozenset(stopwords.words('english'))


def sent2vec(s):
    """Embed a sentence as the L2-normalized sum of its word vectors.

    The input is stringified, lower-cased and tokenized with ``word_tokenize``.
    Stop words and tokens that are not purely alphabetic are discarded, and
    tokens missing from the global ``model`` vocabulary are skipped.

    Parameters
    ----------
    s : object
        The sentence. Non-string values (e.g. NaN) are stringified first.

    Returns
    -------
    numpy.ndarray
        The summed word vectors divided by their Euclidean norm. When no token
        has a vector (or the sum has zero norm) a NaN-filled vector of length
        ``model.vector_size`` is returned. The original produced NaN in this
        case through a 0/0 division; callers in this notebook clean it up with
        ``np.nan_to_num``.
    """
    tokens = [w for w in word_tokenize(str(s).lower())
              if w not in _SENT2VEC_STOPWORDS and w.isalpha()]

    vectors = []
    for w in tokens:
        try:
            vectors.append(model[w])
        except KeyError:
            # Out-of-vocabulary word: skip it. Catching only KeyError keeps
            # interrupts and genuine errors visible, unlike a bare `except:`.
            continue

    if not vectors:
        return np.full(model.vector_size, np.nan)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Same NaN result the division would give, without the RuntimeWarning.
        return np.full(v.shape, np.nan)
    return v / norm

In [6]:
# Read the tab-separated question-pair file and drop the identifier columns,
# leaving question1, question2 and is_duplicate.
data = (
    pd.read_csv('drive/My Drive/semantic-question-matching/quora_duplicate_questions.tsv', sep='\t')
    .drop(columns=['id', 'qid1', 'qid2'])
)
data.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [0]:
# Load the pretrained GoogleNews word2vec embeddings (binary format).
model = gensim.models.KeyedVectors.load_word2vec_format(
    'drive/My Drive/semantic-question-matching/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)
# Word Mover's Distance for every question pair, using the vectors as loaded.
data['wmd'] = [wmd(q1, q2) for q1, q2 in zip(data['question1'], data['question2'])]

In [8]:
data.head()

Unnamed: 0,question1,question2,is_duplicate,wmd
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.564615
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,3.772346
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1.780585
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,3.741994
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3.659165


In [0]:
# Normalize the embeddings, then recompute WMD on the normalized vectors.
# NOTE(review): per gensim's documentation, replace=True overwrites the stored
# vectors with their unit-length versions — confirm for the installed version.
# `wmd` reads the global `model`, so this cell is order-dependent: the 'wmd'
# column must be computed before this runs, and re-running the earlier 'wmd'
# cell afterwards would no longer reproduce its original values.
model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

In [11]:
data.head()

Unnamed: 0,question1,question2,is_duplicate,wmd,norm_wmd
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.564615,0.217555
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,3.772346,1.368796
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1.780585,0.639209
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,3.741994,1.263719
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3.659165,1.240908


In [0]:
# Checkpoint the frame (questions, label, wmd, norm_wmd) without the index.
# The path is relative, so the file lands in the kernel's working directory.
data.to_csv('feature_II.csv', index=False)

In [0]:
# One row per question pair and 300 columns (the width hard-coded here must
# match the embedding size returned by `sent2vec`).
question1_vectors = np.zeros(shape=(len(data), 300))
question2_vectors = np.zeros(shape=(len(data), 300))
error_count = 0

In [19]:
# Embed every question1 into its row of `question1_vectors`.
# tqdm wraps the array itself rather than the enumerate() generator, so it can
# read the length and display a total, percentage and ETA.
for i, q in enumerate(tqdm(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

  
404290it [13:26, 501.28it/s]


In [21]:
# Embed every question2 into its row of `question2_vectors`.
# tqdm wraps the array itself rather than the enumerate() generator, so it can
# read the length and display a total, percentage and ETA.
for i, q in enumerate(tqdm(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

  
404290it [13:42, 491.26it/s]


In [22]:
# Replace NaN rows (questions with no usable word vectors) once per matrix.
# The original called np.nan_to_num on both full matrices for each of the
# eleven feature columns, copying them every time.
q1_clean = np.nan_to_num(question1_vectors)
q2_clean = np.nan_to_num(question2_vectors)

# (column name, distance function) in the order the columns are appended.
# NOTE(review): every displayed row has jaccard_distance == 1.0; scipy's
# jaccard is meant for boolean vectors, so this column is likely uninformative
# on real-valued embeddings. Kept unchanged so the output schema is preserved.
pairwise_metrics = [
    ('cosine_distance', cosine),
    ('cityblock_distance', cityblock),
    ('jaccard_distance', jaccard),
    ('canberra_distance', canberra),
    ('euclidean_distance', euclidean),
    ('minkowski_distance', lambda x, y: minkowski(x, y, 3)),  # order p = 3
    ('braycurtis_distance', braycurtis),
]
for column, metric in pairwise_metrics:
    data[column] = [metric(x, y) for x, y in zip(q1_clean, q2_clean)]

# (column name, statistic, matrix): per-vector skewness and kurtosis.
row_statistics = [
    ('skew_q1vec', skew, q1_clean),
    ('skew_q2vec', skew, q2_clean),
    ('kur_q1vec', kurtosis, q1_clean),
    ('kur_q2vec', kurtosis, q2_clean),
]
for column, statistic, vectors in row_statistics:
    data[column] = [statistic(row) for row in vectors]

# Drop the sanitized copies so they do not stay resident in the kernel.
del q1_clean, q2_clean

  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()


In [23]:
data.head()

Unnamed: 0,question1,question2,is_duplicate,wmd,norm_wmd,cosine_distance,cityblock_distance,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.564615,0.217555,0.037908,3.774843,1.0,75.949318,0.275348,0.125323,0.137314,0.008893,-0.099771,0.108845,0.344742
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,3.772346,1.368796,0.574596,15.130415,1.0,190.766894,1.072004,0.482108,0.648993,0.027151,0.06019,0.310524,0.033802
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1.780585,0.639209,0.215223,8.840496,1.0,135.849174,0.656084,0.305829,0.332821,0.247069,0.15255,0.0429,-0.489378
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,3.741994,1.263719,0.635976,15.828719,1.0,190.804061,1.127809,0.501902,0.681132,0.013645,0.027851,-0.230252,-0.243935
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3.659165,1.240908,0.332839,11.447098,1.0,154.884094,0.815891,0.364076,0.447258,-0.136647,0.074702,0.010314,-0.360966


In [0]:
# Persist the full feature frame (WMD columns plus the vector-distance and
# moment columns) without the index, to the kernel's working directory.
# NOTE(review): the earlier checkpoint is named 'feature_II.csv' (no "s") —
# confirm downstream readers expect these exact, differently-styled names.
data.to_csv('features_III.csv', index=False)