In [1]:
import pandas as pd
import numpy as np
import nltk
import Levenshtein as L
import pickle
import re

In [3]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import skew, kurtosis
from tqdm import tqdm

In [4]:
# Load the preprocessed bundle and unpack its three parts: the data splits,
# the embedding table and the word -> index mapping.
# NOTE(review): pickle.load can execute arbitrary code; only use it on a
# 'data/data_emb' file that comes from a trusted source.
with open('data/data_emb', 'rb') as f:
    all_sets, embs, word2idx = pickle.load(f)

In [5]:
# Reverse lookup (index -> word), built by flipping every pair of word2idx.
idx2word = {index: word for word, index in word2idx.items()}

In [None]:
# Number of entries in the word -> index mapping.
print(len(word2idx))

In [6]:
# Name the first three entries of all_sets: training, validation and test.
train_sets, val_sets, test_sets = all_sets[0], all_sets[1], all_sets[2]

In [7]:
# Labels for the fitting portion (training followed by validation) and for the test split.
# `+` is expected to concatenate here - assumes the 'y' entries are Python lists; TODO confirm.
train_labels = train_sets['y'] + val_sets['y']
test_labels = test_sets['y']

In [8]:
def average_embedding(tokens, embeddings=None, dim=300):
    """Return the element-wise mean of the embedding vectors of `tokens`.

    Parameters
    ----------
    tokens : sequence
        Keys/indices used to look up one vector each in the embedding table.
    embeddings : indexable, optional
        Embedding table to look the tokens up in. When omitted, the
        module-level `embs` is used, which is what existing callers rely on.
    dim : int, optional
        Length of the placeholder vector returned for an empty token
        sequence. Defaults to 300, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        The mean vector, or a vector of `dim` copies of 1e-7 when `tokens`
        is empty (a tiny non-zero constant rather than exact zeros).
    """
    table = embs if embeddings is None else embeddings
    vectors = np.array([table[t] for t in tokens])
    if len(vectors) == 0:
        return np.repeat([1e-7], dim)
    return np.mean(vectors, axis=0)

In [16]:
def extract_feature(data_sets):
    """Turn every question pair of one split into a flat feature row.

    `data_sets` is indexed with the keys 'q1' and 'q2'; each holds one token
    sequence per question, and the tokens are keys of `idx2word`.

    Every row consists of 29 scalar features followed by the averaged
    embedding of question 1 and then that of question 2, in this order:
    string/token statistics (9), embedding distances (7), Levenshtein
    distances (2), fuzzywuzzy ratios (7), embedding kurtosis/skew (4).

    Returns a list with one such row (a list) per question pair.
    """
    rows = []
    pairs = zip(data_sets['q1'], data_sets['q2'])
    for q1_token, q2_token in tqdm(pairs, total=len(data_sets['q1'])):
        # Map the tokens back to words once; the words feed both the text and the tagger.
        q1_words = [idx2word[i] for i in q1_token]
        q2_words = [idx2word[i] for i in q2_token]

        # Rebuild the text, removing the space in front of any character outside a-z/0-9.
        q1 = re.sub(r' ([^a-z0-9])', r'\1', " ".join(q1_words))
        q2 = re.sub(r' ([^a-z0-9])', r'\1', " ".join(q2_words))

        q1_tags = [tag for _, tag in pos_tag(q1_words)]
        q2_tags = [tag for _, tag in pos_tag(q2_words)]

        q1_vec = average_embedding(q1_token)
        q2_vec = average_embedding(q2_token)

        # Lengths, distinct characters, token counts and overlaps.
        surface = [
            len(q1),
            len(q2),
            len(q1) - len(q2),
            len(set(q1)),
            len(set(q2)),
            len(q1_token),
            len(q2_token),
            len(set(q1_tags) & set(q2_tags)),
            len(set(q1_token) & set(q2_token)),
        ]

        # Distances between the two averaged embeddings.
        vector_distances = [
            cosine(q1_vec, q2_vec),
            euclidean(q1_vec, q2_vec),
            jaccard(q1_vec, q2_vec),
            cityblock(q1_vec, q2_vec),
            canberra(q1_vec, q2_vec),
            minkowski(q1_vec, q2_vec, 3),
            braycurtis(q1_vec, q2_vec),
        ]

        # Edit distance on the raw text and on the space-joined tag sequences.
        edit_distances = [
            L.distance(q1, q2),
            L.distance(" ".join(q1_tags), " ".join(q2_tags)),
        ]

        fuzzy_scores = [
            fuzz.QRatio(q1, q2),
            fuzz.WRatio(q1, q2),
            fuzz.partial_ratio(q1, q2),
            fuzz.token_set_ratio(q1, q2),
            fuzz.token_sort_ratio(q1, q2),
            fuzz.partial_token_set_ratio(q1, q2),
            fuzz.partial_token_sort_ratio(q1, q2),
        ]

        # Shape statistics of each averaged embedding.
        moments = [kurtosis(q1_vec), kurtosis(q2_vec), skew(q1_vec), skew(q2_vec)]

        rows.append(
            surface
            + vector_distances
            + edit_distances
            + fuzzy_scores
            + moments
            + q1_vec.tolist()
            + q2_vec.tolist()
        )
    return rows

In [17]:
# Features for the fitting portion: training rows followed by validation rows
# (extract_feature returns lists, so `+` concatenates them).
train_feats = extract_feature(train_sets) + extract_feature(val_sets)

100%|██████████| 323480/323480 [19:15<00:00, 279.86it/s]
100%|██████████| 40435/40435 [02:24<00:00, 279.56it/s]


In [18]:
# Features for the test split.
test_feats = extract_feature(test_sets)

100%|██████████| 40436/40436 [02:24<00:00, 279.38it/s]


In [22]:
# Width of one feature row.
print(len(train_feats[0]))

629


In [26]:
# Convert the row lists to NumPy arrays so that columns can be sliced.
train_feats = np.array(train_feats)
test_feats = np.array(test_feats)

In [31]:
# extract_feature puts 29 scalar features ahead of the two averaged embeddings,
# so dropping the first 29 columns keeps only the embedding columns.
partial_train_feats = train_feats[:, 29:]
partial_test_feats = test_feats[:, 29:]

In [32]:
# Column count of the embedding-only slice.
print(len(partial_test_feats[0]))

600


In [33]:
# Shallow forest for the embedding-only experiment.
# random_state is fixed (same seed as the train/test split further down) so that
# re-running the notebook reproduces the reported accuracy.
RandomForest = RandomForestClassifier(max_depth=5, n_estimators=100, max_features=15,
                                      verbose=1, random_state=42)

In [None]:
# Fit the forest on the embedding-only training columns.
RandomForest.fit(partial_train_feats, train_labels)

In [None]:
# Mean accuracy on the embedding-only test columns.
RandomForest.score(partial_test_feats, test_labels)

# Random forest
    
    experiment #1
    max_depth = 5, n_estimators=100, max_features=7
    without 2 x 300 word2vec features
    test accuracy: 0.7118409338213473
    
    experiment #2
    max_depth = 5, n_estimators=100, max_features=7
    with 2 x 300 word2vec features
    test accuracy: 0.651498664556336
    
    experiment #3
    max_depth = 5, n_estimators=100, max_features=15
    without 2 x 300 word2vec features
    test accuracy: 0.7117172816302305
    
    experiment #4
    max_depth = 5, n_estimators=100, max_features=15
    2 x 300 word2vec features ONLY
    test accuracy: (not recorded)
    

In [None]:
# Load the raw question-pair table from the working directory.
original_data = pd.read_csv("questions.csv")

In [None]:
# gensim is used from this cell on but was never imported in this notebook,
# so a fresh kernel failed here with a NameError.
import gensim

W2V_PATH = 'GoogleNews-vectors-negative300.bin.gz'

# Raw vectors, used by sentence2vec and wmd.
model = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)
# Second, independent copy whose vectors are normalised in place; used by norm_wmd.
norm_model = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)
# NOTE(review): init_sims is deprecated in gensim 4.x - confirm against the installed version.
norm_model.init_sims(replace=True)

In [None]:
# English stop-word list from NLTK; wmd and norm_wmd filter tokens against it.
stop_words = stopwords.words('english')

In [None]:
# Peek at the first rows of the raw table.
original_data.head()

In [None]:
# Row count before incomplete rows are dropped.
print(len(original_data))

In [None]:
# Discard every row that has a missing value in any column.
original_data = original_data.dropna(axis=0, how="any")

In [None]:
# Row count after incomplete rows were dropped.
print(len(original_data))

In [None]:
# Targets as a plain list; the two question columns as a 2-column array of pairs.
all_label = original_data['is_duplicate'].tolist()
question_columns = ['question1', 'question2']
all_data = original_data[question_columns].values

In [None]:
# Hold out 10% of the pairs for testing; the split is stratified on the label
# and seeded so it is the same on every run.
train_data, test_data, train_label, test_label = train_test_split(
    all_data,
    all_label,
    test_size=0.1,
    stratify=all_label,
    random_state=42,
)

In [None]:
#train_data, train_label = all_data, all_label

In [None]:
# Write the split data and labels to .npy files in the working directory.
np.save("train_data.npy", train_data)
np.save("test_data.npy", test_data)
np.save("train_label.npy", train_label)
np.save("test_label.npy", test_label)

In [None]:
def sentence2vec(s):
    """Embed a sentence as the unit-length sum of its word vectors.

    The text is lower-cased and tokenised with `word_tokenize`; only purely
    alphabetic tokens are kept. Each kept token is looked up in the
    module-level `model`, and tokens missing from the vocabulary are skipped.

    Parameters
    ----------
    s : str
        The sentence to embed.

    Returns
    -------
    numpy.ndarray
        The summed word vectors divided by their Euclidean norm. When no
        token has a vector, or the sum has zero norm, a placeholder of 300
        copies of 1e-7 is returned instead of dividing by zero.
    """
    tokens = [w for w in word_tokenize(s.lower()) if w.isalpha()]

    vectors = []
    for w in tokens:
        try:
            vectors.append(model[w])
        except KeyError:
            # Out-of-vocabulary word: it contributes nothing to the sum.
            # Only KeyError is caught so that any other failure still surfaces.
            continue

    if not vectors:
        return np.repeat(1e-7, 300)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # The word vectors cancelled out exactly; avoid returning NaNs.
        return np.repeat(1e-7, 300)
    return v / norm

In [None]:
def wmd(s1, s2):
    """Return `model.wmdistance` between two texts.

    Both inputs are converted with `str`, lower-cased, split on whitespace
    and stripped of every word found in `stop_words` before the call.
    """
    def content_words(text):
        # Lower-case, whitespace-split, then drop stop words.
        return [w for w in str(text).lower().split() if w not in stop_words]

    return model.wmdistance(content_words(s1), content_words(s2))

In [None]:
def norm_wmd(s1, s2):
    """Return `norm_model.wmdistance` between two texts.

    Preprocessing is the same for both inputs: `str`, lower-case,
    whitespace split, then removal of every word found in `stop_words`.
    """
    cleaned = []
    for text in (s1, s2):
        words = str(text).lower().split()
        cleaned.append([w for w in words if w not in stop_words])
    first, second = cleaned
    return norm_model.wmdistance(first, second)

In [None]:
def feature_extract(data, q1_embeddings=None, q2_embeddings=None):
    """Build the feature matrix for an iterable of (question1, question2) pairs.

    Parameters
    ----------
    data : iterable of (str, str)
        The question pairs, one row per pair.
    q1_embeddings, q2_embeddings : sequence of numpy.ndarray, optional
        Precomputed sentence vectors aligned row-by-row with `data`. When
        omitted, each vector is computed from the row's own text with
        `sentence2vec`.

    Returns
    -------
    numpy.ndarray
        One row per pair: 29 scalar features followed by the question-1 and
        then the question-2 sentence vector.

    Notes
    -----
    Embeddings are never read from module-level state by row index: a global
    list built from one split silently pairs the rows of any other split
    (e.g. the test data) with the wrong vectors.

    The word-mover distances are not computed here because they are not part
    of the feature row.
    """
    feats = []
    for i, (q1, q2) in enumerate(tqdm(data)):
        q1_token = word_tokenize(q1)
        q2_token = word_tokenize(q2)
        q1_pos_tag = [k for t, k in pos_tag(q1_token)]
        q2_pos_tag = [k for t, k in pos_tag(q2_token)]

        # Use the caller's vectors when given, otherwise embed this row's text.
        q1_embedding = sentence2vec(q1) if q1_embeddings is None else q1_embeddings[i]
        q2_embedding = sentence2vec(q2) if q2_embeddings is None else q2_embeddings[i]

        len_q1 = len(q1)
        len_q2 = len(q2)
        num_char1 = len(set(q1))
        num_char2 = len(set(q2))
        len_q1_token = len(q1_token)
        len_q2_token = len(q2_token)
        diff_len = len_q1 - len_q2
        num_common_tags = len(set(q1_pos_tag).intersection(set(q2_pos_tag)))
        num_common_words = len(set(q1.lower().split(' ')).intersection(set(q2.lower().split(' '))))

        # Edit distance on the raw text and on the space-joined tag sequences.
        L_words_dist = L.distance(q1, q2)
        L_tag_dist = L.distance(" ".join(q1_pos_tag), " ".join(q2_pos_tag))

        # Distances between the two sentence vectors.
        embedding_cos_sim = cosine(q1_embedding, q2_embedding)
        embedding_euclidean = euclidean(q1_embedding, q2_embedding)
        embedding_jaccard = jaccard(q1_embedding, q2_embedding)
        embedding_city = cityblock(q1_embedding, q2_embedding)
        embedding_canberra = canberra(q1_embedding, q2_embedding)
        embedding_mink = minkowski(q1_embedding, q2_embedding, 3)
        embedding_braycurtis = braycurtis(q1_embedding, q2_embedding)

        q_ratio = fuzz.QRatio(q1, q2)
        wr_ratio = fuzz.WRatio(q1, q2)
        partial_ratio = fuzz.partial_ratio(q1, q2)
        token_set_ratio = fuzz.token_set_ratio(q1, q2)
        token_sort_ratio = fuzz.token_sort_ratio(q1, q2)
        partial_token_set_ratio = fuzz.partial_token_set_ratio(q1, q2)
        partial_token_sort_ratio = fuzz.partial_token_sort_ratio(q1, q2)

        q1_kur = kurtosis(q1_embedding)
        q2_kur = kurtosis(q2_embedding)
        q1_skew = skew(q1_embedding)
        q2_skew = skew(q2_embedding)

        # Column order is relied on by later column slices; keep it stable.
        feats.append([len_q1, len_q2, num_char1, num_char2, len_q1_token, len_q2_token,
                      num_common_tags, num_common_words, diff_len,
                      embedding_cos_sim, embedding_euclidean, embedding_jaccard, embedding_city,
                      embedding_canberra, embedding_mink, embedding_braycurtis,
                      L_words_dist, L_tag_dist,
                      q_ratio, wr_ratio, partial_ratio, token_set_ratio, token_sort_ratio,
                      partial_token_set_ratio, partial_token_sort_ratio,
                      q1_kur, q2_kur, q1_skew, q2_skew] + q1_embedding.tolist() + q2_embedding.tolist())

    return np.array(feats)

In [None]:
# Sentence vectors for the first and the second question of every training pair.
q1_embeddings = [sentence2vec(pair[0]) for pair in train_data]
q2_embeddings = [sentence2vec(pair[1]) for pair in train_data]

In [None]:
# Write the sentence vectors computed from train_data to .npy files.
np.save("q1_w2v.npy", q1_embeddings)
np.save("q2_w2v.npy", q2_embeddings)

In [None]:
# Build the feature matrices for the training and the test pairs.
train_feats = feature_extract(train_data)
test_feats = feature_extract(test_data)

In [None]:
# True if at least one training feature value is NaN.
np.any(np.isnan(train_feats))

In [None]:
# True only when every training feature value is finite (no NaN, no +/-inf).
# `np.any` would already be True if a single value were finite, which says
# nothing about whether the matrix is safe to fit on.
np.all(np.isfinite(train_feats))

In [None]:
# Overwrite every value above 1000 with 1e-7, in place, in both matrices.
# NOTE(review): presumably aimed at +inf entries - the comparison is False for
# NaN and -inf, and it also rewrites any legitimately large finite feature;
# confirm this is intended.
train_feats[train_feats > 1000] = 1e-7
test_feats[test_feats > 1000] = 1e-7

In [None]:
# Forest on all feature columns. random_state is fixed (same seed as the
# train/test split) so that re-running the cell reproduces the score.
RandomForest = RandomForestClassifier(max_depth=5, n_estimators=100, max_features=7,
                                      random_state=42)
RandomForest.fit(train_feats, train_label)
RandomForest.score(test_feats, test_label)

In [None]:
# Mean accuracy of the currently fitted forest on the full-width test features.
RandomForest.score(test_feats, test_label)

In [None]:
# Number of feature columns per row.
print(len(train_feats[0]))

In [None]:
# Ad-hoc lookup: print the index and both questions of every training pair
# in which either question contains the phrase below.
needle = "What would happen if the Indian government"
for i, (q1, q2) in enumerate(tqdm(train_data)):
    if needle in q1 or needle in q2:
        print(i, q1, q2, sep="\n")

In [None]:
# Min-max scale the first 25 feature columns.
# The scaler learns its per-column minimum and maximum from the training rows
# only and is then applied unchanged to the test rows. Re-fitting on the test
# rows would put the two splits on different scales and use test statistics
# during preprocessing.
min_max_scaler = MinMaxScaler()
partial_train_feats = min_max_scaler.fit_transform(train_feats[:, :25])
partial_test_feats = min_max_scaler.transform(test_feats[:, :25])

In [None]:
# Refit the existing forest on the scaled 25-column training subset.
RandomForest.fit(partial_train_feats, train_label)

In [None]:
# Mean accuracy on the scaled 25-column test subset.
RandomForest.score(partial_test_feats, test_label)

In [None]:
# Same forest on all feature columns, but with 150 candidate features per split.
# random_state is fixed (same seed as the train/test split) so that the score
# is comparable across runs.
RandomForest = RandomForestClassifier(max_depth=5, n_estimators=100, max_features=150,
                                      random_state=42)
RandomForest.fit(train_feats, train_label)
RandomForest.score(test_feats, test_label)