sudo rm -f /etc/boto.cfg

In [1]:
import pandas as pd
import numpy as np
import nltk
import gensim
import Levenshtein as L

In [2]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz
from scipy.spatial.distance import cosine, jaccard, euclidean
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import skew, kurtosis
from tqdm import tqdm

  from numpy.core.umath_tests import inner1d


In [3]:
original_data = pd.read_csv("questions.csv")

In [4]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [5]:
stop_words = stopwords.words('english')

In [6]:
original_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
print(len(original_data))

404351


In [8]:
original_data.dropna(axis=0, how="any", inplace=True)

In [9]:
print(len(original_data))

404348


In [10]:
all_label = original_data.is_duplicate.tolist()
all_data = original_data[['question1', 'question2']].values

In [11]:
train_data, test_data, train_label, test_label = train_test_split(all_data, all_label, 
                                                                  test_size=0.15, stratify=all_label,
                                                                  random_state=42)

In [None]:
np.save("train_data.npy", train_data)
np.save("test_data.npy", test_data)
np.save("train_label.npy", train_label)
np.save("test_label.npy", test_label)

In [12]:
def sentence2vec(s):
    words = s.lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

In [13]:
def feature_extract(data):
    feats = []
    for i, (q1, q2) in enumerate(tqdm(data)):
        #try:
        q1_token = word_tokenize(q1)
        q2_token = word_tokenize(q2)
        q1_pos_tag = [k for t, k in pos_tag(q1_token)]
        q2_pos_tag = [k for t, k in pos_tag(q2_token)]
        #print(q1pos_tag)
        #print(q2pos_tag)
            
        q1_embedding = []
        for t in q1_token:
            try:
                q1_embedding.append(model[t])
            except:
                continue
        q2_embedding = []
        for t in q2_token:
            try:
                q2_embedding.append(model[t])
            except:
                continue
                    
        q1_embedding = np.mean(np.array(np.nan_to_num(q1_embedding)), axis=0)
        q2_embedding = np.mean(np.array(np.nan_to_num(q2_embedding)), axis=0)
            
        len_q1 = len(q1_token)
        len_q2 = len(q2_token)
        num_common_tags = len(set(q1_pos_tag).intersection(set(q2_pos_tag)))
        num_common_words = len(set(q1.lower().split(' ')).intersection(set(q2.lower().split(' '))))
            
        L_words_dist = L.distance(q1, q2)
        L_tag_dist = L.distance(" ".join(q1_pos_tag), " ".join(q2_pos_tag))
            
        embedding_cos_sim = cosine(np.nan_to_num(q1_embedding), np.nan_to_num(q2_embedding))
        embedding_euclidean = euclidean(np.nan_to_num(q1_embedding), np.nan_to_num(q2_embedding))
        embedding_jaccard = jaccard(np.nan_to_num(q1_embedding), np.nan_to_num(q2_embedding))
                
        q_ratio = fuzz.QRatio(q1, q2)
        wr_ratio = fuzz.WRatio(q1, q2)
        partial_ratio = fuzz.partial_ratio(q1, q2)
            
        q1_kur = kurtosis(np.nan_to_num(q1_embedding))
        q2_kur = kurtosis(np.nan_to_num(q2_embedding))
        q1_skew = skew(np.nan_to_num(q1_embedding))
        q2_skew = skew(np.nan_to_num(q2_embedding))
        
        feats.append([len_q1, len_q2, num_common_tags, num_common_words, 
                      embedding_cos_sim, embedding_euclidean, embedding_jaccard,
                      L_words_dist, L_tag_dist,
                      q_ratio, wr_ratio, partial_ratio,
                      q1_kur, q2_kur, q1_skew, q2_skew]) 
        #except:
        #    print(q1)
        #    print(q2)

    return np.array(feats)

In [14]:
train_feats = feature_extract(train_data)
test_feats = feature_extract(test_data)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 343695/343695 [21:09<00:00, 270.75it/s]
100%|██████████| 60653/60653 [03:43<00:00, 271.87it/s]


In [15]:
print(len(train_feats))
print(len(train_label))

343695
343695


In [16]:
print(len(test_feats))
print(len(test_label))

60653
60653


In [17]:
RandomForest = RandomForestClassifier(max_depth=5, n_estimators=100, max_features=7)

In [19]:
print(train_feats)

[[ 6.00000000e+00  4.00000000e+00  4.00000000e+00 ... -1.01289021e-01
   1.10726163e-01  1.92913078e-02]
 [ 1.10000000e+01  1.00000000e+01  1.00000000e+01 ...  3.37087552e-01
   1.56232789e-01  1.12447046e-01]
 [ 8.00000000e+00  8.00000000e+00  8.00000000e+00 ... -3.26335128e-01
   2.03394350e-02  4.99404781e-02]
 ...
 [ 9.00000000e+00  1.00000000e+01  6.00000000e+00 ... -6.10099738e-03
   8.74593947e-03  4.18575145e-02]
 [ 1.30000000e+01  1.10000000e+01  7.00000000e+00 ... -8.63220200e-02
   5.09598106e-02  6.46400265e-03]
 [ 1.10000000e+01  1.10000000e+01  7.00000000e+00 ... -2.71655870e-01
  -1.56343251e-01 -1.25389889e-01]]


In [20]:
RandomForest.fit(np.nan_to_num(train_feats), train_label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
RandomForest.score(np.nan_to_num(test_feats), test_label)