In [1]:
import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial
import gc
from annoy import AnnoyIndex
import seaborn as sns
from textblob import TextBlob
sns.set()

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# FASTTEXT

In [3]:
file = open("testfilev2.txt", "w") 

In [4]:
for i in range(10000):
    file.write(train.loc[i].question1) 
    file.write(" ; ")
    file.write(train.loc[i].question2)

In [5]:
model = fasttext.train_unsupervised('testfilev2.txt', model='skipgram')

In [6]:
arr = []
for j in range(8):
    q1 = model.get_sentence_vector(train.loc[j].question1)
    question1 = train.loc[j].question1
    q2 = model.get_sentence_vector(train.loc[j].question2)
    question2 = train.loc[j].question2
    result = 1 - spatial.distance.cosine(q1, q2)
    d = {'q1': question1, 'q2': question2, 'result': result}
    arr.append(d)

In [7]:
df = pd.DataFrame(arr)
df

Unnamed: 0,q1,q2,result
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.99419
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.936389
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.959009
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.850718
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.973377
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",0.989243
6,Should I buy tiago?,What keeps childern active and far from phone ...,0.869455
7,How can I be a good geologist?,What should I do to be a great geologist?,0.950964


# ANNOY

In [8]:
l = 1000
f = model.get_sentence_vector(train.loc[1].question1).shape[0]
t = AnnoyIndex(f, 'angular')

In [9]:
train_copy = train.loc[:1000].copy()

def split_into_tokens(message):
    message = unicode(message, 'utf8')  # convert bytes into proper unicode
    return TextBlob(message).words

train_copy['question1'] = train_copy['question1'].apply(split_into_tokens)
train_copy['question2'] = train_copy['question2'].apply(split_into_tokens)

In [10]:
for i in range(l):
    qid1 = train_copy.loc[i].qid1
    v = model.get_sentence_vector(str(train_copy.loc[i].question1))
    print(qid1, train_copy.loc[i].question1)
    t.add_item(qid1, v)

    qid2 = train_copy.loc[i].qid2
    v2 = model.get_sentence_vector(str(train_copy.loc[i].question2))
    print(qid2, train_copy.loc[i].question2)
    t.add_item(qid2, v2)

o', u'Apple', u'computers/laptops', u'have', u'over', u'other', u'brands']))
(1824, WordList([u'Why', u'do', u'so', u'many', u'people', u'in', u'the', u'US', u'have', u'an', u'Apple-branded', u'phone', u'or', u'laptop']))
(1825, WordList([u'What', u'is', u'the', u'first', u'moment', u'you', u'remember', u'in', u'your', u'Life']))
(1826, WordList([u'What', u'is', u'the', u'very', u'first', u'thing', u'you', u'remember', u'in', u'your', u'life']))
(1827, WordList([u'What', u'can', u'I', u'do', u'after', u'completing', u'BDS']))
(1828, WordList([u'What', u'can', u'we', u'do', u'after', u'completing', u'BDs']))
(1829, WordList([u'Does', u'all', u'Muslims', u'hate', u'Narendra', u'Modi']))
(1830, WordList([u'Why', u'Muslims', u'hate', u'Modi', u'government']))
(1831, WordList([u'I', u"'m", u'18', u'What', u'should', u'I', u'do']))
(1832, WordList([u'What', u'can', u'I', u'do', u'when', u'I', u"'m", u'18']))
(1833, WordList([u'What', u'are', u'some', u'really', u'good', u'and', u'famous', u'

In [11]:
t.build(1)

True

In [12]:
train_copy.loc[1].id

1

In [13]:
arr_dic = []
for k in range(l):
    a = t.get_nns_by_item(k+1, 3, include_distances=True)[0][1]
    b = t.get_nns_by_item(k+1, 3, include_distances=True)[1][1]
    a1 = t.get_nns_by_item(k+1, 3, include_distances=True)[0][2]
    b1 = t.get_nns_by_item(k+1, 3, include_distances=True)[1][2]
    dic = {"question": k+1, "a":a, "b":b, "a1":a1, "b1":b1}
    arr_dic.append(dic)

In [14]:
df_annoy = pd.DataFrame(arr_dic, columns=['question', 'a', 'a1', 'b', 'b1'])

In [15]:
def is_duplicate(x):
    if (x['question'] - x['a'] == 1) or (x['a'] - x['question'] == 1) :
        return 1
    else:
        return 0

In [16]:
train_copy

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,"[What, is, the, step, by, step, guide, to, inv...","[What, is, the, step, by, step, guide, to, inv...",0
1,1,3,4,"[What, is, the, story, of, Kohinoor, Koh-i-Noo...","[What, would, happen, if, the, Indian, governm...",0
2,2,5,6,"[How, can, I, increase, the, speed, of, my, in...","[How, can, Internet, speed, be, increased, by,...",0
3,3,7,8,"[Why, am, I, mentally, very, lonely, How, can,...","[Find, the, remainder, when, math, 23, 24, mat...",0
4,4,9,10,"[Which, one, dissolve, in, water, quikly, suga...","[Which, fish, would, survive, in, salt, water]",0
5,5,11,12,"[Astrology, I, am, a, Capricorn, Sun, Cap, moo...","[I, 'm, a, triple, Capricorn, Sun, Moon, and, ...",1
6,6,13,14,"[Should, I, buy, tiago]","[What, keeps, childern, active, and, far, from...",0
7,7,15,16,"[How, can, I, be, a, good, geologist]","[What, should, I, do, to, be, a, great, geolog...",1
8,8,17,18,"[When, do, you, use, シ, instead, of, し]","[When, do, you, use, instead, of, and]",0
9,9,19,20,"[Motorola, company, Can, I, hack, my, Charter,...","[How, do, I, hack, Motorola, DCX3400, for, fre...",0


In [17]:
df_annoy['is_duplicate'] = df_annoy.apply(is_duplicate, axis=1)
df_annoy

Unnamed: 0,question,a,a1,b,b1,is_duplicate
0,1,2,927,0.153015,0.388129,1
1,2,1,927,0.153015,0.339318,1
2,3,1346,1449,0.303120,0.315698,0
3,4,557,306,0.317899,0.345740,0
4,5,657,676,0.379404,0.386972,0
5,6,657,5,0.387631,0.402049,0
6,7,1307,24,0.367021,0.398071,0
7,8,1471,129,0.428876,0.508565,0
8,9,162,1562,0.315883,0.331218,0
9,10,1519,206,0.384944,0.429902,0
