# Kaggle Quora Challenge

## Data loading

In [1]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import numpy as np
import nltk

In [2]:
# Load the training pairs, pinning each column's dtype up front instead of
# letting pandas infer it.
train = pd.read_csv(
    "data/train.csv",
    dtype={
        "id": np.int32,
        "qid1": np.int32,
        "qid2": np.int32,
        "question1": str,
        "question2": str,
        "is_duplicate": np.int32,
    },
)

In [3]:
# Summary statistics for every column; include="all" also reports the text
# columns (count / unique / top / freq), not only the numeric ones.
train.describe(include="all")

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
count,404290.0,404290.0,404290.0,404290,404288,404290.0
unique,,,,290457,299174,
top,,,,How do I improve my English speaking?,How can you look at someone's private Instagra...,
freq,,,,50,120,
mean,202144.5,217243.942418,220955.655337,,,0.369198
std,116708.614502,157751.700002,159903.182629,,,0.482588
min,0.0,1.0,2.0,,,0.0
25%,101072.25,74437.5,74727.0,,,0.0
50%,202144.5,192182.0,197052.0,,,0.0
75%,303216.75,346573.5,354692.5,,,1.0


In [9]:
# Build the corpus of distinct questions from both sides of every pair.
#   questions: question texts, in order of first appearance
#   ids:       question id -> position of that question's text in `questions`
#   count:     next free position (always equal to len(questions))
questions = []
ids = {}
count = 0
# total= lets tqdm show a real progress bar; iterrows() is a generator with no len().
for _, row in tqdm(train.iterrows(), total=len(train)):
    for qid_col, text_col in (("qid1", "question1"), ("qid2", "question2")):
        qid, text = row[qid_col], row[text_col]
        # Questions with missing text are skipped, so their ids never enter `ids`.
        # pd.notna() tests the value itself; the previous `is not np.nan` identity
        # check only works while the missing marker happens to be the very same
        # object as np.nan.
        if qid not in ids and pd.notna(text):
            questions.append(text)
            ids[qid] = count
            count += 1
assert len(questions) == len(ids.keys())
len(questions)




537932

## Tokenizer & stemmer

In [10]:
from nltk.stem.porter import PorterStemmer
def tokenize(text):
    """Split ``text`` into word tokens and return their Porter stems.

    Args:
        text: The string to tokenize (passed to ``nltk.word_tokenize``).

    Returns:
        A list with one stemmed string per token, in token order.
    """
    # One stemmer per call instead of one per token: constructing it inside
    # the loop repeated the same setup work for every single word.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

In [11]:
# Sanity-check the tokenizer on the first 30 questions: original => stemmed tokens.
for question in questions[:30]:
    stemmed = " ".join(tokenize(question))
    print("%s => %s" % (question, stemmed))

What is the step by step guide to invest in share market in india? => What is the step by step guid to invest in share market in india ?
What is the step by step guide to invest in share market? => What is the step by step guid to invest in share market ?
What is the story of Kohinoor (Koh-i-Noor) Diamond? => What is the stori of Kohinoor ( Koh-i-Noor ) Diamond ?
What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back? => What would happen if the Indian govern stole the Kohinoor ( Koh-i-Noor ) diamond back ?
How can I increase the speed of my internet connection while using a VPN? => How can I increas the speed of my internet connect while use a VPN ?
How can Internet speed be increased by hacking through DNS? => How can Internet speed be increas by hack through DN ?
Why am I mentally very lonely? How can I solve it? => Whi am I mental veri lone ? How can I solv it ?
Find the remainder when [math]23^{24}[/math] is divided by 24,23? => Find the remaind wh

## Evaluation

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
def calculate_similarity(vectors, verbose=True):
    """Score each training pair by the cosine similarity of its two questions.

    Reads the module-level ``train`` frame and the module-level ``ids``
    mapping (question id -> row position in ``vectors``). Pairs for which
    either question id is absent from ``ids`` are skipped.

    Args:
        vectors: Row-indexable matrix with one row per entry of ``ids``.
            ``vectors[k]`` is passed straight to ``cosine_similarity``, so it
            must be a 2-D single-row slice.
            NOTE(review): presumably a scipy sparse matrix from a fitted
            vectorizer -- confirm for other callers.
        verbose: When true, print similarity, label and both questions for
            every row whose index is a multiple of 5000.

    Returns:
        ``(sim, truth)``: two parallel lists. ``sim`` holds the (1, 1) array
        returned by ``cosine_similarity`` for each scored pair; ``truth``
        holds the pair's ``is_duplicate`` label.
    """
    sim = []
    truth = []
    # total= gives tqdm a real progress bar; iterrows() has no len().
    for i, row in tqdm(train.iterrows(), total=len(train)):
        if row["qid1"] in ids and row["qid2"] in ids:
            pred = cosine_similarity(vectors[ids[row["qid1"]]], vectors[ids[row["qid2"]]])
            label = row["is_duplicate"]
            if verbose and i % 5000 == 0:
                # Format the single element explicitly: "%.3f" on the (1, 1)
                # array itself relies on implicit array-to-scalar conversion,
                # which NumPy has deprecated.
                print("sim:%.3f label:%s 1.%s 2.%s" % (pred[0, 0], label, row["question1"], row["question2"]))
            sim.append(pred)
            truth.append(label)
    return sim, truth

## TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [12]:
# TF-IDF over the distinct questions, using the stemming tokenizer defined
# earlier in place of the vectorizer's built-in tokenization.
tfidf_transformer = TfidfVectorizer(
    tokenizer=tokenize,
    decode_error="ignore",
)
# Learn the vocabulary and vectorize the whole corpus in a single pass.
questions_tfidf = tfidf_transformer.fit_transform(questions)

In [15]:
# Dimensions of the TF-IDF matrix; the row count should equal len(questions),
# since the vectorizer was fitted on that list.
questions_tfidf.shape

(537932, 94321)

In [19]:
# Bind the results to names: as a bare last expression the call made the
# notebook echo the entire (sim, truth) tuple -- one entry per scored pair --
# and then left the scores unavailable to later cells.
tfidf_sim, tfidf_truth = calculate_similarity(questions_tfidf)
len(tfidf_sim)

sim:0.974 label:0 1.What is the step by step guide to invest in share market in india? 2.What is the step by step guide to invest in share market?
sim:0.868 label:1 1.How do you recharge a AA battery? 2.How can you recharge regular AA batteries?
sim:0.784 label:0 1.Is it safe for a woman to travel alone in Vietnam? 2.Is it safe for a woman to travel alone in Japan?
sim:0.924 label:1 1.What are the best sites to download movies? 2.Which are best sites to download movies?
sim:0.050 label:0 1.What are the features of the Indian caste system? 2.What triggers you the most when you play video games?
sim:0.981 label:0 1.Does the end justify the means, or does the means justify the end? 2.Does the end justify the means?
sim:0.033 label:0 1.I am in my late 20's and I look older than I am. Need suggestions on best skincare products I can buy to begin with? 2.Is domino's pizza halal?
sim:0.811 label:1 1.How can you determine the boiling point of a substance? 2.How do you calculate the boiling poi

KeyboardInterrupt: 