# Importing and inspecting the data

We will start by importing the dataset

In [413]:
import pandas as pd
import numpy as np
# Set environment
pd.set_option('max_colwidth', 100)
pd.set_option('precision', 3)
# Read data
df_o=pd.read_csv('./data/train.csv',dtype={'question1':str,'question2':str})


#Inspect data
df_o.head()
df_o['is_duplicate'].value_counts(normalize=True)*100

0    63.08
1    36.92
Name: is_duplicate, dtype: float64

In [97]:
df_o.dtypes

id               int64
qid1             int64
qid2             int64
question1       object
question2       object
is_duplicate     int64
dtype: object

In [85]:
df_o.drop_duplicates(inplace=True)
df_o.dropna(inplace=True)

In [86]:
df_o.shape

(404288, 6)

## Split data

In [62]:
# Split dataset in training and validation dataset, 70/30
# Test dataset provided will not be used until the end, so it can be used for final validation
from sklearn.cross_validation import train_test_split
TEST_SIZE = 0.3
X_train, X_val, y_train, y_val = train_test_split(df_o.ix[:,0:4], df_o.ix[:,'is_duplicate'], test_size=TEST_SIZE, random_state=42)
X_train.shape
y_train.shape

# Generate features

Some ideas to work on:
1. Question lenght in words
2. Shared words
3. Use tf-idf to identify specific words on each pairs
4. Is question clause the same?
5. Syntax similarity: POS tags?
6. Shared synonims (all words or just the key words- verb, object)
7. Similarity of key words: wordnet/synnet similarity score

In [64]:
# Clean up functions
# Remove stopwords
from nltk.corpus import stopwords
from nltk import tokenize

from nltk.corpus import stopwords
import string

def clean_text(s):
    # call with a string and return a list of words excluding stop words and punctuation and lower case
    
    tok=tokenize.word_tokenize(s)
    s1=[w.lower() for w in tok if w not in stopwords.words('english')]
    
    punct=set(string.punctuation)
    s2=[w for w in s1 if w not in punct]
    return s2
    


#### Feature: different length in words

In [21]:
def dif_len(l1,l2):
    return(abs(len(l1.split())-len(l2.split())))


#### Feature: Number of common words

#### Feature: Use tf-idf with cosine similarity
The approach is to identify "vectorize" the strings using tf idf and then find the similarity between the two.
This feature will produce a number between the 2 sentences that will represent how close they are between each other


In [459]:


# Function to determine the tf idf of a list of strings

from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf(l1, l2):
    # 
    result=[]
    for X in zip(l1,l2):
        tfidf_matrix = []
        tfidf_vect=TfidfVectorizer()
        tfidf_sparse=tfidf_vect.fit_transform(X)
        tfidf_matrix=tfidf_sparse.todense()
        #print (tfidf_matrix,"*****")
        cosine_distance=cos_sim(tfidf_matrix)
        result.append(cosine_distance)
    return result

# function to identify the cosine similarity between 2 vectors
import sklearn.metrics.pairwise as metrics
def cos_sim(m):
    cos=metrics.cosine_similarity(m[0],m[1])[0,0]
    return cos

In [339]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer(stop_words='english')
tfidf_list=tfidf_vect.fit_transform(X_train['question1']).toarray()

In [338]:
q1_list=X_train['question1'].tolist()
q2_list=X_train['question2'].tolist()

In [482]:
tfidf_cos=tfidf(q1_list[:],q2_list[:])

In [483]:
type(tfidf_cos)

list

In [490]:
X_train.ix[:,'tfidf_cos']=pd.Series(tfidf_cos)

In [494]:
X_train

Unnamed: 0_level_0,qid1,qid2,question1,question2,tfidf_cos
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20128,37998,37999,"How is the working environment at SBI Life, Mumbai?",How stressful is work of SBI clerk?,0.143
185202,282774,282775,What kinds of questions should I expect on the IBM CAT/IPAT?,What kinds of questions should I expect on the IBM IPAT?,0.345
107096,176275,176276,"On WhatsApp, it says on the message info that the message has been read with 2 blue ticks and no...",My friend is abroad and I have sent him messages. There is one grey tick next to the messages I ...,0.777
27940,51849,51850,How do the holy scriptures of Hinduism compare and contrast to those of Taoism?,How do the holy scriptures of Hinduism compare and contrast to those of Italo-Roman paganism?,0.078
244713,357399,357400,Does beard transplantation really work?,Is beard transplantation worthy?,0.404
111382,34756,37759,My questions haven't changed. Why are they now being marked as needing improvement?,What should I do if my question is being marked instantly as needing improvement but I don't kno...,0.206
403957,86467,39685,Which phone should I buy under 15k?,Which phone is best to buy under 15k?,
101177,167668,167669,Should I choose MFC Delhi or make another attempt at CAT?,How many attempts did you make and what do you think was your accuracy (approx) in the CAT 14 to...,0.162
78010,133094,133095,Do we really need love?,Do people really need love?,0.372
57348,100786,100787,Frequently asked interview questions on strength of materials?,Where should we place the most important or striking words or ideas within a sentence?,0.202


#### Feature: Shared synonyms

#### Feature: Similarity of key words: noun and verbs

## Add feature columns to train dataset

### tf idf

In [132]:
X_train.ix[:,'tfdid']=

'What is the story of Kohinoor (Koh-i-Noor) Diamond?'

# Applying models

### Baseline algorithm

In [17]:
# We will take LogisticRegression as a simple algorithm to establish a baseline


### Other algorithms

In [18]:
# Flow: 
#   train model with a set of hyperparameters
#   Obtain score and iterate


# Run model on test dataset