# Importing and inspecting the data

We will start by importing the dataset

In [89]:
import pandas as pd
import numpy as np
# Set environment
pd.set_option('max_colwidth', 100)
pd.set_option('precision', 3)
# Read data
df_o=pd.read_csv('./data/train.csv',dtype={'question1':str,'question2':str},index_col='id')

df_o['is_duplicate'].value_counts(normalize=True)*100

0    63.08
1    36.92
Name: is_duplicate, dtype: float64

In [90]:
df_o.shape

(404290, 5)

In [91]:
df_o.dtypes

qid1             int64
qid2             int64
question1       object
question2       object
is_duplicate     int64
dtype: object

In [92]:
df_o.drop_duplicates(inplace=True)
df_o.dropna(how='any',inplace=True)

In [132]:
df_o['question1']=df_o['question1'].astype('str')
df_o['question2']=df_o['question2'].astype('str')

In [93]:
df_o.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0


## Split data

In [134]:
# Split dataset in training and validation dataset, 70/30
# Test dataset provided will not be used until the end, so it can be used for final validation
from sklearn.cross_validation import train_test_split
TEST_SIZE = 0.3
X_train, X_val, y_train, y_val = train_test_split(df_o.loc[:,'qid1':'question2'], df_o.loc[:,'is_duplicate'], test_size=TEST_SIZE, random_state=42)
X_train.shape


(283001, 4)

# Generate features

Some ideas to work on:
1. Question lenght in words
2. Shared words
3. Use tf-idf to identify specific words on each pairs
4. Is question clause the same?
5. Syntax similarity: POS tags?
6. Shared synonims (all words or just the key words- verb, object)
7. Similarity of key words: wordnet/synnet similarity score

In [12]:
# Clean up functions
# Remove stopwords
from nltk.corpus import stopwords
from nltk import tokenize

from nltk.corpus import stopwords
import string

def clean_text(s,lower=False):
    """ Tokenize text and remove stop words and punctuation
        s: string to clean
        lower: Boolean if text should be converted to lower case
    """
    
    tok=tokenize.word_tokenize(s)
    if (lower):
        s_nostop=[w.lower() for w in tok if w not in stopwords.words('english')]
    else:
        s_nostop=[w for w in tok if w not in stopwords.words('english')]
    punct=set(string.punctuation)
    s_nopunct=[w for w in s_nostop if w not in punct]
    return s_nopunct
    


#### Feature: different length in words

In [126]:
def n_words(s):
    return(len(s.split()))

In [149]:
X_train['len1']=X_train['question1'].apply(n_words)

In [150]:
X_train['len2']=X_train['question2'].apply(n_words)

In [152]:
X_train['dif_len']=X_train['len1']-X_train['len2']

In [153]:
X_train['dif_len']=X_train['dif_len'].apply(abs)

In [156]:
X_train.drop('len1',1,inplace=True)
X_train.drop('len2',1,inplace=True)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,dif_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20128,37998,37999,"How is the working environment at SBI Life, Mumbai?",How stressful is work of SBI clerk?,2
185202,282774,282775,What kinds of questions should I expect on the IBM CAT/IPAT?,What kinds of questions should I expect on the IBM IPAT?,0
107096,176275,176276,"On WhatsApp, it says on the message info that the message has been read with 2 blue ticks and no...",My friend is abroad and I have sent him messages. There is one grey tick next to the messages I ...,15
27940,51849,51850,How do the holy scriptures of Hinduism compare and contrast to those of Taoism?,How do the holy scriptures of Hinduism compare and contrast to those of Italo-Roman paganism?,1
244713,357399,357400,Does beard transplantation really work?,Is beard transplantation worthy?,1


#### Feature: Number of common words

#### Feature: Use tf-idf with cosine similarity
The approach is to identify "vectorize" the strings using tf idf and then find the similarity between the two.
This feature will produce a number between the 2 sentences that will represent how close they are between each other


In [459]:


# Function to determine the tf idf of a list of strings

from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf(l1, l2):
    # 
    result=[]
    for X in zip(l1,l2):
        tfidf_matrix = []
        tfidf_vect=TfidfVectorizer()
        tfidf_sparse=tfidf_vect.fit_transform(X)
        tfidf_matrix=tfidf_sparse.todense()
        #print (tfidf_matrix,"*****")
        cosine_distance=cos_sim(tfidf_matrix)
        result.append(cosine_distance)
    return result

# function to identify the cosine similarity between 2 vectors
import sklearn.metrics.pairwise as metrics
def cos_sim(m):
    cos=metrics.cosine_similarity(m[0],m[1])[0,0]
    return cos

In [339]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer(stop_words='english')
tfidf_list=tfidf_vect.fit_transform(X_train['question1']).toarray()

In [338]:
q1_list=X_train['question1'].tolist()
q2_list=X_train['question2'].tolist()

In [482]:
tfidf_cos=tfidf(q1_list[:],q2_list[:])

In [483]:
type(tfidf_cos)

list

In [490]:
X_train.ix[:,'tfidf_cos']=pd.Series(tfidf_cos)

In [494]:
X_train

Unnamed: 0_level_0,qid1,qid2,question1,question2,tfidf_cos
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20128,37998,37999,"How is the working environment at SBI Life, Mumbai?",How stressful is work of SBI clerk?,0.143
185202,282774,282775,What kinds of questions should I expect on the IBM CAT/IPAT?,What kinds of questions should I expect on the IBM IPAT?,0.345
107096,176275,176276,"On WhatsApp, it says on the message info that the message has been read with 2 blue ticks and no...",My friend is abroad and I have sent him messages. There is one grey tick next to the messages I ...,0.777
27940,51849,51850,How do the holy scriptures of Hinduism compare and contrast to those of Taoism?,How do the holy scriptures of Hinduism compare and contrast to those of Italo-Roman paganism?,0.078
244713,357399,357400,Does beard transplantation really work?,Is beard transplantation worthy?,0.404
111382,34756,37759,My questions haven't changed. Why are they now being marked as needing improvement?,What should I do if my question is being marked instantly as needing improvement but I don't kno...,0.206
403957,86467,39685,Which phone should I buy under 15k?,Which phone is best to buy under 15k?,
101177,167668,167669,Should I choose MFC Delhi or make another attempt at CAT?,How many attempts did you make and what do you think was your accuracy (approx) in the CAT 14 to...,0.162
78010,133094,133095,Do we really need love?,Do people really need love?,0.372
57348,100786,100787,Frequently asked interview questions on strength of materials?,Where should we place the most important or striking words or ideas within a sentence?,0.202


In [161]:
X_trainainain['tfidf_cos'].fillna(0.0)

KeyError: 'tfidf_cos'

#### Feature: Shared synonyms

#### Feature: Ratio of common nouns (using Jaccard distance)

In [157]:
import nltk
def retrieve_word_pos_nltk(q,pos='N'):
    wh=[]
    tagged=nltk.pos_tag(nltk.word_tokenize(q))
    return [item[0] for item in tagged if item[1][0] == 'N']


In [190]:
def jaccard_dis(l1,l2):
    num=len(set(l1).intersection(l2))
    den=len(set(l1).union(l2))
    if den==0:
        return 0
    else:
        return(num/den)

In [173]:
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,dif_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20128,37998,37999,"How is the working environment at SBI Life, Mumbai?",How stressful is work of SBI clerk?,2
185202,282774,282775,What kinds of questions should I expect on the IBM CAT/IPAT?,What kinds of questions should I expect on the IBM IPAT?,0
107096,176275,176276,"On WhatsApp, it says on the message info that the message has been read with 2 blue ticks and no...",My friend is abroad and I have sent him messages. There is one grey tick next to the messages I ...,15
27940,51849,51850,How do the holy scriptures of Hinduism compare and contrast to those of Taoism?,How do the holy scriptures of Hinduism compare and contrast to those of Italo-Roman paganism?,1
244713,357399,357400,Does beard transplantation really work?,Is beard transplantation worthy?,1


In [183]:
X_train['nouns1']=X_train.loc[:,'question1'].apply(retrieve_word_pos_nltk)

In [187]:
X_train['nouns2']=X_train.loc[:,'question2'].apply(retrieve_word_pos_nltk)

In [188]:
q1_list=X_train['nouns1'].tolist()
q2_list=X_train['nouns2'].tolist()

In [206]:
dis=[]
for i in range (len(q1_list)):
    if i==185202:
        print(q1_list[i],q2_list[i])
    dis.append(jaccard_dis(q1_list[i],q2_list[i]))

['ice'] ['ice']


In [201]:
X_train.drop('ration_nouns',axis=1,inplace=True)

In [202]:
X_train['ratio_nouns']=pd.Series(dis)

In [203]:
jaccard_dis(X_train.loc[20128,'nouns1'],X_train.loc[20128,'nouns2'])

0.16666666666666666

In [204]:
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,dif_len,nouns1,nouns2,ratio_nouns
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20128,37998,37999,"How is the working environment at SBI Life, Mumbai?",How stressful is work of SBI clerk?,2,"[environment, SBI, Life, Mumbai]","[work, SBI, clerk]",0.125
185202,282774,282775,What kinds of questions should I expect on the IBM CAT/IPAT?,What kinds of questions should I expect on the IBM IPAT?,0,"[questions, IBM, CAT/IPAT]","[questions, IBM, IPAT]",1.0
107096,176275,176276,"On WhatsApp, it says on the message info that the message has been read with 2 blue ticks and no...",My friend is abroad and I have sent him messages. There is one grey tick next to the messages I ...,15,"[WhatsApp, message, info, message, ticks, time, message, grey, checks, time, mean]","[friend, messages, grey, messages, read, ticks, time, read, messages]",0.5
27940,51849,51850,How do the holy scriptures of Hinduism compare and contrast to those of Taoism?,How do the holy scriptures of Hinduism compare and contrast to those of Italo-Roman paganism?,1,"[holy, scriptures, Hinduism, compare, contrast, Taoism]","[holy, scriptures, Hinduism, compare, contrast, Italo-Roman, paganism]",0.0
244713,357399,357400,Does beard transplantation really work?,Is beard transplantation worthy?,1,"[Does, transplantation]","[transplantation, worthy]",1.0


## Add feature columns to train dataset

### tf idf

In [132]:
X_train.ix[:,'tfdid']=

'What is the story of Kohinoor (Koh-i-Noor) Diamond?'

# Applying models

### Baseline algorithm

In [17]:
# We will take LogisticRegression as a simple algorithm to establish a baseline


### Other algorithms

In [18]:
# Flow: 
#   train model with a set of hyperparameters
#   Obtain score and iterate


# Run model on test dataset