# Importing and inspecting the data

We will start by importing the dataset.

In [413]:
import pandas as pd
import numpy as np
# Set environment
# Option names are spelled out in full: pandas resolves short names as regex
# patterns and raises OptionError when a pattern matches more than one option
# (a bare 'precision' is ambiguous on recent pandas releases).
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.precision', 3)
# Read data; both question columns are forced to str so they are never parsed as numbers
df_o=pd.read_csv('./data/train.csv',dtype={'question1':str,'question2':str})


#Inspect data
# Note: only the last expression of a cell is rendered, so the head() preview
# below is evaluated but not shown; the class balance (in percent) is displayed.
df_o.head()
df_o['is_duplicate'].value_counts(normalize=True)*100

0    63.08
1    36.92
Name: is_duplicate, dtype: float64

In [97]:
# Show the dtype pandas assigned to each column of the raw frame
df_o.dtypes

id               int64
qid1             int64
qid2             int64
question1       object
question2       object
is_duplicate     int64
dtype: object

In [85]:
# Remove exact duplicate rows first, then any row that still has a missing value.
# The cleaned frame is rebound to the same name used by the following cells.
df_o = df_o.drop_duplicates().dropna()

In [86]:
# (rows, columns) of the frame after duplicate and missing-value removal
df_o.shape

(404288, 6)

## Split data

In [62]:
# Split dataset in training and validation dataset, 70/30
# Test dataset provided will not be used until the end, so it can be used for final validation
# `sklearn.cross_validation` was removed from scikit-learn; `model_selection`
# is its replacement and exposes the same `train_test_split`.
from sklearn.model_selection import train_test_split
TEST_SIZE = 0.3
# Features are every column except the label. The previous positional slice
# `0:4` stopped before `question2`, which later cells read from X_train, and
# the `.ix` indexer it relied on no longer exists in pandas.
X_train, X_val, y_train, y_val = train_test_split(
    df_o.drop(columns='is_duplicate'),
    df_o['is_duplicate'],
    test_size=TEST_SIZE,
    random_state=42,
)


# Generate features

Some ideas to work on:
1. Question length in words
2. Shared words
3. Use tf-idf to identify specific words on each pairs
4. Is question clause the same?
5. Syntax similarity: POS tags?
6. Shared synonyms (all words, or just the key words: verb, object)
7. Similarity of key words: WordNet/synset similarity score

In [64]:
# Clean up functions
# Remove stopwords
from nltk.corpus import stopwords
from nltk import tokenize

from nltk.corpus import stopwords
import string

def clean_text(s):
    """Tokenize a string and return its significant words in lower case.

    Parameters
    ----------
    s : str
        Raw text to clean.

    Returns
    -------
    list of str
        The tokens of ``s`` in their original order, lower-cased, with
        English stop words and single punctuation characters removed.
    """
    # Build both lookup sets once per call; previously the stop-word list was
    # re-read from the corpus for every single token.
    stop_words = set(stopwords.words('english'))
    punct = set(string.punctuation)

    words = []
    for tok in tokenize.word_tokenize(s):
        word = tok.lower()
        # Compare the lower-cased form: the stop-word list is lower case, so
        # capitalised stop words such as "What" or "The" used to slip through.
        if word in stop_words or word in punct:
            continue
        words.append(word)
    return words
    


#### Feature: different length in words

In [21]:
def dif_len(l1,l2):
    """Return the absolute difference between the word counts of two strings.

    Words are whatever ``str.split()`` yields, i.e. whitespace-delimited runs.
    """
    n_words_1 = len(l1.split())
    n_words_2 = len(l2.split())
    return abs(n_words_1 - n_words_2)


#### Feature: Number of common words

#### Feature: Use tf-idf with cosine similarity
The approach is to "vectorize" the two questions using tf-idf and then compute the cosine similarity between the resulting vectors.
This feature produces a single number for each pair of questions that represents how close they are to each other.


In [459]:


# Function to determine the tf idf of a list of strings

from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf(l1, l2):
    """Compute a tf-idf cosine similarity for each pair of strings.

    Parameters
    ----------
    l1, l2 : iterable of str
        Parallel sequences; element ``i`` of ``l1`` is compared with element
        ``i`` of ``l2``. Extra elements of the longer sequence are ignored
        (``zip`` semantics).

    Returns
    -------
    list
        One similarity score per pair, in input order. A pair from which the
        vectorizer cannot extract any token is scored ``0.0``.
    """
    result=[]
    for pair in zip(l1,l2):
        # A fresh vectorizer is fitted on each pair, so the idf weights only
        # reflect the two strings being compared.
        tfidf_vect=TfidfVectorizer()
        try:
            tfidf_sparse=tfidf_vect.fit_transform(pair)
        except ValueError:
            # fit_transform raises ValueError when the pair produces an empty
            # vocabulary (e.g. both strings are only punctuation). Score the
            # pair as dissimilar instead of aborting the whole run.
            result.append(0.0)
            continue
        # Hand over the sparse matrix directly: `.todense()` produced an
        # np.matrix, which current scikit-learn no longer accepts.
        result.append(cos_sim(tfidf_sparse))
    return result

# function to identify the cosine similarity between 2 vectors
import sklearn.metrics.pairwise as metrics
def cos_sim(m):
    """Return the cosine similarity between the first two rows of ``m``.

    Parameters
    ----------
    m : 2-D array-like, np.matrix or scipy sparse matrix
        Must contain at least two rows; only rows 0 and 1 are used.

    Returns
    -------
    float
        The single entry of the 1x1 similarity matrix of row 0 against row 1.
    """
    # np.matrix input is rejected by current scikit-learn, so convert it to a
    # plain ndarray first.
    if isinstance(m, np.matrix):
        m = np.asarray(m)
    # Slice instead of indexing so each row stays 2-D (1 x n_features) whether
    # `m` is an ndarray or a sparse matrix; cosine_similarity requires 2-D input.
    return metrics.cosine_similarity(m[0:1], m[1:2])[0, 0]

In [339]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer(stop_words='english')
# Keep the document-term matrix sparse: converting it to a dense array
# allocates n_questions x vocabulary_size floats, which does not fit in memory
# for the full training set.
tfidf_list=tfidf_vect.fit_transform(X_train['question1'])

In [338]:
# Plain Python lists of the two training question columns, in row order
q1_list, q2_list = (
    X_train[column].tolist() for column in ('question1', 'question2')
)

In [462]:
# Labels and tf-idf similarity scores for the first 100 training pairs
Y_100 = y_train[:100]
cos_dist = tfidf(q1_list[:100], q2_list[:100])

In [464]:
# Pair each similarity score with its is_duplicate label for eyeballing
list(zip(cos_dist,Y_100))

[(0.23577033983032974, 0),
 (0.91381065092944613, 1),
 (0.20469348972795967, 0),
 (0.79586433687836444, 0),
 (0.29121941856368966, 1),
 (0.31169551191229922, 1),
 (0.57273935841961987, 1),
 (0.063722333236223103, 0),
 (0.6694188517266485, 1),
 (0.0, 0),
 (0.14211771769343498, 0),
 (0.22185010606887803, 0),
 (0.42113589132819579, 0),
 (0.89010872503406635, 0),
 (0.75489691266928305, 0),
 (0.10112251282501054, 0),
 (0.33609692727625745, 0),
 (0.18976728433844914, 0),
 (0.75489691266928305, 1),
 (0.0, 0),
 (0.88312820391945235, 0),
 (0.15906444589068597, 0),
 (0.33609692727625756, 1),
 (0.34861427265775857, 1),
 (0.20064995690540438, 1),
 (0.71681174144306237, 1),
 (0.30392422712517309, 0),
 (0.18516298920548105, 0),
 (0.26725230169548991, 1),
 (0.15064018498706505, 0),
 (0.88363513889950873, 1),
 (0.15186371361376427, 0),
 (0.27140359420048249, 0),
 (0.51509784183590801, 0),
 (0.50560555887396907, 1),
 (0.33609692727625745, 0),
 (0.93822324785867173, 1),
 (0.89553241507157266, 1),
 (0.56

#### Feature: Shared synonyms

#### Feature: Similarity of key words: noun and verbs

## Add feature columns to train dataset

### tf idf

In [132]:
# Look up question1 of the row whose index label is 1.
# `.ix` has been removed from pandas; `.loc` is the label-based equivalent.
X_train.loc[1,'question1']

'What is the story of Kohinoor (Koh-i-Noor) Diamond?'

# Applying models

### Baseline algorithm

In [17]:
# We will take LogisticRegression as a simple algorithm to establish a baseline


### Other algorithms

In [18]:
# Flow: 
#   train model with a set of hyperparameters
#   Obtain score and iterate


# Run model on test dataset