# Quora Question Pairs 
Can you identify question pairs that have the same intent?
<br> https://www.kaggle.com/c/quora-question-pairs
<br> The goal of this project is to use natural language processing tools and advanced techniques to classify whether 
question pairs are duplicates or not. 

In [1]:
# Load the necessary packages
import os
import pandas as pd
import logging
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import wordnet
import numpy
import nltk 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Check directory
# Parenthesised form prints the same text on Python 2 and also parses on
# Python 3, matching the print calls used in the rest of this file.
print (os.getcwd())

/Users/hefei/Documents/Kaggle/Quora/Code/old


In [4]:
# Change working directory
# The data files are opened below with paths relative to this directory.
DATA_DIR = '/Users/hefei/Documents/Kaggle/Quora/data'
os.chdir(DATA_DIR)

In [5]:
# Load data
# Read the train and test CSV files as UTF-8 and replace every missing
# value with an empty string.
print ("Load data...")
dftrain = pd.read_csv("./train.csv", encoding='utf-8')
dftrain = dftrain.fillna("")
dftest = pd.read_csv("./test.csv", encoding='utf-8')
dftest = dftest.fillna("")
print ("Done data loading")

Load data...
Done data loading


In [6]:
# Lemmatizing the data

# The lemmatizing code is taken from the second-place solution of the
# CrowdFlower Kaggle project.
# The authors are: Mikhail Trofimov, Stanislav Semenov, Dmitry Altukhov
#########################
###  Lemmatizing part ###
#########################

logging.info('Lemmatizing')
# Module-level lemmatizer and tokenizer instances, created once and reused
# by text_preprocessor for every question.
lemmer = wordnet.WordNetLemmatizer()
toker = TreebankWordTokenizer()

try:
    _text_type = unicode  # Python 2: coerce to unicode text, as before
except NameError:
    _text_type = str  # Python 3: str is already unicode text


def text_preprocessor(x):
    """Clean and lemmatize a single string.

    The value is converted to text and lower-cased, 'blu-ray' and 'wi-fi'
    are collapsed to 'bluray' and 'wifi', '/' and '-' are replaced by
    spaces and double quotes are removed. The result is tokenized with the
    module-level ``toker`` and every token is lemmatized with the
    module-level ``lemmer``.

    Returns the lemmatized tokens joined by single spaces.
    """
    text = _text_type(x).lower()
    # Merge the hyphenated product terms first; the generic '-' -> ' '
    # replacement below would otherwise split them into two words.
    text = text.replace('blu-ray', 'bluray').replace('wi-fi', 'wifi')
    text = text.replace('/', ' ').replace('-', ' ').replace('"', '')
    return " ".join(lemmer.lemmatize(token) for token in toker.tokenize(text))

# Lemmatize both question columns of the train set, then of the test set.
# Each progress message is printed right after the column it names has been
# processed, so the printed text always matches the work actually done.
for frame, label in ((dftrain, "train"), (dftest, "test")):
    for column in ("question1", "question2"):
        frame[column] = frame[column].apply(text_preprocessor)
        print ("done %s preprocess %s" % (column, label))

# Target labels for the training set
y_train = dftrain['is_duplicate']

done question1 preprocess train
done question2 preprocess train
done question1 preprocess test
done question2 preprocess test


In [7]:
# Feature selection
# Four features are extracted for every question pair:
#   column 0: number of question1 words that also occur in question2,
#             divided by (number of words in question1 + 1)
#   column 1: the same count divided by (number of words in question2 + 1)
#   column 2: absolute difference between the word counts of the two questions
#   column 3: cosine similarity between the TF-IDF vectors of the two questions


def _pair_features(question1, question2):
    """Return the four features (in column order) for one question pair."""
    words1 = [word.lower() for word in question1.split()]
    words2 = [word.lower() for word in question2.split()]
    vocabulary2 = set(words2)
    # Repeated words in question1 are counted every time they occur.
    n_common = sum(1 for word in words1 if word in vocabulary2)
    # The +1 in the denominators keeps the ratios defined for an empty question.
    share1 = n_common * 1.0 / (len(words1) + 1)
    share2 = n_common * 1.0 / (len(words2) + 1)
    length_gap = abs(len(words2) - len(words1))

    # The TF-IDF model is fitted on just the two questions of this pair.
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform((question1, question2))
    except ValueError:
        # Raised when neither question yields a token (empty vocabulary),
        # e.g. both questions are empty; treat the pair as having no
        # similarity instead of aborting the whole extraction.
        similarity = 0.0
    else:
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)[0][1]
    return share1, share2, length_gap, similarity


def _extract_features(frame):
    """Build the (number of rows, m) feature matrix for a question-pair frame."""
    features = numpy.zeros((frame.shape[0], m))
    # Iterate by position so every row of the frame is visited exactly once,
    # whatever its size or index labels.
    pairs = zip(frame['question1'], frame['question2'])
    for row, (question1, question2) in enumerate(pairs):
        features[row] = _pair_features(question1, question2)
    return features


N = dftrain.shape[0]
Ntest = dftest.shape[0]
m = 4  # number of features planned to extract

print ("start feature extraction for train")
X_train = _extract_features(dftrain)
print ("done feature extraction for train")

# The test matrix is sized and filled from dftest itself (Ntest rows), not
# from the number of training rows.
X_test = _extract_features(dftest)
print ("done feature extraction test")

start feature extraction for train
done feature extraction for train
done feature extraction test


In [8]:
# Save X_train and X_test for future use
import pandas as pd

# Each feature matrix is wrapped in a DataFrame and written to its own CSV
# file in the current working directory, using to_csv's default options.
for features, filename in ((X_train, 'xtrain.csv'), (X_test, 'xtest.csv')):
    df = pd.DataFrame(features)
    df.to_csv(filename)

In [9]:
# Build models using X_train and predict using X_test
# stochastic gradient descent classification
from sklearn.linear_model import SGDClassifier

# Logistic-regression loss so that predict_proba is available.
# NOTE(review): recent scikit-learn releases name this loss "log_loss";
# "log" is kept here for the library version this notebook was written for.
clf = SGDClassifier(loss="log").fit(X_train, y_train)

# predict_proba returns one column per entry of clf.classes_. The submission
# needs the probability that a pair IS a duplicate (label 1), so select that
# column explicitly instead of column 0, which is the probability of label 0.
duplicate_column = list(clf.classes_).index(1)
y_predict_sgd = clf.predict_proba(X_test)[:, duplicate_column]
print ("done with sgd")

done with sgd


In [11]:
# Save output
import pandas as pd

# Start from the sample submission, overwrite its is_duplicate column with
# the SGD predictions and write the result without the row index.
ysub = pd.read_csv("./sample_submission.csv")
ysub = ysub.assign(is_duplicate=y_predict_sgd)
submission = pd.DataFrame(ysub)
submission.to_csv("./final_solution_sgd.csv", index=False)