In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from collections import Counter
import math, random

%matplotlib inline

In [75]:
MaxQuestionLength = 40   # words
MaxMissingWords = 2*MaxQuestionLength

def normalize_question(q):
    q = q.lower()
    q = q.replace("?"," ? ")
    q = q.replace("..."," . ")
    q = q.replace(".."," . ")
    q = q.replace("."," . ")
    q = q.replace(":"," : ")
    q = q.replace(","," , ")
    q = q.replace("[math]", " [math] ")
    q = q.replace("/"," / ")
    q = q.replace("[ / math]", " [/math] ")
    q = q.replace("{"," { ")
    q = q.replace("}"," } ")
    q = q.replace("("," ( ")
    q = q.replace(")"," ) ")
    q = q.replace("^"," ^ ")
    q = q.replace("n't"," not ")    
    q = q.replace("i'm", "i am")
    q = q.replace("-"," - ")
    q = q.replace('"',' " ')
    q = q.replace('\xe2\x80\x9c', ' " ').replace('\xe2\x80\x9d', ' " ')    
    q = q.replace("'s", " 's ")
    words = q.lower().split()[:MaxQuestionLength]
    #words = map(lambda x: x.strip(), words)
    #words = filter(lambda x: len(x) > 0, words)
    #words = map(lambda w: w[:-1] + " s" if len(w)>3 and w[-1]=='s' and 
    #            not w in ("this", "does") else w, words)
    return " ".join(words)


In [None]:
def vectorize_pair(q1, q2):
    # q1 and q2 are unnormalized questions
    
    q1 = normalize_question(q1)
    q2 = normalize_question(q2)
    q1_words = q1.split()
    q2_words = q2.split()
    q1_words_set = set(q1_words)
    q2_words_set = set(q2_words)
    union = q1_words_set | q2_words_set
    encoding = { w: "<%d>" % (i,) for i, w in enumerate(union) }
    q1_encoded = " ".join([encoding[w] for w in q1_words])
    q2_encoded = " ".join([encoding[w] for w in q2_words])
    return q1_encoded, q2_encoded

def tfidSimilarity(tuples):
    # tuples are expected to be (q1, q2)
    qset = set((q1 for q1, q2 in tuples)) | set((q2 for q1, q2 in tuples))
    qlist = list(qset)
    qdict = { q:i for i, q in enumerate(qlist) }

    vectorizer = TfidfVectorizer(min_df=2, stop_words = 'english',\
        strip_accents = 'unicode', lowercase=True, ngram_range=(1,2),\
        norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

    X = vectorizer.fit_transform(qlist)
    return np.array(
        
    )
        
    ]
    

In [76]:
normalize_question("Find the remainder when [math]23^{24}[/math] is divided by 24,23?")

'find the remainder when [math] 23 ^ { 24 } [/math] is divided by 24 , 23 ?'

In [4]:
vectorizer = TfidfVectorizer(min_df=2, stop_words = 'english',\
strip_accents = 'unicode', lowercase=True, ngram_range=(1,2),\
norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)


In [11]:
train_df=pd.read_csv("/Users/ivm/ivm/Projects/Neural/kaggle/questions/data/train.csv")
train_df.fillna("", inplace=True)
print "Train set loaded:", len(train_df)

questions_indexed = { qid:normalize_question(q) for _, qid, q in train_df[["qid1","question1"]].itertuples() }
questions_indexed.update( { qid:normalize_question(q) for _, qid, q in train_df[["qid2","question2"]].itertuples() } )

questions_normalized = sorted(questions_indexed.items())

questions_normalized_df = pd.DataFrame([q for i, q in questions_normalized], columns=["question"], 
                                       index=[i for i, q in questions_normalized])



Train set loaded: 404290


In [40]:
train_questions = questions_normalized_df["question"]

def word_frequencies(questions):
    words = " ".join(questions).split(" ")
    counter = Counter(words)
    return sorted(counter.items(), key=lambda x:-x[1])

train_words_frequencies = word_frequencies(train_questions)
print "total train words:", len(train_words_frequencies)

 total train words: 96140


In [44]:
NMostFrequent = 10000
most_frequent_words = [w for w, n in train_words_frequencies[:NMostFrequent]]
most_frequent_words_set = set(most_frequent_words)

InfrequentWords = ["<%d>" % (NMostFrequent+k+1,) for k in range(MaxMissingWords)]

In [60]:
def reduce_question_from_pair(q_words, frequent_set, infrequent_encoding):
    return [w if w in frequent_set else infrequent_encoding[w] for w in q_words]
    

def reduce_pair(q1, q2, frequent_set, infrequent_vocabulary):
    # q1 and q2 are unnormalized questions
    
    q1 = normalize_question(q1)
    q2 = normalize_question(q2)
    q1_words = q1.split()
    q2_words = q2.split()
    q1_words_set = set(q1_words)
    q2_words_set = set(q2_words)
    union = q1_words_set | q2_words_set
    #print "union:", union
    infrequent = union - frequent_set
    #print "infrequent:", infrequent
    unknown_vocabulary = random.sample(infrequent_vocabulary, len(infrequent))
    unknown_encoding = {w: unknown_vocabulary[i] for i, w in enumerate(infrequent)}
    
    q1_encoded = reduce_question_from_pair(q1_words, frequent_set, unknown_encoding)
    q2_encoded = reduce_question_from_pair(q2_words, frequent_set, unknown_encoding)
    return " ".join(q1_encoded), " ".join(q2_encoded)


In [61]:
reduced_pairs_df = pd.DataFrame([
    reduce_pair(q1, q2, most_frequent_words_set, InfrequentWords)+(qid1, qid2, is_dup)
        for _, q1, q2, qid1, qid2, is_dup in train_df[["question1","question2","qid1","qid2","is_duplicate"]].itertuples()
], columns=["question1", "question2","qid1","qid2","is_duplicate"])

In [62]:
reduced_pairs_df.head(10)

Unnamed: 0,question1,question2,qid1,qid2,is_duplicate
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,1,2,0
1,what is the story of <10015> ( <10053> - i - <...,what would happen if the indian government <10...,3,4,0
2,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,5,6,0
3,why am i mentally very lonely ? how can i solv...,find the remainder when [math] <10005> / math]...,7,8,0
4,"which one dissolve in water <10064> sugar , sa...",which fish would survive in salt water ?,9,10,0
5,astrology : i am a capricorn sun cap moon and ...,"i am a triple capricorn ( sun , moon and ascen...",11,12,1
6,should i buy <10033> ?,what keeps <10061> active and far from phone a...,13,14,0
7,how can i be a good <10027> ?,what should i do to be a great <10027> ?,15,16,1
8,when do you use <10035> instead of <10059> ?,"when do you use "" & "" instead of "" and "" ?",17,18,0
9,motorola ( company ) : can i hack my charter <...,how do i hack motorola <10025> for free intern...,19,20,0


In [63]:
reduced_questions_indexed = { qid:q for _, qid, q in reduced_pairs_df[["qid1","question1"]].itertuples() }
reduced_questions_indexed.update( { qid:q for _, qid, q in reduced_pairs_df[["qid2","question2"]].itertuples() } )

reduced_questions_normalized = sorted(reduced_questions_indexed.items())

reduced_questions_normalized_df = pd.DataFrame([q for i, q in reduced_questions_normalized], columns=["question"], 
                                       index=[i for i, q in reduced_questions_normalized])



In [64]:
X_reduced = vectorizer.fit_transform(reduced_questions_normalized_df["question"])

In [66]:
reduced_distances_and_dups = [
    (X_reduced[qid1-1].toarray().dot(X_reduced[qid2-1].toarray().T)[0][0], is_dup)
    for _, qid1, qid2, is_dup in train_df[["qid1","qid2","is_duplicate"]].itertuples()
]

In [67]:
reduced_distances_and_dups = np.array(reduced_distances_and_dups)
reduced_loss = -np.mean(np.log(reduced_distances_and_dups[:,0]+0.00001)*reduced_distances_and_dups[:,1])
print loss

0.299349403229


In [13]:
X = vectorizer.fit_transform(questions_normalized_df["question"])


In [21]:
X[0].toarray().dot(X[4].toarray().T)

array([[ 0.]])

In [23]:
train_df

Index([u'id', u'qid1', u'qid2', u'question1', u'question2', u'is_duplicate'], dtype='object')

In [24]:
distances_and_dups = [
    (X[qid1-1].toarray().dot(X[qid2-1].toarray().T)[0][0], is_dup)
    for _, qid1, qid2, is_dup in train_df[["qid1","qid2","is_duplicate"]].itertuples()
]

In [32]:
distances_df = pd.DataFrame(distances_and_dups, columns=["distance", "is_dup"])
distances_df["rounded_distance"] = map(round, distances_df["distance"])
distances_df.head(20)

Unnamed: 0,distance,is_dup,rounded_distance
0,0.94108,0,1.0
1,0.814216,0,1.0
2,0.121902,0,0.0
3,0.0,0,0.0
4,0.119912,0,0.0
5,0.579761,1,1.0
6,0.0,0,0.0
7,0.824822,1,1.0
8,1.0,0,1.0
9,0.659802,0,1.0


In [38]:
loss = -np.mean(np.log(distances_df["distance"]+0.00001)*distances_df["is_dup"])
print loss

0.299349403229
