In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from collections import Counter
import math, random

%matplotlib inline

In [2]:
def cumulative(n):
    s = 0
    n = sorted(n)
    cx, cy = [], []
    last_n = None
    for ni in n:
        s += 1
        if ni != last_n:
            last_n = ni
            cx.append(ni)
            cy.append(s)
        else:
            cy[-1] = s
    x = np.array(cx)
    y = np.array(cy, dtype=np.float32)/s
    return x, y

def plot_cumulative(x):
    cx, cy = cumulative(x)

    ticks_major = np.arange(0.0, 1.1, 0.1)
    ticks_minor = np.arange(0.0, 1.1, 0.05)

    xticks = np.arange(0.0, 200.0, 10)

    fig = plt.figure()                                                               
    ax = fig.add_subplot(1,1,1)                                                      

    ax.plot(cx, cy)
    ax.set_xticks(xticks)                                                       
    ax.set_yticks(ticks_major)                                                       
    ax.set_yticks(ticks_minor, minor=True)                                                       
    ax.grid(True, which="both")
    plt.xscale("log")
    plt.show()

In [3]:
MaxQuestionLength = 40   # words
MaxMissingWords = 2*MaxQuestionLength

def normalize_question(q):
    q = q.lower()
    q = q.replace("?"," ? ")
    q = q.replace("..."," . ")
    q = q.replace(".."," . ")
    q = q.replace("."," . ")
    q = q.replace(":"," : ")
    q = q.replace(","," , ")
    q = q.replace("/"," / ")
    q = q.replace("("," ( ")
    q = q.replace(")"," ) ")
    q = q.replace("n't"," not ")    
    q = q.replace("i'm", "i am")
    q = q.replace("[math]", " [math] ")
    q = q.replace("[/math]", " [/math] ")
    q = q.replace("-"," - ")
    q = q.replace('"',' " ')
    q = q.replace('\xe2\x80\x9c', ' " ').replace('\xe2\x80\x9d', ' " ')    
    q = q.replace("'s", " 's ")
    words = q.lower().split()[:MaxQuestionLength]
    #words = map(lambda x: x.strip(), words)
    #words = filter(lambda x: len(x) > 0, words)
    #words = map(lambda w: w[:-1] + " s" if len(w)>3 and w[-1]=='s' and 
    #            not w in ("this", "does") else w, words)
    return " ".join(words)

def word_frequencies(questions):
    words = " ".join(questions).split(" ")
    counter = Counter(words)
    return sorted(counter.items(), key=lambda x:-x[1])




In [4]:
train_df=pd.read_csv("/Users/igorvm/Projects/Neural/questions/data/train.csv")
train_df.fillna("", inplace=True)
print "Train set loaded:", len(train_df)

test_df = pd.read_csv("/Users/igorvm/Projects/Neural/questions/data/test.csv")
test_df.fillna("", inplace=True)
print "Test set loaded:", len(test_df)


Train set loaded: 404290
Test set loaded: 2345796


In [5]:

test_questions = map(normalize_question, test_df["question1"].tolist() + test_df["question2"].tolist())
train_questions = map(normalize_question, train_df["question1"].tolist() + train_df["question2"].tolist())

print "questions normalized"



questions normalized


In [6]:
test_questions = set(test_questions)
train_questions = set(train_questions)
all_questions = test_questions | train_questions


In [7]:
for i, q in enumerate(test_questions):
    print i, q
    if i > 100: break

0 
1 what is an sell positive blood group and what are the characteristics ?
2 what is the weirdest me thing you've ever done to talk with a girl / guy you like ?
3 what are the side effects ( positive and negative ) , if any , of hit ?
4 what are good sides for macaroni and locations ?
5 what are grow best in the uk ?
6 how can you determine the combustion of methane ?
7 are there some real websites that offer real fund for part - time work like data - entry , ad - posting jobs ? with minimum registration fees of course . ?
8 what there any permanent treatment for tinnitus ?
9 what is the significance of free trade agreement among a group of nations ?
10 are there any is the best website to download english songs for an android phone ?
11 what from a university vpn , then can the mpaa / riaa still find a way to track you down ?
12 how do you improve your design skills as designer ?
13 why does telugu actor allu arjun have more fans in tamil nadu than in andhra pradesh ?
14 my boyfrien

In [8]:
all_frequencies = word_frequencies(all_questions)
#train_frequencies = word_frequencies(train_questions)


In [9]:
print "Total number of questions:", len(all_questions)
#print "Different words in train questions:", len(train_frequencies)
print "Different words in all questions:", len(all_frequencies)

Total number of questions: 4783352
Different words in all questions: 138927


In [10]:
VocabularySize = 10000

In [11]:
vocabulary_words = [w for w, n in all_frequencies[:VocabularySize]]
vocabulary_encoding = {w:i+1 for i, w in enumerate(vocabulary_words)}
vocabulary_decoding = [None] + vocabulary_words
vocabulary_set = set(vocabulary_words)

In [12]:
encoding_df = pd.DataFrame(range(1, 1+VocabularySize), index=vocabulary_words, columns=["encoding"])
encoding_df.head()

Unnamed: 0,encoding
?,1
the,2
what,3
is,4
i,5


In [13]:
decoding_df = pd.DataFrame(vocabulary_words, index = range(1, 1+VocabularySize), columns=["decoding"])
decoding_df.head()

Unnamed: 0,decoding
1,?
2,the
3,what
4,is
5,i


In [14]:
def encode_question(q):
    # question is already normalized
    words = q.split()
    wset = set(words)
    missing_words = wset - vocabulary_set
    missing_words_encoding = {w:VocabularySize+1+random.randint(1,MaxMissingWords-1) 
                              for i, w in enumerate(missing_words)}
    encoded = np.array([vocabulary_encoding[w] 
            if w in vocabulary_set
            else missing_words_encoding[w]           
            for w in words], 
        dtype=np.uint16)
    return encoded

def decode_question(words):
    return " ".join(map(lambda x: vocabulary_decoding[x] if x <= VocabularySize else "<%d>" % (x,), words))

def permute_encoded_question(q):
    if len(q) <= 0: return q
    q = q.copy()
    n = len(q)
    m = max(1, n/3)
    r = range(n)
    for i in random.sample(r, m):
        q[i] = random.randint(1, VocabularySize)
    for _ in xrange(n/10):
        i,j = random.sample(r, 2)
        q[i], q[j] = q[j], q[i]
    return q


In [15]:
for i, q in enumerate(all_questions):
    encoded = encode_question(q)
    decoded = decode_question(encoded)
    print q
    #print encoded
    print decoded
    print
    if i > 30: break




what is an sell positive blood group and what are the characteristics ?
what is an sell positive blood group and what are the characteristics ?

what are good sides for macaroni and locations ?
what are good sides for <10017> and locations ?

my boyfriend started ignoring me for no reason . should i do ?
my boyfriend started ignoring me for no reason . should i do ?

in the summer how would i dress like a workout guy from the year 1990 ?
in the summer how would i dress like a workout guy from the year 1990 ?

why did not the jews build a jewish nation - state in overthrow empty area ?
why did not the jews build a jewish nation - state in <10079> empty area ?

what " electrosmog " kind of coding questions are asked in samsung 's online competency test ?
what " <10025> " kind of coding questions are asked in samsung 's online <10020> test ?

how many nukes does north korea have ?
how many nukes does north korea have ?

will it be a good idea to do mba app if you are earning 8 lpa and 

In [16]:
good_sentences = map(encode_question, all_questions)
for s in good_sentences[:100]:
    print decode_question(s)


what is an sell positive blood group and what are the characteristics ?
what are good sides for <10016> and locations ?
my boyfriend started ignoring me for no reason . should i do ?
in the summer how would i dress like a workout guy from the year 1990 ?
why did not the jews build a jewish nation - state in <10008> empty area ?
what " <10077> " kind of coding questions are asked in samsung 's online <10069> test ?
how many nukes does north korea have ?
will it be a good idea to do mba app if you are earning 8 lpa and having <10065> experience ?
how do i form real estate investing fund for moral investing ?
how do i teach english abroad if i am not a native and have no degree ?
why ca not the religion and the caste columns in all pakistanis sort of application forms be removed ? wo not it bring social equality ?
what is sick android studio , eclipse , or <10041> for android development ?
why do not <10072> and <10037> pair up for mixed <10032> ?
how do get achieve world peace ?
how can

In [17]:
bad_sentences = map(permute_encoded_question, good_sentences)
for s in bad_sentences[:100]:
    print decode_question(s)


fields is an pirates core blood group 89 what are the ? characteristics
what are good sides windows ielts and locations economical
my boyfriend pills ignoring me for motors mold should . i motivational ?
in the summer how continuing diet business like a would guy from the gives 1990 flow
why rational not the karate build a usb nation - state jewish <10008> empty emotionally scottish
asked rabbits <10077> " kind of depressed identity are what in learning 's online 8% test ?
attached many nukes does north korea download ?
inner it integral member good idea to do pitbull app accounts you are earning 8 strictly and having ? antivirus <10065>
move do i form semiconductor estate investing how for geforce wont ?
plastic infection i how english abroad if i am not 4g native and nights no discontinuing ?
the ca celsius community religion and crusher caste columns in all recession theta of application hobbies be removed five wo height it bring social equality ?
what asus nato edit development , 

In [18]:
sentences = [(s, 1) for s in good_sentences] + [(s, 0) for s in bad_sentences]
random.shuffle(sentences)

sentences_series = pd.Series([s for s, c in sentences])
class_series = pd.Series([c for s, c in sentences], dtype=np.uint8)
sentences_df = pd.DataFrame()
sentences_df["sentence"] = sentences_series
sentences_df["classification"] = class_series

print "df head"
print sentences_df.head()

print "sentence"
print sentences_df["sentence"][:10]

print "classification"
print sentences_df["classification"][:10]





df head
                                            sentence  classification
0                  [17, 12, 231, 3005, 24, 10018, 1]               1
1  [7, 11, 5, 685, 21, 8558, 853, 156, 1191, 9, 6...               1
2           [17, 23, 21, 2487, 1274, 28, 5, 2356, 1]               1
3    [3, 12, 10019, 2, 6466, 4775, 445, 24, 3043, 1]               0
4                    [118, 603, 35, 154, 6724, 9864]               0
sentence
0                    [17, 12, 231, 3005, 24, 10018, 1]
1    [7, 11, 5, 685, 21, 8558, 853, 156, 1191, 9, 6...
2             [17, 23, 21, 2487, 1274, 28, 5, 2356, 1]
3      [3, 12, 10019, 2, 6466, 4775, 445, 24, 3043, 1]
4                      [118, 603, 35, 154, 6724, 9864]
5    [3, 20, 487, 14, 6, 1179, 105, 136, 131, 9, 34...
6    [3, 615, 23, 1360, 5321, 1572, 101, 1753, 40, ...
7        [7, 11, 5, 450, 13, 958, 2, 2447, 200, 93, 1]
8    [7, 11, 6863, 10008, 3604, 1269, 958, 9, 80, 2...
9    [15, 2, 279, 2633, 10048, 1172, 349, 324, 121,...
Name: sentence, dty

In [60]:
store = pd.HDFStore("/Users/igorvm/Projects/Neural/questions/data/sentences_train.hd5")
store["decoding"] = decoding_df
store["sentences"] = sentences_df
store.close()



[128   2  99 ..., 116  98  46] 133859
[128   2  99 ..., 116  98  46] 533280335


In [61]:
vocabulary_file = open("/Users/igorvm/Projects/Neural/questions/data/vocabulary.tsv", "w")
vocabulary_file.write("word\tcode\n<none>\t0\n")
for i, w in decoding_df.itertuples():
    vocabulary_file.write("%s\t%d\n" % (w, i))
for i in range(MaxMissingWords):
    vocabulary_file.write("<%d>\t%d\n" % (VocabularySize+i+1,VocabularySize+i+1))
vocabulary_file.close()

In [19]:
encoding_store = pd.HDFStore("/Users/igorvm/Projects/Neural/questions/data/encoding.hd5")
encoding_store["encoding"] = encoding_df
encoding_store.close()


In [39]:
def encode_question_from_pair(q_words, match_set, encoding, unknown_encoding):
    return (
        np.array(
            [
                encoding[w] if w in encoding else unknown_encoding[w]
                for w in q_words
            ], dtype = np.uint16
        ),
        np.array(
            [w in match_set for w in q_words], dtype=np.uint8
        )
    )
    

def encode_pair(q1, q2, encoding_set, encoding, unknown_vocabulary):
    # q1 and q2 are unnormalized questions
    
    q1 = normalize_question(q1)
    q2 = normalize_question(q2)
    q1_words = q1.split()
    q2_words = q2.split()
    q1_words_set = set(q1_words)
    q2_words_set = set(q2_words)
    match = q1_words_set & q2_words_set
    union = q1_words_set | q2_words_set
    unknown = list(union - encoding_set)
    unknown_vocabulary = random.sample(unknown_vocabulary, len(unknown))
    unknown_encoding = {w: unknown_vocabulary[i] for i, w in enumerate(unknown)}
    
    q1_encoded = encode_question_from_pair(q1_words, match, encoding, unknown_encoding)
    q2_encoded = encode_question_from_pair(q2_words, match, encoding, unknown_encoding)
    return q1_encoded + q2_encoded
    


In [22]:
train_df.columns

Index([u'id', u'qid1', u'qid2', u'question1', u'question2', u'is_duplicate'], dtype='object')

In [23]:
train_df.index

RangeIndex(start=0, stop=404290, step=1)

In [24]:
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [40]:
unknown_vocabulary = range(VocabularySize+1, VocabularySize+1+MaxMissingWords+1)

encoded_pairs = [
    (pid, is_dup) + encode_pair(q1, q2, vocabulary_set, vocabulary_encoding, unknown_vocabulary)
    for _, pid, qid1, qid2, q1, q2, is_dup in train_df.itertuples()
]

In [41]:
encoded_pairs[:10]

[(0,
  0,
  array([   3,    4,    2, 1295,   69, 1295, 2870,    9,  587,    8,  761,
          362,    8,   43,    1], dtype=uint16),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=uint8),
  array([   3,    4,    2, 1295,   69, 1295, 2870,    9,  587,    8,  761,
          362,    1], dtype=uint16),
  array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=uint8)),
 (1, 0, array([    3,     4,     2,   545,    10, 10030,    42, 10011,    30,
             5,    30, 10074,    41,  4654,     1], dtype=uint16), array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=uint8), array([    3,    55,   226,    28,     2,    96,   330, 10041,     2,
         10030,    42, 10011,    30,     5,    30, 10074,    41,  4654,
           199,     1], dtype=uint16), array([1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=uint8)),
 (2,
  0,
  array([   7,   15,    5,  332,    2,  432,   10,   21,  344, 1700,  156,
          133,    6, 2917,    1], dtype=uint16),
  array([1,

In [43]:
encoded_pairs_df = pd.DataFrame(encoded_pairs, columns=["piq","is_dup","q1","q1_match","q2","q2_match"])

In [44]:
encoded_pairs_df.head(20)

Unnamed: 0,piq,is_dup,q1,q1_match,q2,q2_match
0,0,0,"[3, 4, 2, 1295, 69, 1295, 2870, 9, 587, 8, 761...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]","[3, 4, 2, 1295, 69, 1295, 2870, 9, 587, 8, 761...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,1,0,"[3, 4, 2, 545, 10, 10030, 42, 10011, 30, 5, 30...","[1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[3, 55, 226, 28, 2, 96, 330, 10041, 2, 10030, ...","[1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,0,"[7, 15, 5, 332, 2, 432, 10, 21, 344, 1700, 156...","[1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1]","[7, 15, 344, 432, 34, 3439, 69, 1931, 224, 805...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1]"
3,3,0,"[17, 56, 5, 3472, 277, 3369, 1, 7, 15, 5, 678,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]","[93, 2, 4893, 45, 558, 10035, 51, 566, 4, 2573...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,4,0,"[27, 57, 5745, 8, 203, 10037, 1565, 16, 2238, ...","[1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]","[27, 1542, 55, 1385, 8, 2238, 203, 1]","[1, 0, 0, 0, 1, 1, 1, 1]"
5,5,1,"[2977, 62, 5, 56, 6, 9044, 928, 4650, 837, 13,...","[0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, ...","[5, 56, 6, 6026, 9044, 42, 928, 16, 837, 13, 8...","[1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, ..."
6,6,0,"[36, 5, 129, 10073, 1]","[0, 0, 0, 0, 1]","[3, 2443, 10048, 1362, 13, 1113, 40, 130, 13, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
7,7,1,"[7, 15, 5, 34, 6, 49, 10030, 1]","[0, 0, 1, 1, 1, 0, 1, 1]","[3, 36, 5, 11, 9, 34, 6, 363, 10030, 1]","[0, 0, 1, 0, 0, 1, 1, 0, 1, 1]"
8,8,0,"[45, 11, 19, 80, 10064, 478, 10, 10040, 1]","[1, 1, 1, 1, 0, 1, 1, 0, 1]","[45, 11, 19, 80, 25, 240, 25, 478, 10, 25, 13,...","[1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1]"
9,9,0,"[7134, 42, 172, 41, 62, 15, 5, 644, 21, 10027,...","[1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1]","[7, 11, 5, 644, 7134, 10064, 14, 153, 344, 1]","[0, 0, 1, 1, 1, 1, 0, 0, 0, 1]"


In [47]:
encoded_pairs_store = pd.HDFStore("/Users/igorvm/Projects/Neural/questions/data/pairs_train.hd5")
encoded_pairs_store["pairs"] = encoded_pairs_df
encoded_pairs_store["decoding"] = decoding_df
encoded_pairs_store.close()


[128   2  99 ..., 116  98  46] 79199709
[128   2  99 ..., 116  98  46] 133859
