In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from collections import Counter
import math, random

%matplotlib inline

In [2]:
def cumulative(n):
    s = 0
    n = sorted(n)
    cx, cy = [], []
    last_n = None
    for ni in n:
        s += 1
        if ni != last_n:
            last_n = ni
            cx.append(ni)
            cy.append(s)
        else:
            cy[-1] = s
    x = np.array(cx)
    y = np.array(cy, dtype=np.float32)/s
    return x, y

def plot_cumulative(x):
    cx, cy = cumulative(x)

    ticks_major = np.arange(0.0, 1.1, 0.1)
    ticks_minor = np.arange(0.0, 1.1, 0.05)

    xticks = np.arange(0.0, 200.0, 10)

    fig = plt.figure()                                                               
    ax = fig.add_subplot(1,1,1)                                                      

    ax.plot(cx, cy)
    ax.set_xticks(xticks)                                                       
    ax.set_yticks(ticks_major)                                                       
    ax.set_yticks(ticks_minor, minor=True)                                                       
    ax.grid(True, which="both")
    plt.xscale("log")
    plt.show()

In [45]:
MaxQuestionLength = 40   # words
MaxMissingWords = 2*MaxQuestionLength

def normalize_question(q):
    q = q.lower()
    q = q.replace("?"," ? ")
    q = q.replace("..."," . ")
    q = q.replace(".."," . ")
    q = q.replace("."," . ")
    q = q.replace(":"," : ")
    q = q.replace(","," , ")
    q = q.replace("/"," / ")
    q = q.replace("("," ( ")
    q = q.replace(")"," ) ")
    q = q.replace("n't"," not ")    
    q = q.replace("i'm", "i am")
    q = q.replace("[math]", " [math] ")
    q = q.replace("[/math]", " [/math] ")
    q = q.replace("-"," - ")
    q = q.replace('"',' " ')
    q = q.replace('\xe2\x80\x9c', ' " ').replace('\xe2\x80\x9d', ' " ')    
    q = q.replace("'s", " 's ")
    words = q.lower().split()[:MaxQuestionLength]
    #words = map(lambda x: x.strip(), words)
    #words = filter(lambda x: len(x) > 0, words)
    #words = map(lambda w: w[:-1] + " s" if len(w)>3 and w[-1]=='s' and 
    #            not w in ("this", "does") else w, words)
    return " ".join(words)

def word_frequencies(questions):
    words = " ".join(questions).split(" ")
    counter = Counter(words)
    return sorted(counter.items(), key=lambda x:-x[1])




In [4]:
train_df=pd.read_csv("/Users/igorvm/Projects/Neural/questions/data/train.csv")
train_df.fillna("", inplace=True)
print "Train set loaded:", len(train_df)

test_df = pd.read_csv("/Users/igorvm/Projects/Neural/questions/data/test.csv")
test_df.fillna("", inplace=True)
print "Test set loaded:", len(test_df)


Train set loaded: 404290
Test set loaded: 2345796


In [46]:

test_questions = map(normalize_question, test_df["question1"].tolist() + test_df["question2"].tolist())
train_questions = map(normalize_question, train_df["question1"].tolist() + train_df["question2"].tolist())

print "questions normalized"



questions normalized


In [47]:
test_questions = set(test_questions)
train_questions = set(train_questions)
all_questions = test_questions | train_questions


In [48]:
for i, q in enumerate(test_questions):
    print i, q
    if i > 100: break

0 
1 what is an sell positive blood group and what are the characteristics ?
2 what is the weirdest me thing you've ever done to talk with a girl / guy you like ?
3 what are the side effects ( positive and negative ) , if any , of hit ?
4 what are good sides for macaroni and locations ?
5 what are grow best in the uk ?
6 how can you determine the combustion of methane ?
7 are there some real websites that offer real fund for part - time work like data - entry , ad - posting jobs ? with minimum registration fees of course . ?
8 what there any permanent treatment for tinnitus ?
9 what is the significance of free trade agreement among a group of nations ?
10 are there any is the best website to download english songs for an android phone ?
11 what from a university vpn , then can the mpaa / riaa still find a way to track you down ?
12 how do you improve your design skills as designer ?
13 why does telugu actor allu arjun have more fans in tamil nadu than in andhra pradesh ?
14 my boyfrien

In [49]:
all_frequencies = word_frequencies(all_questions)
#train_frequencies = word_frequencies(train_questions)


In [50]:
print "Total number of questions:", len(all_questions)
#print "Different words in train questions:", len(train_frequencies)
print "Different words in all questions:", len(all_frequencies)

Total number of questions: 4783352
Different words in all questions: 138927


In [51]:
VocabularySize = 10000

In [52]:
vocabulary_words = [w for w, n in all_frequencies[:VocabularySize]]
vocabulary_encoding = {w:i+1 for i, w in enumerate(vocabulary_words)}
vocabulary_decoding = [None] + vocabulary_words
vocabulary_set = set(vocabulary_words)

In [53]:
encoding_df = pd.DataFrame(range(1, 1+VocabularySize), index=vocabulary_words, columns=["encoding"])
encoding_df.head()

Unnamed: 0,encoding
?,1
the,2
what,3
is,4
i,5


In [54]:
decoding_df = pd.DataFrame(vocabulary_words, index = range(1, 1+VocabularySize), columns=["decoding"])
decoding_df.head()

Unnamed: 0,decoding
1,?
2,the
3,what
4,is
5,i


In [55]:
def encode_question(q):
    # question is already normalized
    words = q.split()
    wset = set(words)
    missing_words = wset - vocabulary_set
    missing_words_encoding = {w:VocabularySize+1+random.randint(1,MaxMissingWords-1) 
                              for i, w in enumerate(missing_words)}
    encoded = np.array([vocabulary_encoding[w] 
            if w in vocabulary_set
            else missing_words_encoding[w]           
            for w in words], 
        dtype=np.uint16)
    return encoded

def decode_question(words):
    return " ".join(map(lambda x: vocabulary_decoding[x] if x <= VocabularySize else "<%d>" % (x,), words))

def permute_encoded_question(q):
    if len(q) <= 0: return q
    q = q.copy()
    n = len(q)
    m = max(1, n/3)
    r = range(n)
    for i in random.sample(r, m):
        q[i] = random.randint(1, VocabularySize)
    for _ in xrange(n/10):
        i,j = random.sample(r, 2)
        q[i], q[j] = q[j], q[i]
    return q


In [56]:
for i, q in enumerate(all_questions):
    encoded = encode_question(q)
    decoded = decode_question(encoded)
    print q
    #print encoded
    print decoded
    print
    if i > 30: break




what is an sell positive blood group and what are the characteristics ?
what is an sell positive blood group and what are the characteristics ?

what are good sides for macaroni and locations ?
what are good sides for <10048> and locations ?

my boyfriend started ignoring me for no reason . should i do ?
my boyfriend started ignoring me for no reason . should i do ?

in the summer how would i dress like a workout guy from the year 1990 ?
in the summer how would i dress like a workout guy from the year 1990 ?

why did not the jews build a jewish nation - state in overthrow empty area ?
why did not the jews build a jewish nation - state in <10079> empty area ?

what " electrosmog " kind of coding questions are asked in samsung 's online competency test ?
what " <10002> " kind of coding questions are asked in samsung 's online <10024> test ?

how many nukes does north korea have ?
how many nukes does north korea have ?

will it be a good idea to do mba app if you are earning 8 lpa and 

In [57]:
good_sentences = map(encode_question, all_questions)
for s in good_sentences[:100]:
    print decode_question(s)


what is an sell positive blood group and what are the characteristics ?
what are good sides for <10068> and locations ?
my boyfriend started ignoring me for no reason . should i do ?
in the summer how would i dress like a workout guy from the year 1990 ?
why did not the jews build a jewish nation - state in <10027> empty area ?
what " <10054> " kind of coding questions are asked in samsung 's online <10032> test ?
how many nukes does north korea have ?
will it be a good idea to do mba app if you are earning 8 lpa and having <10032> experience ?
how do i form real estate investing fund for moral investing ?
how do i teach english abroad if i am not a native and have no degree ?
why ca not the religion and the caste columns in all pakistanis sort of application forms be removed ? wo not it bring social equality ?
what is sick android studio , eclipse , or <10016> for android development ?
why do not <10064> and <10053> pair up for mixed <10054> ?
how do get achieve world peace ?
how can

In [58]:
bad_sentences = map(permute_encoded_question, good_sentences)
for s in bad_sentences[:100]:
    print decode_question(s)


what is an sell targaryen blood what org sent are the characteristics library
what kochi torah sides for <10068> genetically locations ?
my boyfriend started ignoring . thc no final me should i spotify $100
in ? nominee how would i dress like a interacting connected gf the year 1990 selenium
why did not the jews in turks jewish provide collage arts build <10027> empty doc ?
online " stadium " kind of instantly questions intj asked in rukh 's what balanced test ?
how inspiration nukes does north millions have ?
nitrate d'or be a good idea cambodia do mba app hypothetically tumblr are timor 8 lpa experience having <10032> hbo ?
caught important i form real rahul investing status for moral investing ?
how do tvf teach english beside if sikh am not a falls fascinating have native degree ?
why school we've royale equality modern the caste columns in disgusting pakistanis sort of application ca torah removed ? wo not it bring social blades changes
000 is looks specific studio , eclipse , or

In [59]:
sentences = [(s, 1) for s in good_sentences] + [(s, 0) for s in bad_sentences]
random.shuffle(sentences)

sentences_series = pd.Series([s for s, c in sentences])
class_series = pd.Series([c for s, c in sentences], dtype=np.uint8)
sentences_df = pd.DataFrame()
sentences_df["sentence"] = sentences_series
sentences_df["classification"] = class_series

print "df head"
print sentences_df.head()

print "sentence"
print sentences_df["sentence"][:10]

print "classification"
print sentences_df["classification"][:10]





df head
                                            sentence  classification
0                              [3, 9771, 4, 4982, 1]               0
1  [3, 12, 39, 10, 2, 4881, 19, 35, 27, 19, 266, ...               1
2  [3499, 62, 4575, 5, 178, 21, 1608, 9720, 337, ...               0
3  [4, 10069, 184, 2, 2112, 1542, 499, 42, 72, 2,...               1
4  [3, 23, 1851, 1727, 696, 5578, 6340, 344, 3460...               0
sentence
0                                [3, 9771, 4, 4982, 1]
1    [3, 12, 39, 10, 2, 4881, 19, 35, 27, 19, 266, ...
2    [3499, 62, 4575, 5, 178, 21, 1608, 9720, 337, ...
3    [4, 10069, 184, 2, 2112, 1542, 499, 42, 72, 2,...
4    [3, 23, 1851, 1727, 696, 5578, 6340, 344, 3460...
5             [67, 1846, 1939, 3937, 9456, 5184, 9195]
6          [4, 2, 864, 25, 3654, 49, 25, 2364, 437, 1]
7    [7, 8237, 5, 469, 6, 821, 30, 325, 4443, 40, 3...
8                   [7, 15, 5, 104, 21, 6326, 7302, 1]
9                                   [3, 4, 470, 47, 1]
Name: sentence, dty

In [60]:
store = pd.HDFStore("/Users/igorvm/Projects/Neural/questions/data/sentences_train.hd5")
store["decoding"] = decoding_df
store["sentences"] = sentences_df
store.close()



[128   2  99 ..., 116  98  46] 133859
[128   2  99 ..., 116  98  46] 533280335


In [61]:
vocabulary_file = open("/Users/igorvm/Projects/Neural/questions/data/vocabulary.tsv", "w")
vocabulary_file.write("word\tcode\n<none>\t0\n")
for i, w in decoding_df.itertuples():
    vocabulary_file.write("%s\t%d\n" % (w, i))
for i in range(MaxMissingWords):
    vocabulary_file.write("<%d>\t%d\n" % (VocabularySize+i+1,VocabularySize+i+1))
vocabulary_file.close()