In [1]:
import sys
import os
import codecs

import json
from collections import Counter

from nltk.tokenize import WordPunctTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import torch

import config

In [2]:
def en_wordtokenizer(sentence):
    words = WordPunctTokenizer().tokenize(sentence.lower())
    return words
stop_words = set(stopwords.words('english')) 

In [3]:
if not os.path.exists('data'):
    os.mkdir('data')

### Elastic Search
    Use Elastic Search to search from openbook.txt
    Please make sure corpus has been put into Elastic Search using the index name "corpus". (es_create.py)

In [4]:
from elasticsearch import Elasticsearch
es = Elasticsearch()

In [5]:
def es_sentences(sentence, sentence_size = 10, index_name = 'corpus', isParagraph = True):
    search_body = {
        "query": {"match":{"text": 
        {
            "query": sentence
        }}},
        "size":  sentence_size
    }
    res_objs =es.search(index=index_name, body=search_body)['hits']['hits']
    res = []
    for obj in res_objs:
        res.append(obj['_source']['text'][1:-1] + ' . ')
    return res

### Word Frequency
    Calculate and build the word frequency dictionary from openbook.txt

In [6]:
frequency_vocab = Counter()
with codecs.open(config.corpus_path, 'r', 'utf-8') as fpr:
    for line in fpr:
        wordlist = en_wordtokenizer(line.strip())
        for word in wordlist:
            if word.lower() in frequency_vocab:
                frequency_vocab[word.lower()] += 1
            else:
                frequency_vocab[word.lower()] = 1

In [7]:
frequency_vocab_dict = {word[0]: ids for ids, word in enumerate(frequency_vocab.most_common())}

### Load Json

In [8]:
question_set = []
vocab = Counter()
filenames = ['train', 'test', 'dev']

In [9]:
def AddWordToVocab(wordlist):
    for word in wordlist:
        if word.lower() in vocab:
            vocab[word.lower()] += 1
        else:
            vocab[word.lower()] = 1

def DeleteFrequentWord(sentence):
    return [word for word in sentence if not word in stop_words]

In [10]:
def ReadQuestion_json(data_path, isParagraph = True):
    question_set = []
    with codecs.open(data_path, 'r', 'utf-8') as data_f:
        for line in data_f:
            sample = json.loads(line.strip())
            question = sample['question']['stem']
            choices = sample['question']['choices']
            question_text = en_wordtokenizer(question)
            AddWordToVocab(question_text)
            
            q_c_cat = []
            for choice in choices:
                choice_text = en_wordtokenizer(choice['text'])
                AddWordToVocab(choice_text)
                
                # ES
                q_c = ' '.join(DeleteFrequentWord(choice_text + question_text))
                q_c_es = es_sentences(q_c, 10, 'corpus', False)
                for sen in q_c_es:
                    AddWordToVocab(en_wordtokenizer(sen))
                
                q_c_cat.append({"text": choice['text'], "es": q_c_es})
            answer = sample['answerKey']
            question_set.append({'question': question, 'choices': q_c_cat, 'answer': answer})
            
    return question_set

In [11]:
train_set = ReadQuestion_json(config.train_json_path, False)
test_set = ReadQuestion_json(config.test_json_path, False)
dev_set = ReadQuestion_json(config.dev_json_path, False)

### Write Vocab File

In [12]:
from concept_config import rel_projection
for key, value in rel_projection.items():
    AddWordToVocab(value)
    
vocab_list = ['<PAD>'] + sorted(vocab)
vocab_size = len(vocab_list)
vocab_dict = {word: i for i, word in enumerate(vocab_list) }

vocab_name = 'data/vocab.txt'
with codecs.open(vocab_name, 'w', 'utf-8') as fpw:
    fpw.write('\n'.join(vocab_list))

### Search ConceptNet 
    All the triplet of ConceptNet have been put in mysql (subject and object of the triplet should be english). Below is a example of the table for conceptnet.
    
```
mysql> select * from en_graph where start = 'machine';
+--------------+---------+-------------+
| rel                       |      start     |        end            |
+--------------+---------+-------------+
| Antonym      | machine | human                           |
| Antonym      | machine | organic                          |
| Antonym      | machine | organic_things                |
```

In [13]:
mysql_user = 'root'
mysql_password = 'maxinyin'
mysql_database = 'conceptnet'

In [14]:
from nltk.stem import PorterStemmer
from concept_config import rel_projection
ps = PorterStemmer()

In [15]:
vocab_most = {word[0]:ids for ids, word in enumerate(vocab.most_common())}
def testValid(sentence):
    for word in sentence:
        if word not in vocab_list:
            return None
    return sentence

def chooseSpecialWord(sentence, max_select = 3):
    max_value = sorted(enumerate([vocab_most[word] for word in sentence]), key=lambda x:x[1], reverse=True)[0:min(max_select, len(sentence))]
    return [sentence[ids] for ids, value in max_value]

In [28]:
import pymysql
db = pymysql.connect("localhost", mysql_user, mysql_password, mysql_database)
cursor = db.cursor()
select_statement = " \
    select distinct a.start, a.rel, a.end, b.rel, '{end_word}' \
    from   \
    ( \
        select start, end, rel from en_graph where start = '{start_word}' and end <> '{end_word}' \
    ) a  \
    left join en_graph b \
    on a.end = b.start  \
    where b.end like '%\_{end_word}\_%' or b.end = '{end_word}' or b.end like '{end_word}\_%' or b.end like '%\_{end_word}' \
          and a.start != a.end and b.start != b.end \
    union all \
    select start, rel, end, '', '' \
    from en_graph \
    where start = '{start_word}' and \
          (end like '%\_{end_word}\_%' or end = '{end_word}' or end like '{end_word}\_%' or end like '%\_{end_word}') ;\
"

In [29]:
def deal_five_tuple(r, start_word, end_word):
    start, rel, interstart, interrel, end = r
    interstart = testValid(interstart.split('_'))
    rel, interrel = rel_projection.get(rel, None), rel_projection.get(interrel, None)
    if interstart == None:
        return None
    return {'start': start_word,
              'rel': rel,
              'interstart': interstart,
              'interrel': interrel,
              'end': end_word}

In [30]:
import time

def get_concept(dataset):
    start_time = time.time()
    ids, total_sample = 0, len(dataset)
    for sample in dataset:
        ids += 1
        if ids% 100 == 0:
            print("Processing {}/{}".format(ids, total_sample))
        sample_question = chooseSpecialWord([word for word in en_wordtokenizer(sample['question']) if not word in stop_words])

        for choice in sample['choices']:
            sample_choice = chooseSpecialWord([word for word in en_wordtokenizer(choice['text']) if not word in stop_words])
            q_c_set = [(q_word, c_word) for c_word in sample_choice for q_word in sample_question]
            rel_count = 0
            conceptnet_rel = []

            for q_c in q_c_set:
                res_1, res_2 = [], []
                try:
                    cursor.execute(select_statement.format(start_word= ps.stem(q_c[0]), end_word=ps.stem(q_c[1])))
                    res_1 = cursor.fetchall()
                    if len(res_1) == 0:
                        cursor.execute(select_statement.format(start_word= ps.stem(q_c[1]), end_word=ps.stem(q_c[0])))
                        res_2 = cursor.fetchall()
                except:
                    print('[ERROR]: ', q_c)

                if len(res_1) > 0:
                    conceptnet_rel.append(deal_five_tuple(res_1[0], q_c[0], q_c[1]))
                elif len(res_2) > 0:
                    conceptnet_rel.append(deal_five_tuple(res_2[0], q_c[1], q_c[0]))
            choice['concept'] = conceptnet_rel
    print("use_time: ",time.time() - start_time)

In [31]:
get_concept(train_set)
get_concept(test_set)
get_concept(dev_set)

[ERROR]:  ("'", 'entire')
[ERROR]:  ("'", 'smooth')
[ERROR]:  ("'", 'surface')
[ERROR]:  ("'", 'cavities')
[ERROR]:  ("'", 'explosions')
[ERROR]:  ("'", 'large')
[ERROR]:  ("'", 'core')
[ERROR]:  ("'", 'cheese')
[ERROR]:  ("'", 'internal')
[ERROR]:  ("'", 'filled')
[ERROR]:  ("'", 'lakes')
[ERROR]:  ('flow', "'")
[ERROR]:  ('?', "'")
[ERROR]:  ('conductor', "'")
[ERROR]:  ('flow', "'")
[ERROR]:  ('?', "'")
[ERROR]:  ('conductor', "'")
[ERROR]:  ('flow', "'")
[ERROR]:  ('?', "'")
[ERROR]:  ('conductor', "'")
[ERROR]:  ("'", 'tranquilitiy')
[ERROR]:  ("'", 'sea')
[ERROR]:  ("'", 'caspian')
[ERROR]:  ("'", 'sea')
[ERROR]:  ("'", 'sea')
[ERROR]:  ("'", 'dead')
[ERROR]:  ("'", 'caribbean')
[ERROR]:  ("'", 'sea')
Processing 100/4957
Processing 200/4957
[ERROR]:  ('chopped', "'")
[ERROR]:  ('apple', "'")
[ERROR]:  ('place', "'")
[ERROR]:  ('caused', "'")
[ERROR]:  ('seasons', "'")
Processing 300/4957
[ERROR]:  ('duck', "'")
[ERROR]:  ('would', "'")
[ERROR]:  ('melting', "'")
[ERROR]:  ('duck'

Processing 3500/4957
Processing 3600/4957
[ERROR]:  ('remote', "'")
[ERROR]:  ('farm', "'")
[ERROR]:  ('fridge', "'")
[ERROR]:  ('remote', "'")
[ERROR]:  ('farm', "'")
[ERROR]:  ('fridge', "'")
[ERROR]:  ('remote', "'")
[ERROR]:  ('farm', "'")
[ERROR]:  ('fridge', "'")
[ERROR]:  ('wasted', "'")
[ERROR]:  ('stare', "'")
[ERROR]:  ('rotations', "'")
[ERROR]:  ('18th', "'")
[ERROR]:  ('century', "'")
[ERROR]:  ('crossing', "'")
[ERROR]:  ('permanent', "'")
[ERROR]:  ('fixed', "'")
[ERROR]:  ('due', "'")
[ERROR]:  ('way', "'")
[ERROR]:  ('always', "'")
[ERROR]:  ('make', "'")
Processing 3700/4957
[ERROR]:  ('celestial', "'")
[ERROR]:  ('mass', "'")
[ERROR]:  ('body', "'")
[ERROR]:  ('guavas', "'")
[ERROR]:  ('leaves', "'")
[ERROR]:  ('many', "'")
Processing 3800/4957
[ERROR]:  ('harbor', "'")
[ERROR]:  ('pearl', "'")
[ERROR]:  ('japan', "'")
[ERROR]:  ('stranded', "'")
[ERROR]:  ('potentially', "'")
[ERROR]:  ('sahara', "'")
[ERROR]:  ('gull', "'")
[ERROR]:  ('build', "'")
[ERROR]:  ('nest

### Generate Index

In [62]:
import csv
def write_index(dataset, filename): 
    print('preprocessing ' + filename + '...')
    fpw = open(os.path.join('data/', filename+'_info.tsv'), 'w')
    writer=csv.writer(fpw, delimiter='\t')
    for sample in dataset:
        row = []
        
        #question =  [str(vocab_dict[word.lower()]) for word in en_wordtokenizer(sample['question'])]
        question =  [word.lower() for word in en_wordtokenizer(sample['question'])]
        row.append(' '.join(question))
        choices = sample['choices']
        
        for choice in choices:
            #text = question + [str(vocab_dict[word.lower()]) for word in en_wordtokenizer(choice['text'])]
            text = [word.lower() for word in en_wordtokenizer(choice['text'])]
            #concept_text = [str(word_index[word.lower()]) for word in en_wordtokenizer(choice['concept'])]
            row.append(' '.join(text))
            
            # concept
            concepts_count = 0
            for concept in choice.get('concept', []):
                if concept == None:
                    continue
                if concept['interrel'] == None:
                    concept_sentence = concept['start'] + ' , ' + ' '.join(concept['rel']) + ' , ' + ' '.join(concept['interstart']) + ' .'
                else:
                    concept_sentence = concept['start'] + ' , ' + ' '.join(concept['rel']) + ' , ' + ' '.join(concept['interstart']) + \
                                   ' , ' + ' '.join(concept['interrel']) + ' , ' + concept['end'] + ' .'
                try:
                    #concept_single = [str(vocab_dict[word.lower()]) for word in en_wordtokenizer(concept_sentence)]
                    concept_single = [word.lower() for word in en_wordtokenizer(concept_sentence)]
                except:
                    print(choice)
                    fpw.close()
                    return 
                row.append(' '.join(concept_single))
                concepts_count += 1
            for _ in range(9-concepts_count):
                row.append('.')
            
            # es
            ess = choice.get('es', [])
            for es in ess:
                row.append(es)
            for _ in range(10-len(ess)):
                row.append('.')
                
        label = str(sample['answer'])
        row.append(label)
        writer.writerow(row)

    fpw.close()

In [63]:
write_index(dev_set, 'dev')
write_index(train_set, 'train')
write_index(test_set, 'test')

preprocessing dev...
preprocessing train...
preprocessing test...


### Load Glove

In [None]:
'''
def find_in_ordered_list_binary(word):
    start_id = 0
    end_id = vocab_size - 1
    while start_id <= end_id:
        mid_id = (start_id + end_id) // 2
        if vocab_list[mid_id] == word:
            return True
        elif word > vocab_list[mid_id]:
            start_id = mid_id + 1
        else:
            end_id = mid_id - 1
    return False
    
def GenerateSmallGlove(glove_path):
    glove_embed = {}
    glove_vocab = []
    
    with codecs.open(glove_path, 'r', 'utf-8') as glove_f:
        for line in glove_f:
            row = line.strip().split(' ')
            if find_in_ordered_list_binary(row[0]):
                glove_embed[row[0]] = row[1:]
                glove_vocab.append(row[0])
    print("Glove find {} words".format(len(glove_vocab)))
    return glove_embed, glove_vocab
'''

In [None]:
'''
glove_word_embed, glove_word_vocab = GenerateSmallGlove('/data/mxy/nlp/glove.840B.300d.txt')
remaining_word = set(vocab_list) - set(glove_word_vocab)
print("{} words have no embedding".format(len(remaining_word)))
glove_remain_word = glove_word_vocab + list(remaining_word)
'''

In [None]:
'''
glove_filename = 'glove.300d.small'
glove_file = open(os.path.join('data/', glove_filename + '_concept.txt'), 'w')
zero_embd = ['0'] * 300
glove_file.write('<pad> ' + ' '.join(zero_embd) + '\n') 
for word in vocab_list[1:]:
    glove_file.write(word + ' ') 
    str_embd = [str(index) for index in glove_word_embed.get(word, zero_embd)]
    glove_file.write(' '.join(str_embd) + '\n')

glove_file.close()
'''