-
Notifications
You must be signed in to change notification settings - Fork 6
/
util.py
113 lines (99 loc) · 3.26 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from nltk.stem.porter import *
import itertools
import os
import string
# Set of the ASCII punctuation characters listed in string.punctuation.
exclude = set(string.punctuation)
# Module-level Porter stemmer instance (PorterStemmer is provided by the
# nltk.stem.porter star import at the top of the file).
stemmer = PorterStemmer()
# Relative paths of the tab-separated training and validation files read by
# the helpers below.
path_train = 'data/training_set.tsv'
path_validation = 'data/validation_set.tsv'
def load_d_word_count(path='data/ck-12-word-count.txt'):
    """Load a word -> count mapping from a tab-separated file.

    Every non-empty line must hold a word in its first column and an integer
    count in its second column; further columns are ignored.  When a word
    occurs on several lines, the last line wins.

    Args:
        path: Location of the word-count file.

    Returns:
        dict mapping each word (str) to its count (int).

    Raises:
        IndexError: if a non-empty line has no second tab-separated column.
        ValueError: if the second column cannot be parsed as an integer.
    """
    d_word_count = {}
    # The context manager guarantees the handle is closed, even on error.
    with open(path) as f:
        for line in f:
            row = line.strip('\n')
            if not row:
                # Tolerate empty lines (e.g. a stray newline at end of file)
                # instead of failing with IndexError.
                continue
            fields = row.split('\t')
            d_word_count[fields[0]] = int(fields[1])
    return d_word_count
def combination_index(N, n_com):
    """Enumerate all index combinations of size 1..n_com drawn from range(N).

    Combinations are listed by increasing size and, within one size, in
    lexicographic order of their indices.

    Args:
        N: Number of available indices; the indices are 0 .. N-1.
        n_com: Largest combination size to generate (inclusive).

    Returns:
        A list of lists of ints, e.g. ``combination_index(3, 2)`` gives
        ``[[0], [1], [2], [0, 1], [0, 2], [1, 2]]``.
    """
    res = []
    for size in range(1, n_com + 1):
        # Combine over the integer indices themselves.  Building a digit
        # string and combining its characters breaks as soon as N > 10,
        # because "10" would contribute the two characters '1' and '0'.
        for com in itertools.combinations(range(N), size):
            res.append(list(com))
    return res
def norm_word(word):
    """Lower-case *word* and trim a fixed set of punctuation from its ends.

    The characters '?', '.', ',' and '!' are stripped one after another, in
    exactly that order, each from both ends of the string.  Every pass only
    removes its own character, so the order matters: ``'what?!'`` becomes
    ``'what?'`` because the '?' pass runs while the '!' is still in place.

    Variants that are not in use: trimming a longer list of characters
    (':', ';', quotes, surrounding whitespace), dropping every punctuation
    character found in the module-level ``exclude`` set, and Porter stemming
    of the trimmed word -- the latter did not work based on experiments
    10004 and 10008.
    """
    normalized = word.lower()
    for mark in ('?', '.', ',', '!'):
        normalized = normalized.strip(mark)
    return normalized
def get_sentence(dir):
    """Read every file in a directory as a list of normalized sentences.

    Each line of each file becomes one sentence: the line is split on single
    spaces and every token is passed through ``norm_word``.

    Args:
        dir: Directory whose entries are all opened as text files.  The name
            shadows the ``dir`` builtin but is kept so keyword callers keep
            working.  A trailing path separator is optional.

    Returns:
        A list with one entry per line read; each entry is the list of
        normalized tokens of that line.  Files are visited in the order
        returned by ``os.listdir``.
    """
    lst_sentence = []
    for name in os.listdir(dir):
        # os.path.join inserts the separator when *dir* lacks a trailing one;
        # plain string concatenation would produce a wrong path in that case.
        file_path = os.path.join(dir, name)
        # Close each file as soon as it has been consumed.
        with open(file_path) as f:
            for line in f:
                tokens = line.strip('\n').split(' ')
                lst_sentence.append([norm_word(token) for token in tokens])
    return lst_sentence
def get_d_word_count_train_question():
    """Count normalized words in the second column of the training file.

    Reads the file at ``path_train`` as tab-separated text, splits column
    index 1 of every line on single spaces and tallies each token after
    ``norm_word``.

    Returns:
        dict mapping each normalized word to its number of occurrences.

    Raises:
        IndexError: if a non-empty line contains no tab.
    """
    # NOTE(review): every line is counted, including the first one, whereas
    # get_questions() skips line 0 as a header -- confirm whether the header
    # row should be excluded here as well.
    d_word_count = {}
    # The context manager guarantees the handle is closed.
    with open(path_train) as f:
        for line in f:
            row = line.strip('\n')
            if not row:
                # Skip empty lines instead of failing with IndexError.
                continue
            columns = row.split('\t')
            for token in columns[1].split(' '):
                token = norm_word(token)
                d_word_count[token] = d_word_count.get(token, 0) + 1
    return d_word_count
def get_d_word_count_train_choice():
    """Count normalized words in columns 3 and onward of the training file.

    Reads the file at ``path_train`` as tab-separated text.  For every line,
    each column from index 3 on (presumably the answer choices -- confirm
    against the data file) is split on single spaces and every token is
    tallied after ``norm_word``.

    Returns:
        dict mapping each normalized word to its number of occurrences.
    """
    # NOTE(review): the first line is counted too; get_questions() treats
    # line 0 as a header -- confirm whether it should be skipped here.
    d_word_count = {}
    # The context manager guarantees the handle is closed.
    with open(path_train) as f:
        for line in f:
            columns = line.strip('\n').split('\t')
            for choice in columns[3:]:
                for token in choice.split(' '):
                    token = norm_word(token)
                    d_word_count[token] = d_word_count.get(token, 0) + 1
    return d_word_count
def get_d_word_count_validation_question():
    """Count normalized words in the second column of the validation file.

    Reads the file at ``path_validation`` as tab-separated text, splits
    column index 1 of every line on single spaces and tallies each token
    after ``norm_word``.

    Returns:
        dict mapping each normalized word to its number of occurrences.

    Raises:
        IndexError: if a non-empty line contains no tab.
    """
    # NOTE(review): every line is counted, including the first one, whereas
    # get_questions() skips line 0 as a header -- confirm whether the header
    # row should be excluded here as well.
    d_word_count = {}
    # The context manager guarantees the handle is closed.
    with open(path_validation) as f:
        for line in f:
            row = line.strip('\n')
            if not row:
                # Skip empty lines instead of failing with IndexError.
                continue
            columns = row.split('\t')
            for token in columns[1].split(' '):
                token = norm_word(token)
                d_word_count[token] = d_word_count.get(token, 0) + 1
    return d_word_count
def get_d_word_count_validation_choice():
    """Count normalized words in columns 2 and onward of the validation file.

    Reads the file at ``path_validation`` as tab-separated text.  For every
    line, each column from index 2 on (presumably the answer choices --
    confirm against the data file) is split on single spaces and every token
    is tallied after ``norm_word``.

    Returns:
        dict mapping each normalized word to its number of occurrences.
    """
    # NOTE(review): the first line is counted too; get_questions() treats
    # line 0 as a header -- confirm whether it should be skipped here.
    d_word_count = {}
    # The context manager guarantees the handle is closed.
    with open(path_validation) as f:
        for line in f:
            columns = line.strip('\n').split('\t')
            for choice in columns[2:]:
                for token in choice.split(' '):
                    token = norm_word(token)
                    d_word_count[token] = d_word_count.get(token, 0) + 1
    return d_word_count
def get_questions(path):
    """Return the second column of every data row of a tab-separated file.

    The first line of the file is treated as a header and skipped.

    Args:
        path: Location of the tab-separated file.

    Returns:
        A list with the text of column index 1 of each remaining non-empty
        line, in file order.

    Raises:
        IndexError: if a non-empty data line contains no tab.
    """
    lst_res = []
    # The context manager guarantees the handle is closed.
    with open(path) as f:
        for index, line in enumerate(f):
            if index == 0:
                # Header row.
                continue
            row = line.strip('\n')
            if not row:
                # Skip empty lines instead of failing with IndexError.
                continue
            lst_res.append(row.split('\t')[1])
    return lst_res
def get_questions_train():
    """Return the questions read from the file at ``path_train``."""
    questions = get_questions(path_train)
    return questions
def get_questions_validation():
    """Return the questions read from the file at ``path_validation``."""
    questions = get_questions(path_validation)
    return questions