/
utilities_uncertainty.py
94 lines (85 loc) · 2.4 KB
/
utilities_uncertainty.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python
#####
# Description: Functions for reading config and data files
# Author: Heike Adel
# Date: 2016
#####
import gzip
def padAndReduceSentenceToContextsize(sentence, contextsize):
    """Tokenize *sentence*, frame it with padding and fit it to *contextsize*.

    The sentence is split on whitespace and four "PADDING" tokens are placed
    on each side. While the token list is longer than ``contextsize``,
    single tokens are removed alternately at the second-to-last position
    and at index 1, starting with the right-hand side. A list that is
    shorter than ``contextsize`` is filled up at the end with "<empty>".

    Returns the resulting list of tokens.
    """
    border = ["PADDING"] * 4
    tokens = border + sentence.split() + border

    trimRight = True  # trimming always starts on the right-hand side
    while len(tokens) > contextsize:
        if trimRight:
            victim = len(tokens) - 2  # leave the very last token in place
        else:
            victim = 1  # leave the very first token in place
        tokens.pop(victim)
        trimRight = not trimRight

    missing = contextsize - len(tokens)
    if missing > 0:
        tokens.extend(["<empty>"] * missing)
    return tokens
def reduceSentenceToContextsize(sentence, contextsize):
    """Tokenize *sentence* and fit it to *contextsize* without padding.

    The sentence is split on whitespace. While the token list is longer
    than ``contextsize``, single tokens are dropped alternately from the
    end and from the front, starting with the end. A list that is shorter
    than ``contextsize`` is filled up at the end with "<empty>".

    Returns the resulting list of tokens.
    """
    tokens = sentence.split()

    dropLast = True  # the first token to go is always the last one
    while len(tokens) > contextsize:
        if dropLast:
            tokens.pop(len(tokens) - 1)
        else:
            tokens.pop(0)
        dropLast = not dropLast

    missing = contextsize - len(tokens)
    if missing > 0:
        tokens.extend(["<empty>"] * missing)
    return tokens
def openTokenizedFile(filename, contextsize):
    """Read a labelled corpus file and pad/trim every sentence.

    Every line must have the form ``<label> :: <sentence>`` where
    ``<label>`` is an integer. The sentence part is passed through
    padAndReduceSentenceToContextsize together with ``contextsize``.

    Returns a tuple ``(texts, labels)``: the processed token lists and the
    integer labels, both in file order.

    Raises ValueError if a line lacks the ' :: ' separator or its label is
    not an integer.
    """
    labels = []
    texts = []
    # the with-block closes the file even when a malformed line raises
    with open(filename, 'r') as f:
        for line in f:
            # split once only: the sentence itself may contain ' :: '
            label, sentence = line.strip().split(' :: ', 1)
            labels.append(int(label))
            texts.append(padAndReduceSentenceToContextsize(sentence, contextsize))
    return texts, labels
def openTokenizedFileWithoutPadding(filename, contextsize):
    """Read a labelled corpus file and trim/fill every sentence, unpadded.

    Every line must have the form ``<label> :: <sentence>`` where
    ``<label>`` is an integer. The sentence part is passed through
    reduceSentenceToContextsize together with ``contextsize``.

    Returns a tuple ``(texts, labels)``: the processed token lists and the
    integer labels, both in file order.

    Raises ValueError if a line lacks the ' :: ' separator or its label is
    not an integer.
    """
    labels = []
    texts = []
    # the with-block closes the file even when a malformed line raises
    with open(filename, 'r') as f:
        for line in f:
            # split once only: the sentence itself may contain ' :: '
            label, sentence = line.strip().split(' :: ', 1)
            labels.append(int(label))
            texts.append(reduceSentenceToContextsize(sentence, contextsize))
    return texts, labels
def readConfig(configfile):
    """Read a ``name=value`` config file into a dictionary.

    Blank lines and lines whose first non-blank character is '#' are
    skipped. Every other line is split at its first '='; the text before
    it becomes the key and everything after it the value (both kept as
    strings, without further stripping around the '=').

    Returns the dict mapping names to values.

    Raises IndexError if a non-blank, non-comment line contains no '='.
    """
    config = {}
    # the with-block closes the file even when a malformed line raises
    with open(configfile, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank lines and comments
            # split once only so that values may themselves contain '='
            parts = line.split('=', 1)
            name = parts[0]
            value = parts[1]
            config[name] = value
    return config
def readWordvectors(wordvectorfile):
    """Read word vectors from a plain or gzip-compressed text file.

    The first line of the file is always skipped (presumably a header
    line -- verify against the files actually used). Every following
    non-blank line is split on whitespace: the first field is the word,
    the remaining fields are its vector components, which are kept as
    strings.

    A file name ending in ".gz" is read through gzip in text mode, so
    words and components are str in both branches.

    Returns the list ``[wordvectors, vectorsize]`` where ``wordvectors``
    maps each word to its list of components and ``vectorsize`` is the
    number of components on the last vector line read (0 if there was
    none).
    """
    wordvectors = {}
    vectorsize = 0
    # look at the suffix only: ".gz" elsewhere in the path must not
    # trigger gzip decoding
    if wordvectorfile.endswith(".gz"):
        f = gzip.open(wordvectorfile, 'rt')
    else:
        f = open(wordvectorfile, 'r')
    # the with-block closes the file even when reading fails
    with f:
        next(f, None)  # skip the first line
        for line in f:
            parts = line.split()
            if not parts:
                continue  # tolerate blank lines
            word = parts[0]
            vector = parts[1:]
            wordvectors[word] = vector
            vectorsize = len(vector)
    return [wordvectors, vectorsize]