-
Notifications
You must be signed in to change notification settings - Fork 4
/
positional.py
109 lines (92 loc) · 3.32 KB
/
positional.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
tx_tag = re.compile("tx[0-9]+")
import matplotlib.pyplot as plt
from matplotlib.pyplot import plot, hist
import seaborn as sns
# "muted"
sns.set(palette="muted", style="nogrid")
import numpy as np
from indexnumbers import swap_num
import bilearn
from taggedpipeline import TaggedTextPipeline
from journalreaders import LabeledAbstractReader
from tokenizer import MergedTaggedAbstractReader
import progressbar
def _sentence_contains(tagged_sentence, tag="n"):
return any([tag in t[1] for t in tagged_sentence])
def count_treatments(citation):
tx_ns = [0]
for sentence_num, sentence in enumerate(citation):
for token, tag_set in sentence:
for tag in tag_set:
matches = tx_tag.findall(tag)
for match in matches:
# remove _a (e.g., tx1_a)
tx_ns.append(int(match[2:]))
return max(tx_ns)
def hist_of_tx_counts(plot_them=True):
reader = MergedTaggedAbstractReader()
n_txs = []
for citation in reader:
n_txs.append(count_treatments(citation))
if plot_them:
plt.clf()
max_num = max(n_txs)+1
#counts = [0]*max_num
#x = range(max_num)
nbins = max_num
counts, bins, patches = hist(
n_txs, bins=nbins, alpha=.8, normed=True,
align='mid', label=["1"])
offset = abs(bins[1]-bins[0])/2.0
#import pdb; pdb.set_trace()
plt.xticks([b+offset for b in bins], [0, 1, 2, 3, 4])
plt.xlabel("number of treament groups")
#plt.xticks(bins)
return n_txs
def pos_deltas(tag1="tx1", tag2="tx2", plot_them=True):
reader = MergedTaggedAbstractReader()
deltas = []
for citation_index in xrange(len(reader)):
pos1, pos2 = None, None
citation = reader.get(citation_index)
# number of citations
citation_length = len(citation)
for sentence_num, sentence in enumerate(citation):
#print float(sentence_num+1)/citation_length
if _sentence_contains(sentence, tag1):
pos1 = float(sentence_num+1)/citation_length
if _sentence_contains(sentence, tag2):
pos2 = float(sentence_num+1)/citation_length
if pos1 is not None and pos2 is not None:
deltas.append(abs(pos1-pos2))
if plot_them:
plt.clf()
hist(deltas, alpha=.8)
plt.xlim(0,1)
plt.xlabel("(normalized) sentence distance between '%s' and '%s' tokens" %
(tag1, tag2))
return deltas
def tag_positions(tag="n"):
reader = MergedTaggedAbstractReader()
positions = []
for citation_index in xrange(len(reader)):
citation = reader.get(citation_index)
# number of citations
citation_length = len(citation)
for sentence_num, sentence in enumerate(citation):
if _sentence_contains(sentence, tag):
pos = float(sentence_num)/citation_length
positions.append(pos)
return positions
def txs_histos():
plt.clf()
tx1_positions = tag_positions(tag="tx1")
tx2_positions = tag_positions(tag="tx2")
hist(tx1_positions, alpha=1, label="tx1")
hist(tx2_positions, alpha=.5, label="tx2")
plt.xlim(0,1)
plt.legend()
def positional_histo(tag="n"):
positions = tag_positions(tag)
hist(positions)