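'''
get_wikipedia_data.py

Build plain-text Wikipedia corpora for the question data:
  * word-count based: download the pages matching every word that appears in
    the train / validation questions and answer choices;
  * CK-12 keyword based: download the page of every keyword crawled from the
    CK-12 site, either into one big file or one file per keyword (optionally
    following each page's links), plus a .tsv of (keyword, title, url) metadata.

Relies on the `wikipedia` package and this project's `util` helpers.
'''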
import wikipedia as wiki
import util
def get_word_count_train_validation():
    '''
    Aggregate word counts over the train / validation questions and choices.
    '''
    d_word_count_t_q = util.get_d_word_count_train_question()
    d_word_count_t_c = util.get_d_word_count_train_choice()
    d_word_count_v_q = util.get_d_word_count_validation_question()
    d_word_count_v_c = util.get_d_word_count_validation_choice()
    d_word_count = {}
    for d in (d_word_count_t_q, d_word_count_t_c, d_word_count_v_q, d_word_count_v_c):
        for word in d:
            d_word_count.setdefault(word, 0)
            d_word_count[word] += d[word]
    return d_word_count
    '''
    sort = sorted(d_word_count.iteritems(), key = lambda dd : dd[1])
    for s in sort:
        print "%s\t%d" % (s[0], s[1])
    '''
def get_wikipedia_content_based_on_word_count_train_validation(d_word_count):
    '''
    For every word in d_word_count, fetch the content of the matching
    Wikipedia pages and append the normalized lines to a single text file.
    '''
    file = open('data/wikipedia_content_v1.txt', 'w')
    n_word = len(d_word_count)
    n_current = 0
    for word in d_word_count:
        n_current += 1
        print word, n_current, n_word, n_current * 1.0 / n_word
        if not word:
            continue
        lst_title = wiki.search(word)
        for title in lst_title:
            title = title.encode('ascii', 'ignore')
            print 'title', title
            content = None  # reset so a failed fetch is not reused from the previous title
            try:
                content = wiki.page(title).content.encode('ascii', 'ignore')
            except wiki.exceptions.DisambiguationError as e:
                print 'DisambiguationError', word
                '''
                for title_disambiguation in e.options:
                    title_disambiguation = title_disambiguation.encode('ascii', 'ignore')
                    print 'title_disambiguation', title_disambiguation
                    try:
                        content = wiki.page(title_disambiguation).content.encode('ascii', 'ignore')
                    except:
                        pass
                '''
            except:
                pass
            if not content:
                continue
            for line in content.split('\n'):
                line = ' '.join(map(util.norm_word, line.split()))
                if line:
                    file.write(line + '\n')
    file.close()
def get_wikipedia_content_based_on_ck_12_keyword():
    '''
    Fetch the Wikipedia page of every CK-12 keyword and append the
    normalized content to a single text file.
    '''
    path_keyword = 'data/ck12_list_keyword.txt'
    lst_keyword = open(path_keyword).readlines()
    n_total = len(lst_keyword)
    file = open('data/wikipedia_content_based_on_ck_12_keyword_v1.txt', 'w')
    for index, line in enumerate(lst_keyword):
        keyword = line.strip('\n').lower()
        print index, n_total, index * 1.0 / n_total, keyword
        content = None  # reset so a failed fetch is not reused from the previous keyword
        try:
            content = wiki.page(keyword).content.encode('ascii', 'ignore')
        except wiki.exceptions.DisambiguationError as e:
            print 'DisambiguationError', keyword
        except:
            print 'Error', keyword
        if not content:
            continue
        for line in content.split('\n'):
            line = ' '.join(map(util.norm_word, line.split()))
            if line:
                file.write(line + '\n')
    file.close()
def get_wikipedia_content_based_on_ck_12_keyword_one_file_per_keyword():
    '''
    Get Wikipedia page content based on the keywords crawled from the CK-12 website
    (or on the train / validation questions), writing one file per keyword and,
    optionally, also fetching every page linked from the keyword's page.
    '''
    #path_keyword = 'data/ck12_list_keyword.txt'
    #dir_output = 'data/wikipedia_content_based_on_ck_12_keyword_one_file_per_keyword/'
    #path_keyword = 'data/training_set_question.tsv'
    #dir_output = 'data/wikipedia_content_based_on_train_question_one_file_per_keyword/'
    #path_keyword = 'data/validation_set_question.tsv'
    #dir_output = 'data/wikipedia_content_based_on_validation_question_one_file_per_keyword/'
    #path_keyword = 'data/ck12_list_keyword.txt'
    #dir_output = 'data/wikipedia_content_based_on_ck_12_keyword_one_file_per_keyword_plus_external_links/'
    path_keyword = 'data/training_set_question.tsv'
    dir_output = 'data/wikipedia_content_based_on_train_question_one_file_per_keyword_plus_external_links/'
    #path_keyword = 'data/validation_set_question.tsv'
    #dir_output = 'data/wikipedia_content_based_on_validation_question_one_file_per_keyword_plus_external_links/'
    path_meta = path_keyword[:-4] + '_plus_external_links_wiki_meta.tsv'
    file_meta = open(path_meta, 'w')
    lst_keyword = open(path_keyword).readlines()
    n_total = len(lst_keyword)
    for index, line in enumerate(lst_keyword):
        print index, n_total, index * 1.0 / n_total, line.strip()
        get_content_and_meta_basedon_keyword(line, dir_output, file_meta)
        try:
            # strip the trailing newline before querying, as the helper above does
            page = wiki.page(line.strip('\n').lower())
            lst_link = page.links
            for keyword in lst_link:
                keyword = keyword.encode('ascii', 'ignore')
                get_content_and_meta_basedon_keyword(keyword, dir_output, file_meta)
        except:
            pass
    file_meta.close()
def get_content_and_meta_basedon_keyword(line, dir_output, file_meta):
    '''
    Fetch the Wikipedia page of a single keyword, record (keyword, title, url)
    in file_meta and dump the normalized content to one file under dir_output.
    '''
    keyword = line.strip('\n').lower()
    content = None
    title = None
    url = None
    try:
        page = wiki.page(keyword)
        content = page.content.encode('ascii', 'ignore')
        url = page.url.encode('ascii', 'ignore')
        title = page.title.encode('ascii', 'ignore')
    except wiki.exceptions.DisambiguationError as e:
        print 'DisambiguationError', keyword
    except:
        print 'Error', keyword
    if not content or not title:
        return
    file_meta.write("%s\t%s\t%s\n" % (keyword, title, url))
    path_output = dir_output + '/' + '_'.join(title.replace('/', '__').split()) + '.txt'
    file = open(path_output, 'w')
    for line in content.split('\n'):
        line = ' '.join(map(util.norm_word, line.split()))
        if line:
            file.write(line + '\n')
    file.close()
def get_wikipedia_meta_based_on_ck_12_keyword_one_file_per_keyword():
    '''
    Get Wikipedia title and url information for the page of every CK-12 keyword.
    '''
    path_keyword = 'data/ck12_list_keyword.txt'
    file = open(path_keyword[:-4] + '_meta.tsv', 'w')
    lst_keyword = open(path_keyword).readlines()
    n_total = len(lst_keyword)
    for index, line in enumerate(lst_keyword):
        keyword = line.strip('\n').lower()
        print index, n_total, index * 1.0 / n_total, keyword
        url = None
        title = None
        try:
            # fetch the page once instead of calling wiki.page for every attribute
            page = wiki.page(keyword)
            url = page.url.encode('ascii', 'ignore')
            title = page.title.encode('ascii', 'ignore')
            #content = page.content.encode('ascii', 'ignore')
        except wiki.exceptions.DisambiguationError as e:
            print 'DisambiguationError', keyword
        except:
            print 'Error', keyword
        if not url or not title:
            continue
        res = "%s\t%s\t%s\n" % (keyword, title, url)
        file.write(res)
    file.close()
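# Driver: by default this fetches one file per keyword; the commented-out
# calls below rebuild the other corpora described above.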
#get_wikipedia_meta_based_on_ck_12_keyword_one_file_per_keyword()
get_wikipedia_content_based_on_ck_12_keyword_one_file_per_keyword()
#get_wikipedia_content_based_on_ck_12_keyword()
'''
path_keyword = 'data/ck12_list_keyword.txt'
dir_output = 'data/wikipedia_content_based_on_ck_12_keyword_one_file_per_keyword/'
d = get_word_count_train_validation()
get_wikipedia_content_based_on_word_count_train_validation(d)
'''