-
Notifications
You must be signed in to change notification settings - Fork 0
/
librusecimport.py
executable file
·369 lines (352 loc) · 14.5 KB
/
librusecimport.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
#!/usr/bin/env python
# -*- mode: python; coding: utf-8; -*-
# (c) Lankier mailto:lankier@gmail.com
import sys, os
from hashlib import md5
import web
from brigantina import addfile, validator, libdb, utils
from fb2utils import parser # http://code.google.com/p/fb2utils/
# Directory the source book files are read from (joined with each file name
# in import_books and check_files).
src_dir = 'fb2'
# Base directory joined with a file id and handed to addfile.save_binaries.
dest_dir = 'files'
# Directory that receives the repaired copy of an fb2 file whose XML had to
# be re-parsed.
fixed_dir = 'fixed'
# Target database: every insert performed by the import goes here.
pgdb = web.database(dbn='postgres', db='library', user='con', pw='')
# Source database: the lib* tables are read from here.
# NOTE(review): credentials are hard-coded in the source file -- consider
# moving them to configuration or the environment.
mydb = web.database(dbn='mysql', db='rate', user='www', pw='a1s2d3')
## -- temporary indexes used during the import
def create_indexes():
    """Create an index on the ``librusecid`` column of each imported table.

    All five ``create index`` statements are sent to the PostgreSQL
    database in a single query call.
    """
    statements = '''\
create index files_librusecid_index on files (librusecid);
create index authors_librusecid_index on authors (librusecid);
create index authorsaliases_librusecid_index on authorsaliases (librusecid);
create index sequences_librusecid_index on sequences (librusecid);
create index publsequences_librusecid_index on publsequences (librusecid);'''
    pgdb.query(statements)
## -- removal of the temporary indexes (after the import)
def drop_indexes():
    """Drop the ``librusecid`` indexes created by :func:`create_indexes`.

    Each statement uses ``if exists``, so indexes that are already gone
    are not an error.
    """
    statements = '''\
drop index if exists files_librusecid_index cascade;
drop index if exists authors_librusecid_index cascade;
drop index if exists authorsaliases_librusecid_index cascade;
drop index if exists sequences_librusecid_index cascade;
drop index if exists publsequences_librusecid_index cascade;'''
    pgdb.query(statements)
def print_log(*s):
    """Print the arguments separated by single spaces and flush stdout.

    Unicode arguments are encoded to UTF-8 one by one.  Converting them
    with ``str()`` performs an implicit ASCII encode and raises
    ``UnicodeEncodeError`` on non-ASCII text; every other argument is
    converted with ``str()`` as before.
    """
    parts = []
    for item in s:
        if isinstance(item, unicode):
            parts.append(item.encode('utf-8'))
        else:
            parts.append(str(item))
    print(' '.join(parts))
    # Flush after every message so the log stays current during long runs.
    sys.stdout.flush()
def import_authors(resume=False):
    """Copy authors and their aliases from the MySQL source into PostgreSQL.

    An author row from ``libavtorname`` is skipped when

    * its id appears as ``BadId`` in ``libavtoraliase``,
    * both ``FirstName`` and ``LastName`` are empty, or
    * no ``libavtor`` / ``libtranslator`` row references it.

    Every remaining author is inserted into ``authors``; each
    ``libavtoraliase`` row that names the author as ``GoodId`` and whose
    ``BadId`` still has a ``libavtorname`` row is inserted into
    ``authorsaliases``.

    :param resume: when true, authors whose ``librusecid`` is already
        present in ``authors`` are skipped, so an interrupted import can
        be continued.  Defaults to ``False`` (the check is not performed).
    """
    nauthors = 0
    for author in mydb.select('libavtorname'):
        if resume:
            try:
                pgdb.select('authors', locals(),
                            where='librusecid=$author.AvtorId')[0]
            except IndexError:
                pass
            else:
                # already imported by an earlier run
                continue
        try:
            mydb.select('libavtoraliase', locals(),
                        where='BadId=$author.AvtorId')[0]
        except IndexError:
            pass
        else:
            # deleted: this id is listed as the "bad" side of an alias
            continue
        if not author.FirstName and not author.LastName:
            # nameless authors
            continue
        nbooks = 0
        nbooks += mydb.select('libavtor', locals(), what='count(*) as count',
                              where='AvtorId = $author.AvtorId')[0].count
        nbooks += mydb.select('libtranslator', locals(),
                              what='count(*) as count',
                              where='TranslatorId = $author.AvtorId')[0].count
        if nbooks == 0:
            # no books written or translated
            print_log('skip author:', author.AvtorId)
            continue
        newid = pgdb.insert('authors',
                            firstname=author.FirstName,
                            middlename=author.MiddleName,
                            lastname=author.LastName,
                            nickname=author.NickName,
                            librusecid=author.AvtorId)
        for alias in mydb.select('libavtoraliase', locals(),
                                 where='GoodId=$author.AvtorId'):
            # other names of the same author
            try:
                a = mydb.select('libavtorname', locals(),
                                where='AvtorId=$alias.BadId')[0]
            except IndexError:
                continue
            pgdb.insert('authorsaliases', False, authorid=newid,
                        firstname=a.FirstName, middlename=a.MiddleName,
                        lastname=a.LastName, nickname=a.NickName,
                        librusecid=a.AvtorId)
        nauthors += 1
    # Same text as before, routed through print_log so stdout is flushed.
    print_log('total', nauthors, 'added')
def get_author(aid):
    """Look an author up through the alias table.

    ``aid`` is matched against ``libavtoraliase.BadId``; the ``GoodId`` of
    the first matching row is then matched against ``authors.librusecid``.

    Returns the ``id`` of the ``authors`` row, or ``None`` when either
    lookup yields no row.
    """
    try:
        alias = mydb.select('libavtoraliase', {'aid': aid},
                            where='BadId=$aid')[0]
        target = pgdb.select('authors', {'res': alias},
                             where='librusecid=$res.GoodId')[0]
    except IndexError:
        return None
    return target.id
# XPath of the title-info element of an FB2 document; import_books passes it
# to addfile.get_elem to read 'src-lang'.
# NOTE(review): the 'm:' prefix is presumably bound to the FictionBook
# namespace inside addfile -- confirm there.
xpath_prefix = '/m:FictionBook/m:description/m:title-info'
def import_books(resume=False):
    """Import books and their files from the MySQL source into PostgreSQL.

    For every row of ``libbook`` that is not deleted and not oversized:

    1. the source file name is determined (``<BookId>.fb2`` for fb2 files,
       ``libfilename.FileName`` otherwise);
    2. fb2 files are validated; if validation fails the file is re-parsed
       with ``FB2Parser`` and validated again;
    3. authors are resolved and an existing book is searched for by title
       (and alternative titles) per author; if none is found a new book is
       created with its authors, genres and alternative titles;
    4. a ``files`` row is inserted and linked to the book, translators and
       series;
    5. for fb2 files the description, annotation, validation errors and
       binaries are saved.

    A summary line is logged at the end.

    :param resume: when true, books whose ``BookId`` is already stored as
        ``files.librusecid`` are skipped, so an interrupted import can be
        continued.  Defaults to ``False`` (the check is not performed).
    """
    nbooks = 0
    nfiles = 0
    for book in mydb.select('libbook'):
        if book.Deleted == '1':
            continue
        if book.FileSize >= 2147483647:
            # 2 GB (see http://www.flibusta.net/b/170630)
            print_log('too large:', book.BookId)
            continue
        if resume:
            # continuing after a previous failure
            try:
                pgdb.select('files', locals(),
                            where='librusecid=$book.BookId')[0]
            except IndexError:
                pass
            else:
                # this file has already been processed
                print_log('already exists:', book.BookId)
                continue
        if book.FileType == 'fb2':
            fn = str(book.BookId)+'.fb2'
        else:
            try:
                fn = mydb.select('libfilename', locals(),
                                 where='BookId=$book.BookId')[0].FileName
            except IndexError:
                print_log('filename not found:', book.BookId)
                continue
        fn = os.path.join(src_dir, fn)
        if book.FileType == 'fb2':
            if not os.path.exists(fn):
                print_log('not exists:', fn)
                continue
        soup = None
        errors = []
        if book.FileType == 'fb2':
            xml = validator.check_file(fn, errors)
            if not xml:
                print_log('bad xml:', fn)
                # repair the xml: re-parse it and validate the result
                with open(fn) as src:
                    data = src.read()
                soup = parser.FB2Parser(data, convertEntities='xml')
                xml = validator.check_str(str(soup.FictionBook), errors)
            assert xml is not None
        authors = set()         # ids of the book's authors
        bookid = None           # the book this file is attached to
        alttitles = []
        if book.Title1.startswith('='):
            # a leading '=' marks Title1 as a ';'-separated list of
            # alternative titles rather than a file title
            t1 = book.Title1[1:].strip()
            alttitles = t1.split(';')
            book.Title1 = ''
        for a in mydb.select('libavtor', locals(),
                             where='BookId=$book.BookId'):
            try:
                author = pgdb.select('authors', locals(),
                                     where='librusecid=$a.AvtorId')[0]
            except IndexError:
                aid = get_author(a.AvtorId)
                if not aid:
                    # deleted authors occasionally show up
                    print_log('author not found:', book.BookId, a.AvtorId)
                    continue
            else:
                aid = author.id
            authors.add(aid)
            if not bookid:
                books = libdb.find_book(book.Title, aid)
                if books:
                    # a matching book was found
                    bookid = books[0]
                else:
                    for t in alttitles:
                        books = libdb.find_book(t, aid)
                        if books:
                            # a matching book was found
                            bookid = books[0]
                            break
        if not authors:
            # fall back to the placeholder "unknown author" record
            a = pgdb.select('authors', where="lastname='Автор неизвестен'")[0]
            authors = [a.id]
        assert authors          # there must be at least one
        if not bookid:
            # create a new book
            if book.FileType == 'fb2':
                year = addfile.get_year(xml)
                lang = addfile.get_elem(xml, xpath_prefix, 'src-lang')
                bookid = pgdb.insert('books', year=year, lang=lang,
                                     title=book.Title)
            else:
                bookid = pgdb.insert('books', title=book.Title)
            # link it to its authors
            for aid in authors:
                pgdb.insert('booksauthors', False, bookid=bookid, authorid=aid)
            # genres
            genadded = False
            for g in mydb.select('libgenre', locals(),
                                 where='BookId=$book.BookId'):
                genreid = mydb.select('libgenrelist', locals(),
                                      where='GenreId=$g.GenreId')[0].GenreCode
                pgdb.insert('booksgenres', False, bookid=bookid,
                            genreid=genreid)
                genadded = True
            if not genadded:
                # no genre was added at all
                pgdb.insert('booksgenres', False, bookid=bookid,
                            genreid='other')
            for t in alttitles:
                try:
                    libdb.edit_book_add_alttitle(bookid, t.strip())
                except libdb.DBError:
                    # best effort: a title that cannot be added is skipped
                    pass
            nbooks += 1
        # title of the file: Title1 when present, otherwise the book title
        title = book.Title1 or book.Title
        if book.FileType == 'fb2':
            txt = utils.fb2txt(xml)
            textsize = len(txt)
            docinfo = '/m:FictionBook/m:description/m:document-info'
            fb2id = addfile.get_elem(xml, docinfo, 'id')
            v = addfile.get_elem(xml, docinfo, 'version')
            try:
                fb2version = float(v)
            except (TypeError, ValueError):
                # missing or non-numeric version
                fb2version = None
            ext = None
        else:
            textsize = book.FileSize
            fb2id = None
            fb2version = None
            ext = os.path.splitext(fn)[1]
        fileid = pgdb.insert('files', title=title,
                             lang=book.Lang, year=book.Year,
                             filetype=book.FileType,
                             fileauthor=book.FileAuthor,
                             filesize=book.FileSize, origext=ext,
                             textsize=textsize, md5=book.md5,
                             fb2id=fb2id, fb2version=fb2version,
                             librusecid=book.BookId)
        fileid = str(fileid)
        nfiles += 1
        # save the repaired file
        if book.FileType == 'fb2' and soup:
            fixed_path = os.path.join(fixed_dir, fileid)
            with open(fixed_path, 'w') as out:
                out.write(str(soup.FictionBook))
        # book <-> file link
        pgdb.insert('booksfiles', False, bookid=bookid, fileid=fileid)
        # translators
        translators = set()
        for trans in mydb.select('libtranslator', locals(),
                                 where='BookId=$book.BookId'):
            try:
                transid = pgdb.select('authors', locals(),
                                      where='librusecid=$trans.TranslatorId')[0]
            except IndexError:
                transid = get_author(trans.TranslatorId)
                if not transid:
                    print_log('translator not found:', book.BookId,
                              trans.TranslatorId)
                    continue
            else:
                transid = transid.id
            if transid in translators:
                continue
            translators.add(transid)
            # NOTE(review): unlike the other link tables this insert does not
            # pass seqname=False -- confirm bookstranslators has an id sequence.
            pgdb.insert('bookstranslators', fileid=fileid, authorid=transid)
        # series
        for seq in mydb.select('libseq', locals(),
                               where='BookId=$book.BookId'):
            seqname = mydb.select('libseqname', locals(),
                                  where='SeqId=$seq.SeqId')[0].SeqName
            if seq.Level >= 100:
                # publisher's series: attached to the file
                try:
                    seqid = pgdb.select('publsequences', locals(),
                                        where='librusecid=$seq.SeqId')[0].id
                except IndexError:
                    seqid = pgdb.insert('publsequences', name=seqname,
                                        librusecid=seq.SeqId)
                pgdb.insert('filessequences', False, fileid=fileid,
                            sequenceid=seqid, sequencenumber=seq.SeqNumb)
            else:
                # author's series: attached to the book
                try:
                    seqid = pgdb.select('sequences', locals(),
                                        where='librusecid=$seq.SeqId')[0].id
                except IndexError:
                    seqid = pgdb.insert('sequences', name=seqname,
                                        librusecid=seq.SeqId)
                try:
                    # another file may already have put this book in the series
                    pgdb.select('bookssequences', locals(),
                                where='bookid=$bookid and sequenceid=$seqid')[0]
                except IndexError:
                    pgdb.insert('bookssequences', False, bookid=bookid,
                                sequenceid=seqid, sequencenumber=seq.SeqNumb)
        if book.FileType == 'fb2':
            # description
            addfile.save_description(xml, fileid)
            # annotation
            addfile.save_annotation(xml, bookid, fileid)
            # validation errors
            libdb.save_fb2_errors(fileid, errors)
            # covers
            utils.makedir(fileid)
            bindir = os.path.join(dest_dir, fileid)
            addfile.save_binaries(xml, bindir, fileid)
        # TODO: save in other formats (txt, html)
        # TODO: move / copy / link the source file
    print_log(nbooks, 'books and', nfiles, 'files was imported')
def import_reviews():
    """Print the text of every ``libreviews`` row, each followed by a rule.

    The text is encoded to UTF-8, literal backslash-n sequences are turned
    into newlines, every newline is doubled, and surrounding whitespace is
    stripped before printing.
    """
    rule = '-'*70
    for review in mydb.select('libreviews'):
        body = review.Text.encode('utf-8')
        body = body.replace('\\n', '\n')
        body = body.replace('\n', '\n\n')
        print(body.strip())
        print(rule)
def cleanup():
    """Delete every row from the ``actions`` table in PostgreSQL.

    web.py's ``DB.delete`` takes ``where`` as a required argument, so an
    always-true condition is passed explicitly; calling it with the table
    name alone raises ``TypeError``.
    """
    pgdb.delete('actions', where='1=1')
def get_md5(fn):
    """Return the hexadecimal MD5 digest of the file at path ``fn``.

    The file is opened in binary mode so the digest is computed over the
    exact bytes on disk, and it is closed when reading finishes or fails.
    It is read in fixed-size chunks, so it never has to fit in memory.
    """
    digest = md5()
    with open(fn, 'rb') as fd:
        # iter() with a sentinel stops at the first empty read (EOF)
        for chunk in iter(lambda: fd.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
def check_files():
    """Check the fb2 source files against the checksums stored in ``libbook``.

    Deleted books and non-fb2 books are ignored.  For every other book the
    file ``<BookId>.fb2`` under ``src_dir`` is expected to exist and to have
    the MD5 digest recorded in the row; a missing file or a mismatching
    digest is logged.
    """
    for book in mydb.select('libbook'):
        if book.Deleted == '1' or book.FileType != 'fb2':
            continue
        path = os.path.join(src_dir, str(book.BookId)+'.fb2')
        if not os.path.exists(path):
            print_log('not exists:', path)
        elif get_md5(path) != book.md5:
            print_log('md5 fail:', path)
# Script driver: the steps below run at module load.  A step is enabled by
# uncommenting it; only import_reviews() is active at the moment.
# NOTE(review): the listed order (indexes, file check, authors, books,
# reviews, index removal, cleanup) looks like the intended pipeline
# sequence -- confirm before enabling several steps at once.
#create_indexes()
#check_files()
#import_authors()
#import_books()
import_reviews()
#drop_indexes()
#cleanup()