Skip to content
Browse files

Index all MBIDs, even the merged ones

  • Loading branch information...
1 parent 79f02d3 commit abfc548b87a714a33d7659ce80b6b45a31a41be4 @lalinsky committed Nov 10, 2012
Showing with 100 additions and 62 deletions.
  1. +1 −1 mbslave-solr-export.py
  2. +86 −57 mbslave/search.py
  3. +6 −3 solr/conf/schema.xml
  4. +1 −1 solr/conf/solrconfig.xml
  5. +6 −0 sql-extra/solr-queue.sql
View
2 mbslave-solr-export.py
@@ -11,7 +11,7 @@
db = connect_db(cfg, True)
print '<add>'
-for doc in fetch_all(cfg, db):
+for id, doc in fetch_all(cfg, db):
print ET.tostring(doc)
print '</add>'
View
143 mbslave/search.py
@@ -52,73 +52,80 @@ def __init__(self, name, foreign=None):
class ForeignColumn(Column):
- def __init__(self, table, name, foreign=None, null=False):
+ def __init__(self, table, name, foreign=None, null=False, backref=None):
super(ForeignColumn, self).__init__(name, foreign=foreign)
self.table = table
self.null = null
+ self.backref = backref
schema = Schema([
Entity('artist', [
- Field('id', Column('gid')),
+ Field('mbid', Column('gid')),
Field('disambiguation', Column('comment')),
Field('name', Column('name', ForeignColumn('artist_name', 'name'))),
Field('sort_name', Column('sort_name', ForeignColumn('artist_name', 'name'))),
Field('country', Column('country', ForeignColumn('country', 'name', null=True))),
Field('country_code', Column('country', ForeignColumn('country', 'iso_code', null=True))),
Field('gender', Column('gender', ForeignColumn('gender', 'name', null=True))),
Field('type', Column('type', ForeignColumn('artist_type', 'name', null=True))),
+ MultiField('mbid', ForeignColumn('artist_gid_redirect', 'gid', backref='new_id')),
MultiField('ipi', ForeignColumn('artist_ipi', 'ipi')),
MultiField('alias', ForeignColumn('artist_alias', 'name', ForeignColumn('artist_name', 'name'))),
]),
Entity('label', [
- Field('id', Column('gid')),
+ Field('mbid', Column('gid')),
Field('disambiguation', Column('comment')),
Field('code', Column('label_code')),
Field('name', Column('name', ForeignColumn('label_name', 'name'))),
Field('sort_name', Column('sort_name', ForeignColumn('label_name', 'name'))),
Field('country', Column('country', ForeignColumn('country', 'name', null=True))),
Field('country_code', Column('country', ForeignColumn('country', 'iso_code', null=True))),
Field('type', Column('type', ForeignColumn('label_type', 'name', null=True))),
+ MultiField('mbid', ForeignColumn('label_gid_redirect', 'gid', backref='new_id')),
MultiField('ipi', ForeignColumn('label_ipi', 'ipi')),
MultiField('alias', ForeignColumn('label_alias', 'name', ForeignColumn('label_name', 'name'))),
]),
Entity('work', [
- Field('id', Column('gid')),
+ Field('mbid', Column('gid')),
Field('disambiguation', Column('comment')),
Field('name', Column('name', ForeignColumn('work_name', 'name'))),
Field('type', Column('type', ForeignColumn('work_type', 'name', null=True))),
+ MultiField('mbid', ForeignColumn('work_gid_redirect', 'gid', backref='new_id')),
MultiField('iswc', ForeignColumn('iswc', 'iswc')),
MultiField('alias', ForeignColumn('work_alias', 'name', ForeignColumn('work_name', 'name'))),
]),
Entity('release_group', [
- Field('id', Column('gid')),
+ Field('mbid', Column('gid')),
Field('disambiguation', Column('comment')),
Field('name', Column('name', ForeignColumn('release_name', 'name'))),
Field('type', Column('type', ForeignColumn('release_group_primary_type', 'name', null=True))),
+ MultiField('mbid', ForeignColumn('release_group_gid_redirect', 'gid', backref='new_id')),
MultiField('type',
ForeignColumn('release_group_secondary_type_join', 'secondary_type',
ForeignColumn('release_group_secondary_type', 'name'))),
Field('artist', Column('artist_credit', ForeignColumn('artist_credit', 'name', ForeignColumn('artist_name', 'name')))),
MultiField('alias', ForeignColumn('release', 'name', ForeignColumn('release_name', 'name'))),
]),
Entity('release', [
- Field('id', Column('gid')),
+ Field('mbid', Column('gid')),
Field('disambiguation', Column('comment')),
Field('barcode', Column('barcode')),
Field('name', Column('name', ForeignColumn('release_name', 'name'))),
Field('status', Column('status', ForeignColumn('release_status', 'name', null=True))),
Field('type', Column('release_group', ForeignColumn('release_group', 'type', ForeignColumn('release_group_primary_type', 'name', null=True)))),
Field('artist', Column('artist_credit', ForeignColumn('artist_credit', 'name', ForeignColumn('artist_name', 'name')))),
+ MultiField('mbid', ForeignColumn('release_gid_redirect', 'gid', backref='new_id')),
MultiField('catno', ForeignColumn('release_label', 'catalog_number')),
MultiField('label', ForeignColumn('release_label', 'label', ForeignColumn('label', 'name', ForeignColumn('label_name', 'name')))),
Field('alias', Column('release_group', ForeignColumn('release_group', 'name', ForeignColumn('release_name', 'name')))),
]),
Entity('recording', [
- Field('id', Column('gid')),
+ Field('mbid', Column('gid')),
Field('disambiguation', Column('comment')),
Field('name', Column('name', ForeignColumn('track_name', 'name'))),
Field('artist', Column('artist_credit', ForeignColumn('artist_credit', 'name', ForeignColumn('artist_name', 'name')))),
+ MultiField('mbid', ForeignColumn('recording_gid_redirect', 'gid', backref='new_id')),
MultiField('alias', ForeignColumn('track', 'name', ForeignColumn('track_name', 'name'))),
]),
])
@@ -135,8 +142,9 @@ def __init__(self, table, name, foreign=None, null=False):
END;
$$ LANGUAGE 'plpgsql';
+DROP TRIGGER IF EXISTS mbslave_solr_tr_%(op1)s_%(table)s ON musicbrainz.%(table)s;
+CREATE TRIGGER mbslave_solr_tr_%(op1)s_%(table)s AFTER %(op2)s ON musicbrainz.%(table)s FOR EACH ROW EXECUTE PROCEDURE mbslave_solr_%(op1)s_%(table)s();
"""
-#--CREATE TRIGGER mbslave_solr_tr_%(op1)s_%(table)s AFTER %(op2)s ON musicbrainz.%(table)s FOR EACH ROW EXECUTE PROCEDURE mbslave_solr_%(op1)s_%(table)s();
def distinct_values(columns):
@@ -172,20 +180,21 @@ def generate_triggers():
for field in entity.iter_multi_fields():
column = field.column
+ backref = field.column.backref or entity.name
path = []
while column:
path.insert(0, (column.table, column.name))
column = column.foreign
for i in range(0, len(path)):
table, column, values = generate_trigger_update(path[i:])
- deps.setdefault(table, {}).setdefault((entity.name, 'NEW', entity.name, values), []).append(column)
+ deps.setdefault(table, {}).setdefault((entity.name, 'NEW', backref, values), []).append(column)
# Changed parent row
- deps.setdefault(field.column.table, {}).setdefault((entity.name, 'NEW', entity.name, None), []).append(entity.name)
- deps.setdefault(field.column.table, {}).setdefault((entity.name, 'OLD', entity.name, None), []).append(entity.name)
+ deps.setdefault(field.column.table, {}).setdefault((entity.name, 'NEW', backref, None), []).append(backref)
+ deps.setdefault(field.column.table, {}).setdefault((entity.name, 'OLD', backref, None), []).append(backref)
# Inserted or deleted new child row
- ins_del_deps.setdefault(field.column.table, set()).add((entity.name, entity.name))
+ ins_del_deps.setdefault(field.column.table, set()).add((entity.name, backref))
for table, kinds in sorted(ins_del_deps.items()):
sections = []
@@ -254,24 +263,24 @@ def iter_main(db, kind, ids=()):
names.append(field.name)
query = generate_iter_query(columns, joins, ids)
- with closing(db.cursor('cursor_' + kind)) as cursor:
- cursor.itersize = 100 * 1000
- cursor.execute(query, ids)
- for row in cursor:
- id = row[0]
- fields = [E.field(kind, name='kind')]
- for name, value in zip(names, row[1:]):
- if not value:
- continue
- if isinstance(value, str):
- value = value.decode('utf8')
- elif not isinstance(value, unicode):
- value = unicode(value)
- try:
- fields.append(E.field(value, name=name))
- except ValueError:
- continue # XXX
- yield id, fields
+ cursor = db.cursor('cursor_' + kind)
+ cursor.itersize = 100 * 1000
+ cursor.execute(query, ids)
+ for row in cursor:
+ id = row[0]
+ fields = [E.field(kind, name='kind'), E.field('%s:%s' % (kind, id), name='id')]
+ for name, value in zip(names, row[1:]):
+ if not value:
+ continue
+ if isinstance(value, str):
+ value = value.decode('utf8')
+ elif not isinstance(value, unicode):
+ value = unicode(value)
+ try:
+ fields.append(E.field(value, name=name))
+ except ValueError:
+ continue # XXX
+ yield id, fields
def iter_sub(db, kind, subtable, ids=()):
@@ -290,7 +299,7 @@ def iter_sub(db, kind, subtable, ids=()):
if table not in tables:
joins.append(table)
tables.add(table)
- columns.append('%s.%s' % (table, kind))
+ columns.append('%s.%s' % (table, column.backref or kind))
else:
foreign_table = table + '__' + last_column.name + '__' + column.table
if foreign_table not in tables:
@@ -307,31 +316,31 @@ def iter_sub(db, kind, subtable, ids=()):
names.append(field.name)
query = generate_iter_query(columns, joins, ids)
- with closing(db.cursor('cursor_' + kind + '_' + subtable)) as cursor:
- cursor.itersize = 100 * 1000
- cursor.execute(query, ids)
- fields = []
- last_id = None
- for row in cursor:
- id = row[0]
- if last_id != id:
- if fields:
- yield last_id, fields
- last_id = id
- fields = []
- for name, value in zip(names, row[1:]):
- if not value:
- continue
- if isinstance(value, str):
- value = value.decode('utf8')
- elif not isinstance(value, unicode):
- value = unicode(value)
- try:
- fields.append(E.field(value, name=name))
- except ValueError:
- continue # XXX
- if fields:
- yield last_id, fields
+ cursor = db.cursor('cursor_' + kind + '_' + subtable)
+ cursor.itersize = 100 * 1000
+ cursor.execute(query, ids)
+ fields = []
+ last_id = None
+ for row in cursor:
+ id = row[0]
+ if last_id != id:
+ if fields:
+ yield last_id, fields
+ last_id = id
+ fields = []
+ for name, value in zip(names, row[1:]):
+ if not value:
+ continue
+ if isinstance(value, str):
+ value = value.decode('utf8')
+ elif not isinstance(value, unicode):
+ value = unicode(value)
+ try:
+ fields.append(E.field(value, name=name))
+ except ValueError:
+ continue # XXX
+ if fields:
+ yield last_id, fields
def placeholders(ids):
@@ -353,7 +362,7 @@ def merge(main, *extra):
if extra_item[0] == id:
fields.extend(extra_item[1])
current[i] = grab_next(extra[i])
- yield E.doc(*fields)
+ yield id, E.doc(*fields)
def fetch_entities(db, kind, ids=()):
@@ -400,6 +409,26 @@ def fetch_all(cfg, db):
fetch_labels(db) if cfg.solr.index_labels else [])
+def fetch_all_updated(cfg, db):
+ queue = cfg.schema.name("mbslave") + ".mbslave_solr_queue"
+ updated = {}
+ cursor = db.cursor()
+ cursor.execute("SELECT id, entity_type, entity_id FROM " + queue)
+ for id, kind, entity_id in cursor:
+ if kind not in updated:
+ updated[kind] = set()
+ db.cursor().execute("DELETE FROM " + queue + " WHERE id = %s", (id,))
+ updated[kind].add(entity_id)
+ for kind, ids in updated.iteritems():
+ if getattr(cfg.solr, 'index_%ss' % kind):
+ missing = set(ids)
+ for id, doc in fetch_entities(db, kind, list(ids)):
+ missing.remove(id)
+ yield E.add(doc)
+ if missing:
+ yield E.delete(*map(E.id, ['%s:%s' % (kind, id) for id in missing]))
+
+
class SolrReplicationHook(ReplicationHook):
def __init__(self, cfg, db, schema):
View
9 solr/conf/schema.xml
@@ -19,6 +19,9 @@
<fields>
+ <!-- Internal ID: "<kind>:<rowid>" -->
+ <field name="id" type="string" indexed="true" stored="true" required="true" />
+
<!--
Type of the entity this document stores. Can be one of:
- artist
@@ -30,8 +33,8 @@
-->
<field name="kind" type="string" indexed="true" stored="true" required="true" />
- <!-- For most entity types, it's just the MBID. It must uniquely identify the document. -->
- <field name="id" type="string" indexed="true" stored="true" required="true" />
+ <!-- MusicBrainz unique ID. This field also includes merged MBIDs. -->
+ <field name="mbid" type="string" indexed="true" stored="true" required="true" multiValued="true" />
<!-- Artist name, label name, release group title, release or recording title. -->
<field name="name" type="text" indexed="true" />
@@ -49,7 +52,7 @@
<field name="artist" type="text" indexed="true" multiValued="true" />
<!-- Artist type, label type, work type. -->
- <field name="type" type="text" indexed="true" />
+ <field name="type" type="text" indexed="true" multiValued="true" />
<!-- Release status. -->
<field name="status" type="text" indexed="true" />
View
2 solr/conf/solrconfig.xml
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8" ?>
<config>
- <luceneMatchVersion>LUCENE_35</luceneMatchVersion>
+ <luceneMatchVersion>LUCENE_40</luceneMatchVersion>
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>
<updateHandler class="solr.DirectUpdateHandler2" />
View
6 sql-extra/solr-queue.sql
@@ -0,0 +1,6 @@
+CREATE TABLE mbslave_solr_queue (
+ id serial NOT NULL PRIMARY KEY,
+ entity_type text NOT NULL,
+ entity_id int NOT NULL
+);
+

0 comments on commit abfc548

Please sign in to comment.
Something went wrong with that request. Please try again.