Skip to content

Commit

Permalink
Merge pull request #167 from goingdotin/accent_indexes
Browse files Browse the repository at this point in the history
Unaccent feature that mimics PostgreSQL's unaccent function
  • Loading branch information
cblanc committed Jan 26, 2018
2 parents a089e5f + b694232 commit e19ef40
Show file tree
Hide file tree
Showing 6 changed files with 266 additions and 3 deletions.
46 changes: 46 additions & 0 deletions app/lib/unaccent.js
@@ -0,0 +1,46 @@
"use strict";

// A map of accented characters found in place names to their unaccented
// equivalents.
// Handles circumflex, grave and acute accents for all vowels (+w and y),
// in both upper and lower case.
const accentMap = new Map([
  // Circumflex
  ["Â", "A"],
  ["â", "a"],
  ["Ê", "E"],
  ["ê", "e"],
  ["Î", "I"],
  ["î", "i"],
  ["Ô", "O"],
  ["ô", "o"],
  ["Û", "U"],
  ["û", "u"],
  ["Ŵ", "W"],
  ["ŵ", "w"],
  ["Ŷ", "Y"],
  ["ŷ", "y"],
  // Grave
  ["À", "A"],
  ["à", "a"],
  ["È", "E"],
  ["è", "e"],
  ["Ì", "I"],
  ["ì", "i"],
  ["Ò", "O"],
  ["ò", "o"],
  ["Ù", "U"],
  ["ù", "u"],
  ["Ẁ", "W"],
  ["ẁ", "w"],
  ["Ỳ", "Y"],
  ["ỳ", "y"],
  // Acute
  ["Á", "A"],
  ["á", "a"],
  ["É", "E"],
  ["é", "e"],
  ["Í", "I"],
  ["í", "i"],
  ["Ó", "O"],
  ["ó", "o"],
  ["Ú", "U"],
  ["ú", "u"],
  ["Ẃ", "W"],
  ["ẃ", "w"],
  ["Ý", "Y"],
  ["ý", "y"]
]);

// Character class matching every key of accentMap. Built once at load time so
// each call to unaccent is a single pass over the input. All keys are single
// UTF-16 code units and none is a regex metacharacter, so they can be joined
// into the class verbatim.
const accentPattern = new RegExp(`[${[...accentMap.keys()].join("")}]`, "g");

// Mimicking postgres unaccent function
// Necessary because indexes do not work if unaccent function is involved
// https://stackoverflow.com/questions/28899042/unaccent-preventing-index-usage-in-postgres/28899610#28899610
/**
 * Replaces every character listed in accentMap with its unaccented equivalent.
 * Characters that are not in the map are left untouched.
 *
 * @param  {string} str - Text that may contain accented characters
 * @return {string} Copy of `str` with mapped characters replaced. A non-string
 *                  argument is returned unchanged.
 */
const unaccent = str => {
  if (typeof str !== "string") return str;
  return str.replace(accentPattern, char => accentMap.get(char));
};

module.exports = unaccent;
9 changes: 6 additions & 3 deletions app/models/place.js
Expand Up @@ -14,6 +14,7 @@ const defaults = require(configPath)(env).defaults;
const searchDefaults = defaults.placesSearch;
const nearestDefaults = defaults.placesNearest;
const containsDefaults = defaults.placesContained;
const unaccent = require("../lib/unaccent.js");

const placeSchema = {
"id": "SERIAL PRIMARY KEY",
Expand Down Expand Up @@ -136,14 +137,16 @@ Place.prototype.search = function (options, callback) {
});
};

// Replacing postgres unaccent due to indexing issues
// https://stackoverflow.com/questions/28899042/unaccent-preventing-index-usage-in-postgres/28899610#28899610
const searchQuery = `
SELECT
${returnAttributes}
FROM
places
WHERE
name_1_search ~ unaccent($1)
OR name_2_search ~ unaccent($1)
name_1_search ~ $1
OR name_2_search ~ $1
LIMIT $2
`;

Expand All @@ -161,7 +164,7 @@ const searchQuery = `
* @return {undefined}
*/
Place.prototype._prefixSearch = function (options, callback) {
const regex = `^${escapeRegex(options.name)}.*`;
const regex = `^${unaccent(escapeRegex(options.name))}.*`;
const limit = options.limit;
this._query(searchQuery, [regex, limit], (error, result) => {
if (error) return callback(error);
Expand Down
3 changes: 3 additions & 0 deletions tests/helper/index.js
Expand Up @@ -565,6 +565,9 @@ module.exports = {
listDatabaseIndexes: listDatabaseIndexes,
locationWithNearbyPostcodes: locationWithNearbyPostcodes,

// Libs
unaccent: require("../../app/lib/unaccent.js"),

// Models
Base: Base,
AttributeBase: AttributeBase,
Expand Down
11 changes: 11 additions & 0 deletions tests/place.search.unit.js
Expand Up @@ -253,6 +253,17 @@ describe("Place Model", () => {
done();
});
});
it ("handles non-ascii character prefix searches", done => {
  // Search with an accented prefix one character short of the full place name
  const expectedName = "Mynydd-llêch";
  const query = { name: "Mynydd-llêc" };
  Place.search(query, (error, results) => {
    if (error) return done(error);
    // Exactly one place is expected to match the prefix
    assert.equal(results.length, 1);
    results.forEach(helper.isRawPlaceObject);
    assert.equal(results[0].name_1, expectedName);
    done();
  });
});
it ("handles hyphens as spaces", done => {
const name = "Llwyn-y-groes";
Place.search({ name: name }, (error, results) => {
Expand Down
19 changes: 19 additions & 0 deletions tests/remove_accents.unit.js
@@ -0,0 +1,19 @@
"use strict";

const assert = require("chai").assert;
const unaccent = require("./helper").unaccent;
const testData = require("./seed/accent_tests.json");

describe('unaccent', () => {
  it('removes diacritics the same way as postgres', () => {
    // Each key of the seed file is an accented input; its value is the
    // expected unaccented output.
    for (const accented of Object.keys(testData)) {
      const expected = testData[accented];
      assert.equal(unaccent(accented), expected);
    }
  });

  it('removes repeated accents', () => {
    // The same accented character twice in a row must be replaced both times
    const actual = unaccent('ÀÀ');
    assert.equal(actual, 'AA');
  });
});
181 changes: 181 additions & 0 deletions tests/seed/accent_tests.json
@@ -0,0 +1,181 @@
{
"Àird": "Aird",
"Dùn Gainmhich": "Dun Gainmhich",
"Àird nan Strùban": "Aird nan Struban",
"Baile Mhic' Phàil": "Baile Mhic' Phail",
"Dìurinis": "Diurinis",
"Baile Mhàrtainn": "Baile Mhartainn",
"Grìminis": "Griminis",
"Mìolabhaig": "Miolabhaig",
"An Àird Dhorcha": "An Aird Dhorcha",
"Crùlabhig": "Crulabhig",
"Dùn Chàrlabhaigh": "Dun Charlabhaigh",
"Ceòs": "Ceos",
"Lìonal": "Lional",
"Eòropaidh": "Eoropaidh",
"Eòradal": "Eoradal",
"Àird a' Mhulaidh": "Aird a' Mhulaidh",
"Àird Asaig": "Aird Asaig",
"Dail Mòr": "Dail Mor",
"Ceann Shìphoirt": "Ceann Shiphoirt",
"Port nan Giùran": "Port nan Giuran",
"Àird Dhail": "Aird Dhail",
"Àird Uig": "Aird Uig",
"Am Blàran Odhar": "Am Blaran Odhar",
"Raon na Crèadha": "Raon na Creadha",
"Cùl-cinn": "Cul-cinn",
"Brèbhig": "Brebhig",
"Steòrnabhagh": "Steornabhagh",
"Bràigh na h-Aoidhe": "Braigh na h-Aoidhe",
"Àird Mhidhinis": "Aird Mhidhinis",
"Crois Dùghaill": "Crois Dughaill",
"Àird Mhòr": "Aird Mhor",
"Pàirceanan": "Pairceanan",
"Àird Mhìghe": "Aird Mhighe",
"Eòlaigearraidh": "Eolaigearraidh",
"Leac a' Lì": "Leac a' Li",
"Geàrraidh na Mònadh": "Gearraidh na Monadh",
"Tobha Mòr": "Tobha Mor",
"Sruth Mòr": "Sruth Mor",
"Àird Mhìghe": "Aird Mhighe",
"Cille Bhrìghde": "Cille Bhrighde",
"Athmòr": "Athmor",
"Màraig": "Maraig",
"Dùn Bheagan": "Dun Bheagan",
"Bàgh a' Chàise": "Bagh a' Chaise",
"Bàgh Mòr": "Bagh Mor",
"Àird": "Aird",
"An t-Òrd": "An t-Ord",
"DuisdeiI Mòr": "DuisdeiI Mor",
"Àird Cumhang": "Aird Cumhang",
"Sàsaig": "Sasaig",
"An t-Àth Leathann": "An t-Ath Leathann",
"An Àrd": "An Ard",
"Allt nan Sùgh": "Allt nan Sugh",
"Sròndubh": "Srondubh",
"Fhaighear Mhòir": "Fhaighear Mhoir",
"Lòndubh": "Londubh",
"Drochaid Chàrr": "Drochaid Charr",
"An Gearraidh Mòr": "An Gearraidh Mor",
"Àird Tobha": "Aird Tobha",
"Cu' Dhèis": "Cu' Dheis",
"Àth-Tharracail": "Ath-Tharracail",
"Brèibhig": "Breibhig",
"Baile Mòr": "Baile Mor",
"Bàgh a'Chaisteil": "Bagh a'Chaisteil",
"Port Mòr": "Port Mor",
"An Saìlean": "An Sailean",
"An Cárn Dubh": "An Carn Dubh",
"Ceann Gheàrr Loch": "Ceann Ghearr Loch",
"A' Chrìon Làraich": "A' Chrion Laraich",
"An Fasadh Feàrna": "An Fasadh Fearna",
"Bogh Mòr": "Bogh Mor",
"Àrasaig": "Arasaig",
"Inbhir Nèill": "Inbhir Neill",
"Y Ffôr": "Y Ffor",
"Rhôs-porth-ychain": "Rhos-porth-ychain",
"Rhos-ddû": "Rhos-ddu",
"Rhôs-y-llan": "Rhos-y-llan",
"Tai'n-Iôn": "Tai'n-Ion",
"Pen-Iôn": "Pen-Ion",
"Môrawelon": "Morawelon",
"Pant Glâs": "Pant Glas",
"Plâs Llwyngwern": "Plas Llwyngwern",
"Cae Clŷd": "Cae Clyd",
"Pont Felin-y-ffrîdd": "Pont Felin-y-ffridd",
"Dol-fâch": "Dol-fach",
"Melin-y-ddôl": "Melin-y-ddol",
"Pen-Lôn": "Pen-Lon",
"Llandrillo-yn-Rhôs": "Llandrillo-yn-Rhos",
"Rhôs-on-Sea": "Rhos-on-Sea",
"Felin-hên": "Felin-hen",
"Hên-durnpike": "Hen-durnpike",
"Cefn Côch": "Cefn Coch",
"Ddôl Cownwy": "Ddol Cownwy",
"Waen-fâch": "Waen-fach",
"Pwll-glâs": "Pwll-glas",
"Penmaen Rhôs": "Penmaen Rhos",
"Llanelian-yn-Rhôs": "Llanelian-yn-Rhos",
"Dre-gôch": "Dre-goch",
"Mynydd-llêch": "Mynydd-llech",
"Llannerch-y-môr": "Llannerch-y-mor",
"Rhôs": "Rhos",
"Yr Hôb": "Yr Hob",
"Yr Hôb": "Yr Hob",
"Fron-dêg": "Fron-deg",
"Rhŷd-y-ceirw": "Rhyd-y-ceirw",
"Pant-glâs": "Pant-glas",
"Penarlâg": "Penarlag",
"Y Sgwâr": "Y Sgwar",
"Llys-y-frân": "Llys-y-fran",
"Glan-dŵr": "Glan-dwr",
"The Clôs": "The Clos",
"Cwm-pîb": "Cwm-pib",
"Troed-y-rhiw-Siôn": "Troed-y-rhiw-Sion",
"Pentre-bâch": "Pentre-bach",
"Ffos-y-ffîn": "Ffos-y-ffin",
"Penrhiw-pâl": "Penrhiw-pal",
"Fforest Gôch": "Fforest Goch",
"Pont-Siân": "Pont-Sian",
"Drefâch": "Drefach",
"Nant-y-ffîn": "Nant-y-ffin",
"Aber-Giâr": "Aber-Giar",
"Pant-y-crûg": "Pant-y-crug",
"Parc-y-rhôs": "Parc-y-rhos",
"Dôl-y-Bont": "Dol-y-Bont",
"Penffordd-Lâs": "Penffordd-Las",
"Pibwrlŵyd": "Pibwrlwyd",
"Pant-y-dŵr": "Pant-y-dwr",
"Tre'r-ddôl": "Tre'r-ddol",
"Cwmbâch": "Cwmbach",
"Twyn Bryn-hîr": "Twyn Bryn-hir",
"Tretŵr": "Tretwr",
"Dôl-forwyn": "Dol-forwyn",
"Rhiw-lâs": "Rhiw-las",
"Cwm Siôn Mathew": "Cwm Sion Mathew",
"Pen-tŵyn": "Pen-twyn",
"Pont-y-Gôf": "Pont-y-Gof",
"Lâleston": "Laleston",
"Coytrahên": "Coytrahen",
"Efail-fâch": "Efail-fach",
"Cwm-felin-fâch": "Cwm-felin-fach",
"Clawdd-côch": "Clawdd-coch",
"Gwenfô": "Gwenfo",
"Llandâf": "Llandaf",
"Pont-Siôn-Norton": "Pont-Sion-Norton",
"Pentwyn Berthlŵyd": "Pentwyn Berthlwyd",
"Cwmbrân": "Cwmbran",
"Castell-y-bŵch": "Castell-y-bwch",
"Àird Mhòr": "Aird Mhor",
"Loch a' Chàrnain": "Loch a' Charnain",
"An Àird": "An Aird",
"Tòrabhaig": "Torabhaig",
"Achadh a' Chùirn": "Achadh a' Chuirn",
"Diùranais": "Diuranais",
"Tàbost": "Tabost",
"Brù": "Bru",
"Acha Mòr": "Acha Mor",
"Càrlabhagh": "Carlabhagh",
"Àird Thunga": "Aird Thunga",
"Àird Shleibhe": "Aird Shleibhe",
"Mànais": "Manais",
"An Dùnan": "An Dunan",
"Stròlamas": "Strolamas",
"Cille Bhrìghde": "Cille Bhrighde",
"Port Rìgh": "Port Righ",
"Peighinn nam Fìdhleir": "Peighinn nam Fidhleir",
"Breacais Ìosal": "Breacais Iosal",
"Ceann Loch Iù": "Ceann Loch Iu",
"Tòcabhaig": "Tocabhaig",
"Àird a' Bhasair": "Aird a' Bhasair",
"An t-Àrchar": "An t-Archar",
"Cam Dhàil": "Cam Dhail",
"Cùl na Ceapaich": "Cul na Ceapaich",
"An Loch Geàrr": "An Loch Gearr",
"Dùn Obhainn": "Dun Obhainn",
"Sanclêr": "Sancler",
"Pont-y-pŵl": "Pont-y-pwl",
"Aberdâr": "Aberdar",
"Llanbethêry": "Llanbethery",
"Llansanffraid Gwynllŵg": "Llansanffraid Gwynllwg"
}

0 comments on commit e19ef40

Please sign in to comment.