Skip to content

Commit

Permalink
Add basic deduping
Browse files Browse the repository at this point in the history
  • Loading branch information
jpatokal committed Jan 7, 2019
1 parent 1d46a4a commit 06fcfcd
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 22 deletions.
79 changes: 68 additions & 11 deletions data/update_airlines.py
Expand Up @@ -9,14 +9,18 @@

import argparse
import codecs
import difflib
import mysql.connector
import sys
import urllib2
from collections import defaultdict
from HTMLParser import HTMLParser
from pprint import pprint

import database_connector

# TODO: Scrape IATA members at https://www.iata.org/about/members/Pages/airline-list.aspx?All=true

class HTMLCleaner(HTMLParser):
def __init__(self):
self.reset()
Expand Down Expand Up @@ -54,15 +58,48 @@ def load_all_airlines(self):

def match(self, wp):
icao, iata, callsign, country = wp['icao'], wp['iata'], wp['callsign'], wp['country']
match = None
dupe = None

# Round 1: Find high-probability matches
if icao and icao in self.of_icao:
for airline in self.of_icao[icao]:
if (iata and airline['iata'] == iata) or airline['callsign'] == callsign or airline['country'] == country:
return airline
if iata and iata in self.of_iata:
match = airline
break
if not match and iata and iata in self.of_iata:
for airline in self.of_iata[iata]:
if airline['callsign'] == callsign or airline['country'] == country:
return airline
return None
match = airline
break

# Round 2: Find potential duplicates
if match and 'iata' in match and match['iata']:
for airline in self.of_iata[match['iata']]:
if airline == match:
continue
# Different countries? Not dupes.
if airline['country'] != match['country']:
continue
# If non-null ICAO codes same, guaranteed dupe; if different, not dupe
if airline['icao'] and match['icao']:
if airline['icao'] == match['icao']:
dupe = airline
else:
continue

# If non-null callsigns same, guaranteed dupe; if different, not dupe
if airline['callsign'] and match['callsign']:
if airline['callsign'].upper() == match['callsign'].upper():
dupe = airline
else:
continue

# Are names very similar?
if difflib.SequenceMatcher(None, airline['name'], match['name']).ratio() > 0.8:
dupe = airline

return match, dupe

def diff(self, of, wp):
fields = {}
Expand All @@ -71,10 +108,16 @@ def diff(self, of, wp):
fields[field] = wp[field]
return fields

def update_from_wp(self, of, wp):
def update_from_wp(self, of, wp, dupe):
if dupe:
self.aldb.deduplicate(of['alid'], dupe['alid'])

fields = self.diff(of, wp)
if fields:
self.aldb.update_from_wp(of['alid'], fields)
return 1
else:
return 0

class AirlineDB(database_connector.DatabaseConnector):
def add_new(self, wp):
Expand All @@ -86,6 +129,10 @@ def update_from_wp(self, alid, fields):
field_string = ', '.join(map(lambda k: "%s='%s'" % (k, fields[k].replace("'", "''")), fields.keys()))
self.safe_execute('UPDATE airlines SET ' + field_string + ' WHERE alid=%s', (alid, ))

def deduplicate(self, main_id, dupe_id):
self.safe_execute('UPDATE flights SET alid=%s WHERE alid=%s;', (main_id, dupe_id, ))
self.safe_execute('DELETE airlines WHERE alid=%s;', (dupe_id, ))

class WikipediaArticle(object):
def __init__(self):
self.cleaner = HTMLCleaner()
Expand Down Expand Up @@ -133,6 +180,10 @@ def clean(self, x):
return None
return x

def pp(airline):
alid = airline['alid'] if 'alid' in airline else 'N/A'
return ('%s (%s/%s, %s)' % (airline['name'], airline['iata'], airline['icao'], alid))

if __name__ == "__main__":
# Needed to allow piping UTF-8 (srsly Python wtf)
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
Expand All @@ -147,22 +198,28 @@ def clean(self, x):
ofa.load_all_airlines()

count = 0
matched = 0
updated = 0
deduped = 0
added = 0
wpa = WikipediaArticle()
for c in xrange(ord('A'), ord('Z')+1):
wpa.load(chr(c))
print "### %s" % chr(c)
for airline in wpa.airlines:
of_airline = ofa.match(airline)
(of_airline, dupe) = ofa.match(airline)
if of_airline:
print "> MATCH %s %s" % (airline, of_airline)
ofa.update_from_wp(of_airline, airline)
updated += 1
print "> MATCH %s == %s" % (pp(airline), pp(of_airline))
matched += 1
updated += ofa.update_from_wp(of_airline, airline, dupe)
if dupe:
print ">> DUPE %s -> %s" % (pp(dupe), pp(of_airline))
deduped += 1
else:
print "= NEW %s" % airline
print "= NEW %s" % pp(airline)
aldb.add_new(airline)
added += 1

count += 1

print "%s new, %s updated, %s total" % (added, updated, count)
print "%s matched with %s updated and %s deduped, %s added, %s total" % (matched, updated, deduped, added, count)
77 changes: 66 additions & 11 deletions data/update_airlines_test.py
@@ -1,19 +1,23 @@
# -*- coding: utf-8 -*-

import update_airlines

import argparse
import mock
import unittest

class UpdateAirlinesTest(unittest.TestCase):

def setUp(self):
self.fake_aldb = mock.Mock(spec=update_airlines.AirlineDB)
self.ofa = update_airlines.OpenFlightsAirlines(self.fake_aldb)
self.of = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
self.ofa.of_iata['AB'] = [self.of]
self.ofa.of_icao['ABC'] = [self.of]
self.indexAirline(self.of)
self.wpa = update_airlines.WikipediaArticle()

def indexAirline(self, airline):
self.ofa.of_iata[airline['iata']].append(airline)
self.ofa.of_icao[airline['icao']].append(airline)

def testClean(self):
for input, output in [
["" , None],
Expand All @@ -33,42 +37,93 @@ def testClean(self):
# ICAO and callsign or country matches
def testExactMatch(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
self.assertEquals(self.ofa.match(wp), self.of)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {})

def testNameChange(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Ahvenanmaa Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
self.assertEquals(self.ofa.match(wp), self.of)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {'name': 'Ahvenanmaa Airlines'})

def testIcaoIataMatch(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ZZZZZ', 'country': 'ZZ'}
self.assertEquals(self.ofa.match(wp), self.of)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {'callsign': 'ZZZZZ'})

def testIcaoCallsignMatch(self):
wp = {'icao': 'ABC', 'iata': 'ZZ', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'ZZ'}
self.assertEquals(self.ofa.match(wp), self.of)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {'iata': 'ZZ'})

def testIcaoCountryMatch(self):
wp = {'icao': 'ABC', 'iata': 'ZZ', 'name': 'Aland Airlines', 'callsign': 'ZZZZZ', 'country': 'AX'}
self.assertEquals(self.ofa.match(wp), self.of)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {'callsign': 'ZZZZZ', 'iata': 'ZZ'})

def testIcaoNotMatch(self):
wp = {'icao': 'ABC', 'iata': 'ZZ', 'name': 'Aland Airlines', 'callsign': 'ZZZZZ', 'country': 'ZZ'}
self.assertEquals(self.ofa.match(wp), None)
self.assertEquals(self.ofa.match(wp), (None, None))

# IATA and callsign matches
def testIataCallsignMatch(self):
wp = {'icao': 'ZZZ', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'ZZ'}
self.assertEquals(self.ofa.match(wp), self.of)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {'icao': 'ZZZ'})

def testIataNotMatch(self):
wp = {'icao': 'ZZZ', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ZZZZZ', 'country': 'ZZ'}
self.assertEquals(self.ofa.match(wp), None)
self.assertEquals(self.ofa.match(wp), (None, None))

# Accepted duplicates
def testExactMatchWithIcaoMatchDupe(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
dupe = {'icao': 'ABC', 'iata': 'AB', 'name': 'Zork Airlines', 'callsign': 'ZORK', 'country': 'AX'}
self.indexAirline(dupe)
self.assertEquals(self.ofa.match(wp), (self.of, dupe))
self.assertEquals(self.ofa.diff(self.of, wp), {})

def testExactMatchWithCallsignDupe(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
dupe = {'icao': '', 'iata': 'AB', 'name': 'Zork Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
self.indexAirline(dupe)
self.assertEquals(self.ofa.match(wp), (self.of, dupe))
self.assertEquals(self.ofa.diff(self.of, wp), {})

def testExactMatchWithNameDupe(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
dupe = {'icao': '', 'iata': 'AB', 'name': 'Åland Airlines', 'callsign': '', 'country': 'AX'}
self.indexAirline(dupe)
self.assertEquals(self.ofa.match(wp), (self.of, dupe))
self.assertEquals(self.ofa.diff(self.of, wp), {})

# Rejected near-dupes
def testExactMatchWithDifferentCountryNonDupe(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
dupe = {'icao': '', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'XA'}
self.indexAirline(dupe)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {})

def testExactMatchWithDifferentCallsignNonDupe(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
dupe = {'icao': '', 'iata': 'AB', 'name': 'Zork Airlines', 'callsign': 'ZORKA', 'country': 'AX'}
self.indexAirline(dupe)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {})

def testExactMatchWithDifferentIcaoNonDupe(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
dupe = {'icao': 'DEF', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
self.indexAirline(dupe)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {})

def testExactMatchWithNameNonDupe(self):
wp = {'icao': 'ABC', 'iata': 'AB', 'name': 'Aland Airlines', 'callsign': 'ALAXA', 'country': 'AX'}
dupe = {'icao': '', 'iata': 'AB', 'name': 'Irrelevant Airlines', 'callsign': '', 'country': 'AX'}
self.indexAirline(dupe)
self.assertEquals(self.ofa.match(wp), (self.of, None))
self.assertEquals(self.ofa.diff(self.of, wp), {})

if __name__ == '__main__':
unittest.main()

0 comments on commit 06fcfcd

Please sign in to comment.