# Task 1: Using RLTK to perform Entity Resolution (ER)

In [1]:
!pip install rltk



### Task 1-1. Construct RLTK Datasets

In [2]:
import csv
import re
import unicodedata

import rltk

In [3]:
# Module-level tokenizer instance; the record classes below call
# tokenizer.tokenize() on their normalized titles to build token sets.
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

#### Principles for selecting good attributes:
1. Does this attribute distinguish entities?
2. Does this attribute change over time?
3. How often is this attribute missing?

In [None]:
def normalize_text(text):
    """Return a lowercase, ASCII-only, whitespace-collapsed copy of ``text``.

    Steps, in order:
      1. Falsy input (``None`` or ``''``) yields ``''``.
      2. NFKD-decompose, then drop every non-ASCII code point.
      3. Lowercase.
      4. Replace ``&`` with the word ``and`` (space-padded so neighbours
         stay separate tokens).
      5. Replace every character that is not ``a-z``, ``0-9`` or whitespace
         with a space.
      6. Collapse whitespace runs to a single space and strip the ends.

    Args:
        text: The string to normalize, or a falsy value.

    Returns:
        The normalized string; ``''`` for falsy input.
    """
    if not text:
        return ''

    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII encode/decode round-trip then discards the marks (and any other
    # non-ASCII character).
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    text = text.lower()
    # Pad with spaces so "a&b" becomes "a and b" instead of the fused
    # token "aandb"; surplus spaces are collapsed below.
    text = text.replace('&', ' and ')
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [7]:
class GoodRecord(rltk.Record):
    """RLTK record exposing the ``ID``, ``ISBN13``, ``Title`` and
    ``FirstAuthor`` fields of a raw row, plus a normalized title and its
    token set.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # Placeholder attribute; nothing in the visible notebook reads it.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """The row's ``ID`` field (plain subscript lookup, no default)."""
        row = self.raw_object
        return row['ID']

    @rltk.cached_property
    def isbn(self):
        """The row's ``ISBN13`` field, or ``None`` when the key is absent."""
        row = self.raw_object
        return row.get('ISBN13')

    @rltk.cached_property
    def title(self):
        """The row's ``Title`` field, or ``''`` when the key is absent."""
        row = self.raw_object
        return row.get('Title', '')

    @rltk.cached_property
    def title_norm(self):
        """The title after passing through ``normalize_text``."""
        raw_title = self.title
        return normalize_text(raw_title)

    @rltk.cached_property
    def title_tokens(self):
        """Set of tokens produced by the shared ``tokenizer`` from the
        normalized title."""
        pieces = tokenizer.tokenize(self.title_norm)
        return set(pieces)

    @rltk.cached_property
    def author(self):
        """The row's ``FirstAuthor`` field, or ``''`` when the key is absent."""
        row = self.raw_object
        return row.get('FirstAuthor', '')

In [8]:
class NobleRecord(rltk.Record):
    """RLTK record exposing the ``ID``, ``ISBN13``, ``Title`` and
    ``Author1`` fields of a raw row, plus a normalized title and its
    token set.

    Mirrors ``GoodRecord`` property-for-property; only the author column
    name differs (``Author1`` here).
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # Placeholder attribute; nothing in the visible notebook reads it.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """The row's ``ID`` field (plain subscript lookup, no default)."""
        return self.raw_object['ID']

    @rltk.cached_property
    def isbn(self):
        """The row's ``ISBN13`` field, or ``None`` when the key is absent."""
        return self.raw_object.get('ISBN13')

    @rltk.cached_property
    def title(self):
        """The row's ``Title`` field, or ``''`` when the key is absent."""
        return self.raw_object.get('Title', '')

    @rltk.cached_property
    def title_norm(self):
        """The title after passing through ``normalize_text``.

        Uses the same normalizer as ``GoodRecord`` so titles from both
        datasets are compared in the same form.
        """
        # Was `normalize(...)`, a name not defined in this notebook.
        return normalize_text(self.title)

    @rltk.cached_property
    def title_tokens(self):
        """Set of tokens produced by the shared ``tokenizer`` from the
        normalized title."""
        return set(tokenizer.tokenize(self.title_norm))

    @rltk.cached_property
    def author(self):
        """The row's ``Author1`` field, or ``''`` when the key is absent."""
        return self.raw_object.get('Author1', '')