# Task 1: Using RLTK to perform Entity Resolution (ER)

In [1]:
!pip install rltk



### Task 1.1. Construct RLTK Datasets

In [8]:
import rltk
import csv
import re
import unicodedata

In [3]:
# Module-level CrfTokenizer instance; its tokenize() method is what the record
# classes' title_tokens properties call on the normalized title.
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

#### Principles for selecting good attributes:
1. Does this attribute distinguish entities?
2. Does this attribute change over time?
3. How often is this attribute missing?

In [4]:
def normalize_text(text):
    """Return a lowercase, ASCII-only, punctuation-free version of ``text``.

    Processing steps, in order: Unicode NFKD decomposition with every
    non-ASCII character dropped, lowercasing, ``&`` spelled out as the word
    "and", every character other than ``a-z``, ``0-9`` and whitespace replaced
    by a space, and runs of whitespace collapsed to a single space.

    Args:
        text: The raw string to normalize; may be ``None`` or empty.

    Returns:
        The normalized string, or ``''`` when ``text`` is falsy.
    """
    if not text:
        return ''

    # Unicode normalization: split accented characters into base letter plus
    # combining mark, then drop everything that is not plain ASCII.
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    text = text.lower()
    # Pad the replacement with spaces so "r&b" becomes "r and b" instead of
    # the fused token "randb"; surplus spaces are collapsed just below.
    text = text.replace('&', ' and ')
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [14]:
from datetime import datetime, date

# Three-letter month abbreviations mapped to their full English month names.
MONTH_FIX = {
    'jan': 'january', 'feb': 'february', 'mar': 'march',
    'apr': 'april', 'may': 'may', 'jun': 'june',
    'jul': 'july', 'aug': 'august', 'sep': 'september',
    'oct': 'october', 'nov': 'november', 'dec': 'december'
}

# Month number (1-12) for each abbreviation; MONTH_FIX lists its keys in
# calendar order, so the position of a key is its month number.
_MONTH_NUMBER = {abbr: number for number, abbr in enumerate(MONTH_FIX, start=1)}

# Two-digit years below this value are read as 20xx, all others as 19xx.
_TWO_DIGIT_YEAR_PIVOT = 30


def _expand_two_digit_year(year):
    """Turn a two-digit year (0-99) into a four-digit year using the pivot."""
    return year + (2000 if year < _TWO_DIGIT_YEAR_PIVOT else 1900)


def normalize_publish_date(raw_date):
    """Parse a free-form publication date string into a ``datetime.date``.

    Recognized inputs (case-insensitive, ordinal suffixes and commas ignored):
    ``"Jun-14"`` (month abbreviation + two-digit year, day set to 1),
    ``"August 1 2000"``, ``"August 2000"``, ``"12/12/14"``, ``"12/12/2014"``
    and ``"1966"``. Missing day/month components default to 1.

    Two-digit years are expanded with one pivot for every format: values
    below 30 become 20xx, the rest 19xx.

    Args:
        raw_date: The raw date string; may be ``None`` or empty.

    Returns:
        A ``datetime.date``, or ``None`` when the input is empty or matches
        none of the recognized formats.
    """
    if not raw_date:
        return None

    raw_date = raw_date.strip().lower()

    # remove ordinal suffixes: 1st, 2nd, 3rd, 28th
    raw_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', raw_date)

    # drop commas so "august 1, 2000" parses the same as "august 1 2000"
    raw_date = re.sub(r'\s+', ' ', raw_date.replace(',', ' ')).strip()

    # handle formats "Jun-14"
    short_form = re.match(r'^([a-z]{3})-(\d{2})$', raw_date)
    if short_form:
        month = _MONTH_NUMBER.get(short_form.group(1))
        if month is None:
            # Three letters that are not a month abbreviation ("abc-14"):
            # report "unparseable" like every other bad input instead of
            # letting a ValueError escape.
            return None
        year = _expand_two_digit_year(int(short_form.group(2)))
        return date(year, month, 1)

    # try known formats
    formats = [
        "%B %d %Y",     # August 1 2000
        "%B %Y",        # August 2000
        "%m/%d/%y",     # 12/12/14
        "%m/%d/%Y",     # 12/12/2014
        "%Y"            # 1966
    ]

    for fmt in formats:
        try:
            parsed = datetime.strptime(raw_date, fmt).date()
        except ValueError:
            continue
        if fmt == "%m/%d/%y":
            # strptime expands %y with its own pivot (69); re-expand with the
            # pivot used for "Jun-14" so both two-digit forms agree.
            parsed = parsed.replace(year=_expand_two_digit_year(parsed.year % 100))
        return parsed

    return None

In [15]:
class GoodRecord(rltk.Record):
    """Record class for rows whose columns are ID, ISBN13, Title, FirstAuthor and PublishDate."""

    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        """The row's ``ID`` value (a missing column raises ``KeyError``)."""
        row = self.raw_object
        return row['ID']

    @rltk.cached_property
    def isbn(self):
        """The row's ``ISBN13`` value, or ``None`` when the column is absent."""
        row = self.raw_object
        return row.get('ISBN13')

    @rltk.cached_property
    def title(self):
        """The row's ``Title`` value, or ``''`` when the column is absent."""
        row = self.raw_object
        return row.get('Title', '')

    @rltk.cached_property
    def title_norm(self):
        """The title after passing through ``normalize_text``."""
        raw_title = self.title
        return normalize_text(raw_title)

    @rltk.cached_property
    def title_tokens(self):
        """Set of tokens the shared ``tokenizer`` produces from the normalized title."""
        tokens = tokenizer.tokenize(self.title_norm)
        return set(tokens)

    @rltk.cached_property
    def author(self):
        """The row's ``FirstAuthor`` value, or ``''`` when the column is absent."""
        row = self.raw_object
        return row.get('FirstAuthor', '')

    @rltk.cached_property
    def publication_date(self):
        """The ``PublishDate`` column parsed by ``normalize_publish_date``."""
        raw_value = self.raw_object.get('PublishDate')
        return normalize_publish_date(raw_value)

    @rltk.cached_property
    def publication_year(self):
        """Year of ``publication_date``, or ``None`` when no date was parsed."""
        parsed = self.publication_date
        return parsed.year if parsed else None

In [16]:
class NobleRecord(rltk.Record):
    """Record class for rows whose columns are ID, ISBN13, Title, Author1 and PublicationDate."""

    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        """The row's ``ID`` value (a missing column raises ``KeyError``)."""
        row = self.raw_object
        return row['ID']

    @rltk.cached_property
    def isbn(self):
        """The row's ``ISBN13`` value, or ``None`` when the column is absent."""
        row = self.raw_object
        return row.get('ISBN13')

    @rltk.cached_property
    def title(self):
        """The row's ``Title`` value, or ``''`` when the column is absent."""
        row = self.raw_object
        return row.get('Title', '')

    @rltk.cached_property
    def title_norm(self):
        """The title after passing through ``normalize_text``."""
        raw_title = self.title
        return normalize_text(raw_title)

    @rltk.cached_property
    def title_tokens(self):
        """Set of tokens the shared ``tokenizer`` produces from the normalized title."""
        tokens = tokenizer.tokenize(self.title_norm)
        return set(tokens)

    @rltk.cached_property
    def author(self):
        """The row's ``Author1`` value, or ``''`` when the column is absent."""
        row = self.raw_object
        return row.get('Author1', '')

    @rltk.cached_property
    def publication_date(self):
        """The ``PublicationDate`` column parsed by ``normalize_publish_date``."""
        raw_value = self.raw_object.get('PublicationDate')
        return normalize_publish_date(raw_value)

    @rltk.cached_property
    def publication_year(self):
        """Year of ``publication_date``, or ``None`` when no date was parsed."""
        parsed = self.publication_date
        return parsed.year if parsed else None

In [17]:
# Prefix prepended to both CSV file names; empty means the paths are relative
# to the current working directory.
dir_ = ''
good_file = dir_ + 'goodreads.csv'
noble_file = dir_ + 'barnes_and_nobles.csv'

# One RLTK dataset per source, each wrapping its rows in the matching record class.
good_reader = rltk.CSVReader(good_file)
ds1 = rltk.Dataset(good_reader, record_class=GoodRecord)

noble_reader = rltk.CSVReader(noble_file)
ds2 = rltk.Dataset(noble_reader, record_class=NobleRecord)

In [19]:
# Preview the first five records of ds1. Leaving the frame as the cell's last
# expression lets the notebook render it with its rich display instead of the
# plain text that print() produces.
ds1.generate_dataframe().head(5)
# ds2.generate_dataframe().head(5)

  id           isbn                                       title  \
0  0  9780340728567          Managing My Life: My Autobiography   
1  1  9780844627106     I Remember: Sketch for an Autobiography   
2  2  9780712679480              Betty Boothroyd: Autobiography   
3  3  9780725100148  Caddie, A Sydney Barmaid: An Autobiography   
4  4  9780340014684     Nureyev: An Autobiography With Pictures   

                                 title_norm  \
0         managing my life my autobiography   
1    i remember sketch for an autobiography   
2             betty boothroyd autobiography   
3  caddie a sydney barmaid an autobiography   
4    nureyev an autobiography with pictures   

                                      title_tokens           author  \
0              {life, autobiography, my, managing}    Alex Ferguson   
1    {sketch, for, an, remember, i, autobiography}  Boris Pasternak   
2                {betty, autobiography, boothroyd}  Betty Boothroyd   
3  {an, a, caddie, barmaid, sy

### Task 1.2. Blocking