# Task 1: Using RLTK to perform Entity Resolution (ER)

In [1]:
!pip install rltk



### Task 1.1. Construct RLTK Datasets

In [8]:
import rltk
import csv
import re
import unicodedata

In [3]:
# Shared CRF tokenizer instance; the record classes below call
# tokenizer.tokenize() on normalized titles to build their token sets.
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

#### Principles for selecting good attributes:
1. Does this attribute distinguish entities?
2. Does this attribute change over time?
3. How often is this attribute missing?

In [4]:
def normalize_text(text):
    """Reduce free text to a lowercase, ASCII, single-spaced comparison key.

    Steps: NFKD-decompose and drop non-ASCII characters, lowercase, spell
    out ``&`` as the word "and", replace every remaining character that is
    not a letter, digit or whitespace with a space, collapse whitespace
    runs and trim the ends.

    Args:
        text: The string to normalize. Any falsy value (None, '') is accepted.

    Returns:
        The normalized string, or '' when ``text`` is falsy.
    """
    if not text:
        return ''

    # Unicode normalization: split accented characters into base + combining
    # mark, then drop everything that is not plain ASCII.
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    text = text.lower()
    # Pad the replacement with spaces so "Q&A" and "Q & A" both normalize to
    # "q and a" instead of the former fusing into the single token "qanda".
    # The extra spaces are collapsed by the whitespace pass below.
    text = text.replace('&', ' and ')
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [14]:
from datetime import datetime, date

# Maps three-letter month abbreviations to the full month names that
# strptime's %B directive accepts.
MONTH_FIX = {
    'jan': 'january', 'feb': 'february', 'mar': 'march',
    'apr': 'april', 'may': 'may', 'jun': 'june',
    'jul': 'july', 'aug': 'august', 'sep': 'september',
    'oct': 'october', 'nov': 'november', 'dec': 'december'
}

def normalize_publish_date(raw_date):
    """Parse a free-form publication date string into a ``datetime.date``.

    The input is trimmed and lowercased and ordinal suffixes ("1st", "28th")
    are stripped. Recognized shapes are:

    * ``mon-yy`` (e.g. "Jun-14"): first day of that month; two-digit years
      below 30 are placed in the 2000s, all others in the 1900s.
    * "August 1 2000", "August 2000", "12/12/14", "12/12/2014", "1966":
      parsed with ``datetime.strptime``; parts missing from the format
      (day, month) take strptime's default of 1.

    Args:
        raw_date: The raw date string. Any falsy value (None, '') is accepted.

    Returns:
        A ``datetime.date``, or None when the input is falsy or matches none
        of the recognized shapes.
    """
    if not raw_date:
        return None

    raw_date = raw_date.strip().lower()

    # remove ordinal suffixes: 1st, 2nd, 3rd, 28th
    raw_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', raw_date)

    # handle formats "Jun-14"
    if re.match(r'^[a-z]{3}-\d{2}$', raw_date):
        month, year = raw_date.split('-')
        month = MONTH_FIX.get(month)
        if month is None:
            # Three letters that are not a month abbreviation (e.g. "abc-14"):
            # report "unparseable" like every other bad input instead of
            # letting strptime raise ValueError.
            return None
        year = int(year)
        year += 2000 if year < 30 else 1900
        return date(year, datetime.strptime(month, '%B').month, 1)

    # try known formats, most specific first
    formats = [
        "%B %d %Y",     # August 1 2000
        "%B %Y",        # August 2000
        "%m/%d/%y",     # 12/12/14
        "%m/%d/%Y",     # 12/12/2014
        "%Y"            # 1966
    ]

    for fmt in formats:
        try:
            parsed = datetime.strptime(raw_date, fmt)
            return parsed.date()
        except ValueError:
            # strptime signals "format does not match" with ValueError
            continue

    return None

In [15]:
class GoodRecord(rltk.Record):
    """RLTK record wrapping one row of the Goodreads CSV.

    Exposes the raw ``ID``, ``ISBN13``, ``Title`` and ``FirstAuthor``
    columns, plus a normalized title, its token set, and the parsed
    ``PublishDate``.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Value of the ``ID`` column (raises KeyError if the column is absent)."""
        row = self.raw_object
        return row['ID']

    @rltk.cached_property
    def isbn(self):
        """Value of the ``ISBN13`` column, or None when the column is absent."""
        row = self.raw_object
        return row.get('ISBN13')

    @rltk.cached_property
    def title(self):
        """Raw ``Title`` column, or '' when the column is absent."""
        row = self.raw_object
        return row.get('Title', '')

    @rltk.cached_property
    def title_norm(self):
        """Title passed through ``normalize_text``."""
        raw_title = self.title
        return normalize_text(raw_title)

    @rltk.cached_property
    def title_tokens(self):
        """Set of tokens produced by the shared tokenizer from ``title_norm``."""
        tokens = tokenizer.tokenize(self.title_norm)
        return set(tokens)

    @rltk.cached_property
    def author(self):
        """Raw ``FirstAuthor`` column, or '' when the column is absent."""
        row = self.raw_object
        return row.get('FirstAuthor', '')

    @rltk.cached_property
    def publication_date(self):
        """``PublishDate`` column parsed by ``normalize_publish_date``."""
        raw_value = self.raw_object.get('PublishDate')
        return normalize_publish_date(raw_value)

    @rltk.cached_property
    def publication_year(self):
        """Year of ``publication_date``, or None when no date was parsed."""
        parsed = self.publication_date
        return parsed.year if parsed else None

In [16]:
class NobleRecord(rltk.Record):
    """RLTK record wrapping one row of the Barnes & Noble CSV.

    Exposes the raw ``ID``, ``ISBN13``, ``Title`` and ``Author1`` columns,
    plus a normalized title, its token set, and the parsed
    ``PublicationDate``.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Value of the ``ID`` column (raises KeyError if the column is absent)."""
        row = self.raw_object
        return row['ID']

    @rltk.cached_property
    def isbn(self):
        """Value of the ``ISBN13`` column, or None when the column is absent."""
        row = self.raw_object
        return row.get('ISBN13')

    @rltk.cached_property
    def title(self):
        """Raw ``Title`` column, or '' when the column is absent."""
        row = self.raw_object
        return row.get('Title', '')

    @rltk.cached_property
    def title_norm(self):
        """Title passed through ``normalize_text``."""
        raw_title = self.title
        return normalize_text(raw_title)

    @rltk.cached_property
    def title_tokens(self):
        """Set of tokens produced by the shared tokenizer from ``title_norm``."""
        tokens = tokenizer.tokenize(self.title_norm)
        return set(tokens)

    @rltk.cached_property
    def author(self):
        """Raw ``Author1`` column, or '' when the column is absent."""
        row = self.raw_object
        return row.get('Author1', '')

    @rltk.cached_property
    def publication_date(self):
        """``PublicationDate`` column parsed by ``normalize_publish_date``."""
        raw_value = self.raw_object.get('PublicationDate')
        return normalize_publish_date(raw_value)

    @rltk.cached_property
    def publication_year(self):
        """Year of ``publication_date``, or None when no date was parsed."""
        parsed = self.publication_date
        return parsed.year if parsed else None

In [17]:
# Input CSV locations. `dir_` is prepended verbatim, so it must end with a
# path separator when it is non-empty.
dir_ = ''
good_file = f'{dir_}goodreads.csv'
noble_file = f'{dir_}barnes_and_nobles.csv'

# One RLTK dataset per source file, each row wrapped in its record class.
ds1 = rltk.Dataset(
    rltk.CSVReader(good_file),
    record_class=GoodRecord,
)
ds2 = rltk.Dataset(
    rltk.CSVReader(noble_file),
    record_class=NobleRecord,
)

In [24]:
# sanity check on dates and year
from collections import Counter
def sanity_check(csv_file, date_column, sample_size=10):
    """Print a summary of how well ``normalize_publish_date`` handles one CSV column.

    Args:
        csv_file: Path of a UTF-8 encoded CSV file with a header row.
        date_column: Name of the column holding the raw date strings.
        sample_size: Maximum number of unparseable values to list.

    Prints the number of parsed, empty and failed cells, the five most
    common parsed years, and up to ``sample_size`` values that failed to
    parse. Returns None.
    """
    parsed = 0
    empty = 0
    failed = 0
    years = Counter()
    failed_examples = []

    with open(csv_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # DictReader fills cells missing from a short row with None, and
            # .get() then returns that None rather than a default, so coerce
            # to '' before stripping to avoid an AttributeError.
            raw = (row.get(date_column) or '').strip()

            if not raw:
                empty += 1
                continue

            parsed_date = normalize_publish_date(raw)
            if parsed_date:
                parsed += 1
                years[parsed_date.year] += 1
            else:
                failed += 1
                # keep only the first few failures as examples
                if len(failed_examples) < sample_size:
                    failed_examples.append(raw)

    print(f"\nFile: {csv_file}")
    print(f"Parsed dates: {parsed}")
    print(f"Empty cells: {empty}")
    print(f"Failed parses: {failed}")

    if years:
        print("Most common years:", years.most_common(5))
    else:
        print("No Examples of most common years:")

    if failed_examples:
        print("Examples of failed values:")
        for v in failed_examples:
            print("  -", v)
    else:
        print("No Examples of failed values.")

# Report date-parsing coverage for the date column of each source file.
sanity_check(good_file, "PublishDate")
sanity_check(noble_file, "PublicationDate")


File: goodreads.csv
Parsed dates: 3400
Empty cells: 564
Failed parses: 3
Most common years: [(2012, 183), (2013, 165), (2011, 155), (2010, 154), (2014, 150)]
Examples of failed values:
  - 30
  - March 10th 11
  - March 10th 11

File: barnes_and_nobles.csv
Parsed dates: 3100
Empty cells: 601
Failed parses: 0
Most common years: [(2015, 456), (2013, 336), (2014, 275), (2012, 245), (2011, 224)]
No Examples of failed values.


In [20]:
# Preview the first five rows of ds2 (built from noble_file); call the same
# chain on ds1 to inspect the Goodreads records instead.
print(ds2.generate_dataframe().head(5))

  id           isbn                                              title  \
0  0  9780984504176          Pioneer Girl: The Annotated Autobiography   
1  1  9780062376336  American Sniper (Movie Tie-in Edition): The Au...   
2  2  9780345350688                     The Autobiography of Malcolm X   
3  3  9781556520747                           Assata: An Autobiography   
4  4  9780876120798                            Autobiography of a Yogi   

                                          title_norm  \
0           pioneer girl the annotated autobiography   
1  american sniper movie tie in edition the autob...   
2                     the autobiography of malcolm x   
3                            assata an autobiography   
4                            autobiography of a yogi   

                                        title_tokens                 author  \
0     {girl, pioneer, the, autobiography, annotated}   Laura Ingalls Wilder   
1  {of, lethal, s, tie, edition, u, most, history...        

### Task 1.2. Blocking