In [1]:
import rltk

In [2]:
import pandas as pd

In [3]:
# pandas 1.3 introduced `on_bad_lines`, and pandas 2.0 removed the older
# `error_bad_lines` / `warn_bad_lines` pair, so pass whichever spelling the
# installed version accepts. Both forms skip malformed rows without warning.
if tuple(int(part) for part in pd.__version__.split('.')[:2]) >= (1, 3):
    bad_line_kwargs = {'on_bad_lines': 'skip'}
else:
    bad_line_kwargs = {'error_bad_lines': False, 'warn_bad_lines': False}

# Fields are separated by ", " (comma + space); a multi-character separator
# needs the Python parsing engine. Every column is kept as a plain string and
# empty cells stay "" instead of becoming NaN.
imdb_df = pd.read_csv('imdb.csv', sep=', ', encoding='utf-8', engine='python',
                      dtype=str, na_filter=False, **bad_line_kwargs).fillna("")

In [4]:
# pandas 1.3 introduced `on_bad_lines`, and pandas 2.0 removed the older
# `error_bad_lines` / `warn_bad_lines` pair, so pass whichever spelling the
# installed version accepts. Both forms skip malformed rows without warning.
if tuple(int(part) for part in pd.__version__.split('.')[:2]) >= (1, 3):
    bad_line_kwargs = {'on_bad_lines': 'skip'}
else:
    bad_line_kwargs = {'error_bad_lines': False, 'warn_bad_lines': False}

# Every column is kept as a plain string and empty cells stay "" instead of
# becoming NaN.
tmd_df = pd.read_csv('tmd.csv', sep=',', encoding='utf-8',
                     dtype=str, na_filter=False, **bad_line_kwargs).fillna("")

In [5]:
# pandas 1.3 introduced `on_bad_lines`, and pandas 2.0 removed the older
# `error_bad_lines` / `warn_bad_lines` pair, so pass whichever spelling the
# installed version accepts. Both forms skip malformed rows without warning.
if tuple(int(part) for part in pd.__version__.split('.')[:2]) >= (1, 3):
    bad_line_kwargs = {'on_bad_lines': 'skip'}
else:
    bad_line_kwargs = {'error_bad_lines': False, 'warn_bad_lines': False}

# Lines starting with '#' in the labeled file are treated as comments. Every
# column (including class_label) is kept as a plain string.
label_df = pd.read_csv('labeled_data.csv', sep=',', encoding='utf-8', comment='#',
                       dtype=str, na_filter=False, **bad_line_kwargs).fillna("")

#### Choose at least 3 attributes to make dataset

In [6]:
class IMDBRecord(rltk.Record):
    """RLTK record exposing the IMDB columns used for blocking and scoring."""

    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    def _split_column(self, column):
        """Return the entries of a multi-valued column, which are joined by '; '."""
        joined = self.raw_object[column]
        return joined.split('; ')

    @rltk.cached_property
    def id(self):
        """The `ID` column with leading zeros removed."""
        raw_id = self.raw_object['ID']
        return raw_id.lstrip('0')

    @rltk.cached_property
    def title(self):
        """The movie title, taken verbatim from the `name` column."""
        return self.raw_object['name']

    @rltk.cached_property
    def year(self):
        """The `year` column, returned as the raw string."""
        return self.raw_object['year']

    @rltk.cached_property
    def directors(self):
        """List of entries from the `director` column."""
        return self._split_column('director')

    @rltk.cached_property
    def writers(self):
        """List of entries from the `writers` column."""
        return self._split_column('writers')

    @rltk.cached_property
    def actors(self):
        """List of entries from the `actors` column."""
        return self._split_column('actors')

In [7]:
# Build the RLTK dataset over imdb_df, using IMDBRecord as the record class.
imdb_ds = rltk.Dataset(rltk.DataFrameReader(imdb_df), record_class=IMDBRecord)

In [8]:
class TMDRecord(rltk.Record):
    """RLTK record exposing the TMD columns used for blocking and scoring.

    Scalar text columns (`title`, `year`) are whitespace-trimmed, and the
    entries of the multi-valued people columns are trimmed the same way so a
    value such as "A; B" yields ['A', 'B'] rather than ['A', ' B'].
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @staticmethod
    def _split_names(joined):
        """Split a ';'-separated cell into whitespace-trimmed entries.

        An empty cell still yields a one-element list (['']), exactly as a
        plain `str.split` would, so callers can rely on a non-empty result.
        """
        return [entry.strip() for entry in joined.split(';')]

    @rltk.cached_property
    def id(self):
        """The `ID` column, returned as the raw string."""
        return self.raw_object['ID']

    @rltk.cached_property
    def title(self):
        """The `title` column with surrounding whitespace removed."""
        return self.raw_object['title'].strip()

    @rltk.cached_property
    def year(self):
        """The `year` column with surrounding whitespace removed."""
        return self.raw_object['year'].strip()

    @rltk.cached_property
    def directors(self):
        """Trimmed entries of the `director(s)` column."""
        return self._split_names(self.raw_object['director(s)'])

    @rltk.cached_property
    def writers(self):
        """Trimmed entries of the `writer(s)` column."""
        return self._split_names(self.raw_object['writer(s)'])

    @rltk.cached_property
    def actors(self):
        """Trimmed entries of the `actor(s)` column."""
        return self._split_names(self.raw_object['actor(s)'])

In [9]:
# Build the RLTK dataset over tmd_df, using TMDRecord as the record class.
tmd_ds = rltk.Dataset(rltk.DataFrameReader(tmd_df), record_class=TMDRecord)

#### Design a blocking technique and save output to file

In [10]:
import csv

In [11]:
# Block each dataset on its `year` property, then combine the two per-dataset
# blocks into the single block object that the pair generators below consume.
bg = rltk.HashBlockGenerator()
imdb_year_blocks = bg.block(imdb_ds, property_='year')
tmd_year_blocks = bg.block(tmd_ds, property_='year')
block = bg.generate(imdb_year_blocks, tmd_year_blocks)

In [12]:
# Size of the unblocked comparison space: every IMDB row paired with every TMD row.
total_pairs = len(imdb_df)*len(tmd_df)
print(total_pairs)

97286562


In [13]:
# Keep only the rows labeled as matches. The labels were loaded as strings,
# so the positive class is the string '1'. A boolean mask does this in one
# vectorized pass instead of indexing the frame row by row.
positive_rows = label_df[label_df['class_label'] == '1']

true_matches_count = len(positive_rows)
# (IMDB id, TMD id) tuples in the same row order as the labeled file.
true_matches = list(zip(positive_rows['ltable.ID'], positive_rows['rtable.ID']))

print(true_matches_count)

256


In [14]:
cmp_pairs = 0   # every candidate pair produced by blocking
pair_count = 0  # candidate pairs that are labeled true matches

# A set makes the per-pair ground-truth check constant-time; the check runs
# once for every candidate pair, so a list scan here is needlessly slow.
true_match_lookup = set(true_matches)

# newline='' is what the csv module expects of files handed to csv.writer;
# without it, platforms that translate '\n' emit a blank line after every row.
with open('Jimi_Cao_hw03_blocked.csv', 'w', newline='') as fp:
    blocking_writer = csv.writer(fp)

    for imdb_rec, tmd_rec in rltk.get_record_pairs(imdb_ds, tmd_ds, block=block):
        cmp_pairs += 1

        # Pairs where either id is empty are counted above but never exported.
        pair = (imdb_rec.id, tmd_rec.id)
        if all(pair):
            blocking_writer.writerow(pair)
            if pair in true_match_lookup:
                pair_count += 1

print(pair_count)
print(cmp_pairs)

254
3539061


###### Reduction Ratio

In [15]:
# Reduction ratio: fraction of the full comparison space removed by blocking.
red_ratio = 1-(cmp_pairs/total_pairs)
print(red_ratio)

0.9636223037668861


###### Pairs completeness

In [16]:
# Pairs completeness: labeled true matches that survived blocking, divided by
# all labeled true matches.
pair_complete = pair_count/true_matches_count
print(pair_complete)

0.9921875


###### Define scoring function

In [17]:
def score(imdb_rec, tmd_rec, title_weight=0.7, director_weight=0.3, threshold=0.9):
    """Score one IMDB/TMD record pair and decide whether it is a match.

    The parameters were previously misspelled (`imd_rec`, `tmd_redc`) while the
    body read `imdb_rec` / `tmd_rec`, so the function silently used whatever
    notebook globals carried those names instead of its own arguments.

    Args:
        imdb_rec: record exposing `title` and `directors` (an IMDBRecord).
        tmd_rec: record exposing `title` and `directors` (a TMDRecord).
        title_weight: weight of the title similarity in the confidence.
        director_weight: weight of the director similarity in the confidence.
        threshold: the pair is a match when the confidence is strictly greater.

    Returns:
        A `(match, conf)` tuple where `match` is 1 or 0 and `conf` is the
        weighted confidence.
    """
    title_score = rltk.jaro_winkler_similarity(imdb_rec.title, tmd_rec.title)

    # For each IMDB director keep the highest rltk.jaro_distance value against
    # any TMD director, then average those best values.
    best_per_director = [
        max(
            (rltk.jaro_distance(imdb_director, tmd_director)
             for tmd_director in tmd_rec.directors),
            default=0.0,  # no TMD directors to compare against
        )
        for imdb_director in imdb_rec.directors
    ]
    if best_per_director:
        director_score = sum(best_per_director) / len(best_per_director)
    else:
        director_score = 0.0  # no IMDB directors: avoid dividing by zero

    conf = title_weight * title_score + director_weight * director_score
    match = 1 if conf > threshold else 0

    return match, conf
    

###### Predict and export

In [18]:
matches = 0  # number of candidate pairs predicted as matches

# newline='' is what the csv module expects of files handed to csv.writer;
# without it, platforms that translate '\n' emit a blank line after every row.
with open('Jimi_Cao_hw03_el.csv', 'w', newline='') as fp:
    predict_writer = csv.writer(fp)
    # Score every candidate pair produced by blocking and export the decision.
    for imdb_rec, tmd_rec in rltk.get_record_pairs(imdb_ds, tmd_ds, block=block):
        match, conf = score(imdb_rec, tmd_rec)
        predict_writer.writerow([imdb_rec.id, tmd_rec.id, match])
        matches += match
print(matches)

5989


###### Record linkage on Labeled Data, predict and export

In [19]:
gt = rltk.GroundTruth()

# Walk the three label columns in lockstep; rows whose class_label is the
# string '1' are registered as positives, every other row as a negative.
labeled_rows = zip(label_df['ltable.ID'], label_df['rtable.ID'], label_df['class_label'])
for imdb_id, tmd_id, class_label in labeled_rows:
    register = gt.add_positive if class_label == '1' else gt.add_negative
    register(imdb_id, tmd_id)


In [20]:
trial = rltk.Trial(gt)
count = 0  # number of labeled pairs that were scored

# newline='' is what the csv module expects of files handed to csv.writer;
# without it, platforms that translate '\n' emit a blank line after every row.
with open('Jimi_Cao_hw03_el_labeled.csv', 'w', newline='') as fp:
    predict_writer = csv.writer(fp)
    # Only the pairs present in the ground truth are scored here, and each
    # prediction is also recorded in the trial for later evaluation.
    for imdb_rec, tmd_rec in rltk.get_record_pairs(imdb_ds, tmd_ds, ground_truth=gt):
        count += 1
        match, conf = score(imdb_rec, tmd_rec)
        predict_writer.writerow([imdb_rec.id, tmd_rec.id, match])
        trial.add_result(imdb_rec, tmd_rec, match, conf)

print(count)

400


###### Report precision, recall, and F1-score

In [21]:
# Evaluate the recorded predictions against the ground truth, then print each
# metric as "<label> <value>".
trial.evaluate()
print('Trial statistics based on Ground-Truth from labeled data:')
for label, attribute in (('precision:', 'precision'),
                         ('recall:', 'recall'),
                         ('f-measure:', 'f_measure')):
    print(label, getattr(trial, attribute))

Trial statistics based on Ground-Truth from labeled data:
precision: 0.973384030418251
recall: 1.0
f-measure: 0.9865125240847785


###### Create ttl

In [29]:
from rdflib import Graph, URIRef, Literal, XSD, Namespace, RDF, BNode

In [30]:
# SCHEMA supplies the schema.org terms used below (Movie, Person, name,
# director, actor); MYNS holds the movie URIs and the custom releaseYear property.
SCHEMA = Namespace('https://schema.org/')
MYNS = Namespace('http://inf558.org/myfakenamespace#')

In [31]:
my_kg = Graph()

my_kg.bind('myns', MYNS)
my_kg.bind('schema', SCHEMA)

# `count` numbers the movie URIs. It stays a notebook-level variable because
# the TMD cell further down keeps counting from the last IMDB movie; the
# explicit 0 covers an empty dataset, where the loop never assigns it.
count = 0
for count, record in enumerate(imdb_ds, start=1):
    movie_uri = URIRef(MYNS[str(count)])

    my_kg.add((movie_uri, RDF.type, SCHEMA['Movie']))
    my_kg.add((movie_uri, SCHEMA['name'], Literal(record.title)))

    # Type the year as xsd:integer only when it really is one. An empty year
    # is skipped, and any other value is kept as a plain literal rather than
    # emitted as an ill-typed integer.
    release_year = record.year.strip()
    if release_year.isdigit():
        my_kg.add((movie_uri, MYNS['releaseYear'], Literal(release_year, datatype=XSD.integer)))
    elif release_year:
        my_kg.add((movie_uri, MYNS['releaseYear'], Literal(release_year)))

    # Directors and actors are modeled identically: one blank-node Person per
    # name, linked from the movie through the role's predicate.
    for role, people in ((SCHEMA['director'], record.directors),
                         (SCHEMA['actor'], record.actors)):
        for person_name in people:
            person_name = person_name.strip()
            if not person_name:
                # An empty cell splits into [''], which would otherwise
                # create a nameless Person node.
                continue
            bnode = BNode()
            my_kg.add((bnode, RDF.type, SCHEMA['Person']))
            my_kg.add((bnode, SCHEMA['name'], Literal(person_name)))
            my_kg.add((movie_uri, role, bnode))


In [32]:
# The prediction file is written without a header row, so the column names
# are supplied here.
pairs = pd.read_csv('Jimi_Cao_hw03_el.csv', names=['IMDB_id', 'TMD_id', 'match'])

# Distinct TMD ids that appear in at least one pair predicted as a match.
is_match = pairs['match'] == 1
tmd_matches = pairs.loc[is_match, 'TMD_id'].unique()

In [33]:
# Add only the TMD movies that were NOT linked to an IMDB movie; `count`
# continues from the last URI number assigned in the IMDB cell.
for record in tmd_ds:
    if int(record.id) in tmd_matches:
        # Skip TMD records that were predicted as a match for an IMDB record.
        continue

    count += 1
    movie_uri = URIRef(MYNS[str(count)])

    my_kg.add((movie_uri, RDF.type, SCHEMA['Movie']))
    my_kg.add((movie_uri, SCHEMA['name'], Literal(record.title)))

    # Type the year as xsd:integer only when it really is one. An empty year
    # is skipped, and any other value is kept as a plain literal rather than
    # emitted as an ill-typed integer.
    release_year = record.year.strip()
    if release_year.isdigit():
        my_kg.add((movie_uri, MYNS['releaseYear'], Literal(release_year, datatype=XSD.integer)))
    elif release_year:
        my_kg.add((movie_uri, MYNS['releaseYear'], Literal(release_year)))

    # Directors and actors are modeled identically: one blank-node Person per
    # name, linked from the movie through the role's predicate.
    for role, people in ((SCHEMA['director'], record.directors),
                         (SCHEMA['actor'], record.actors)):
        for person_name in people:
            person_name = person_name.strip()
            if not person_name:
                # An empty cell splits into [''], which would otherwise
                # create a nameless Person node.
                continue
            bnode = BNode()
            my_kg.add((bnode, RDF.type, SCHEMA['Person']))
            my_kg.add((bnode, SCHEMA['name'], Literal(person_name)))
            my_kg.add((movie_uri, role, bnode))

In [34]:
# Write the finished graph to disk in Turtle syntax.
my_kg.serialize('Jimi_Cao_hw03_triple.ttl', format="turtle")