In [1]:
# Load the normalized tables

import pickle
import sys
# Raise the interpreter's recursion limit before unpickling
# (presumably the pickled tables nest deeply -- TODO confirm).
sys.setrecursionlimit(10000)

# NOTE(security): pickle.load can execute arbitrary code while loading;
# only open shard files that this project produced itself.
_loaded_tables = []
for _table_path in (
    "normal_tables1_1.txt",
    "normal_tables1_2.txt",
    "normal_tables2_1.txt",
    "normal_tables2_2.txt",
):
    with open(_table_path, "rb") as fp:
        _loaded_tables.append(pickle.load(fp))

# nt1..nt4 hold the four shards in the order listed above.
nt1, nt2, nt3, nt4 = _loaded_tables

In [6]:
# Total number of articles across the four loaded shards
# (articles containing tables with disambiguated entities).
sum(len(shard) for shard in (nt1, nt2, nt3, nt4))

186

In [7]:
# Display the second shard in full to inspect its structure. The recorded output
# shows: article title -> table id -> {'new_table': ..., 'old_table': ...}.
nt2

{'Ab-Soul discography': {'0': {'new_table': [[[], ['LP record']],
    [['Hypertext Transfer Protocol',
      'World Wide Web',
      'Mixtape',
      'Component Object Model',
      'Ab-Soul'],
     ['Music download']],
    [['Hypertext Transfer Protocol',
      'World Wide Web',
      'These Things Happen (G-Eazy album)',
      'Component Object Model',
      'Ab-Soul'],
     ['Music download',
      'Top Dawg Entertainment',
      'Record label',
      'Download',
      'Audio file format']]],
   'old_table': [['Title', 'Album details'],
    ["''Longterm'' [http://www.dubcnn.com/mixtapes/absoul-longterm/ Longterm - Ab-Soul] <small>",
     'Released: January 8, 2009 Format: [Music download] Label: Top Dawg Entertainment'],
    ["''Longterm 2: Lifestyles of the Broke and Almost Famous'' [http://www.dubcnn.com/mixtapes/absoul-longterm2/ Longterm 2: Lifestyles of the Broke and Almost Famous - Ab-Soul] <small>",
     'Released: June 28, 2010 Format: Free download Label: Top Dawg Entertain

In [None]:
# set up SPARQL endpoint for wikidata
from SPARQLWrapper import SPARQLWrapper, JSON
# Single place to change the Wikidata query service URL.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Shared wrapper object; the query helpers below set their query text on it.
sparql = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT)

In [None]:
# resolve Wikidata entity from title

from wikitables.client import Client

# Shared wikitables client, constructed with "en"
# (presumably the English-language Wikipedia -- TODO confirm).
client = Client("en")

def getWikidata(title):
    """Resolve a title through the shared ``client``.

    Thin wrapper: passes ``title`` to ``client.fetch_wikidata`` and returns
    its result unchanged. The shape of that result is defined by the
    ``wikitables`` library and is not visible in this notebook.
    """
    return client.fetch_wikidata(title)

In [None]:
# Features (as suggested by the authors (Emir Munoz & Aidan Hogan) of Wikitables Triples Paper)
# =============
# Table Features
#     (-) 1 Number of rows
#     (-) 2 Number of columns
#     (-) 3 Total relations extracted
# Column Features
#     (+) 4 Potential relations
#     (+) 5 Unique potential relations
#     (+) 6 Entity relatedness (new)
# Predicate Features
#     (+) 7 Normalized unique subject count / Normalized unique object count
# Cell Features
#     (-) 8 Number of entities in subject cell
#     (-) 9 Number of entities in object cell
#     (-) 10 String length in subject cell
#     (-) 11 String length in object cell
# Predicate/Column Features
#     (+) 12 Maximum between Jaro-Winkler distance and dice coefficient
#     (+) 13 Number of rows where the relation holds
# Where (+) signifies a positive feature & (-) signifies a negative feature

In [None]:
# Predicate Features

# subject and object must be prefixed with "wd:"
def getPredicates(subject,obj):
    """Return the direct-property URIs that link ``subject`` to ``obj`` on Wikidata.

    Both arguments are interpolated verbatim into the SPARQL query, so they
    must already be valid SPARQL terms (e.g. prefixed with ``wd:``).

    The query is sent through the module-level ``sparql`` wrapper, whose query
    text and return format are overwritten by this call.

    Returns:
        A list with the value of every ``?p`` binding whose type is ``"uri"``,
        in the order the endpoint returned them.
    """
    query_template = """SELECT * WHERE
    {
         %s ?p %s .
         FILTER(STRSTARTS(str(?p), "http://www.wikidata.org/prop/direct/"))
         SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
    }"""
    sparql.setQuery(query_template % (subject, obj))
    sparql.setReturnFormat(JSON)

    bindings = sparql.query().convert()["results"]["bindings"]

    # Keep only bindings where ?p is a URI.
    return [
        binding["p"]["value"]
        for binding in bindings
        if binding["p"]["type"] == "uri"
    ]

# (+) 7 Normalized unique subject count / Normalized unique object count

In [None]:
# Predicate/Column Features
# (+) 12 Max of Dice coefficient and Jaro-Winkler distance
from pyjarowinkler import distance

def dice_coefficient(a,b):
    """Return the Dice coefficient of the character bigrams of ``a`` and ``b``.

    Bigrams are compared as multisets: a bigram that occurs twice in both
    inputs counts as two shared bigrams.

    Returns:
        ``0.0`` if either input is empty, ``1.0`` if the inputs are equal,
        ``0.0`` if they differ and either has length 1, otherwise
        ``2 * shared / (bigrams in a + bigrams in b)`` as a float.
    """
    # Empty input on either side never matches.
    if len(a) == 0 or len(b) == 0:
        return 0.0
    # Quick case for true duplicates.
    if a == b:
        return 1.0
    # A single character has no bigram, so unequal inputs cannot overlap.
    if 1 in (len(a), len(b)):
        return 0.0

    bigrams_a = sorted(a[k:k + 2] for k in range(len(a) - 1))
    bigrams_b = sorted(b[k:k + 2] for k in range(len(b) - 1))
    total = len(bigrams_a) + len(bigrams_b)

    # Walk both sorted lists in step, counting the bigrams they share.
    shared = 0
    pos_a = pos_b = 0
    while pos_a < len(bigrams_a) and pos_b < len(bigrams_b):
        left, right = bigrams_a[pos_a], bigrams_b[pos_b]
        if left < right:
            pos_a += 1
        elif right < left:
            pos_b += 1
        else:
            shared += 1
            pos_a += 1
            pos_b += 1

    return float(2 * shared) / float(total)

def feature11(string1, string2):
    """Return the larger of two string-similarity scores for the given strings.

    The scores are ``distance.get_jaro_distance(..., winkler=True, scaling=0.1)``
    and ``dice_coefficient``. The notebook's feature list numbers this
    feature 12, although the function is named ``feature11``.
    """
    jaro_winkler_score = distance.get_jaro_distance(string1, string2, winkler=True, scaling=0.1)
    dice_score = dice_coefficient(string1, string2)
    return max(jaro_winkler_score, dice_score)

# (+) 13 No of rows that contain the subject and object
#     (numbered 13 in the feature list above; implemented by feature12)
def feature12(predicate):
    """Count the Wikidata statements that use ``predicate`` between two entities.

    Only statements whose subject and object are both
    ``http://www.wikidata.org/entity/`` IRIs are counted.

    The count is computed on the endpoint with ``COUNT(*)``, so a single row
    is transferred instead of every matching ``(?s, ?o)`` pair.

    Args:
        predicate: SPARQL term for the property. It is interpolated verbatim
            into the query, so it must already be valid SPARQL (presumably a
            prefixed name such as ``wdt:P31`` -- TODO confirm against callers).

    Returns:
        The number of matching statements as an ``int``.
    """
    # One-element tuple so a tuple-valued argument cannot be unpacked by ``%``.
    sparql.setQuery("""SELECT (COUNT(*) AS ?count) WHERE
    {
         ?s %s ?o .
         FILTER(STRSTARTS(str(?s), "http://www.wikidata.org/entity/"))
         FILTER(STRSTARTS(str(?o), "http://www.wikidata.org/entity/"))
    }""" % (predicate,))

    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results["results"]["bindings"]
    # An aggregate without GROUP BY yields one row; guard anyway so an empty
    # response is reported as zero instead of raising IndexError.
    if not bindings:
        return 0
    return int(bindings[0]["count"]["value"])

In [None]:
# functions for entity relatedness using API call
import tagme

# Read the TagMe API token from the first line of the local file "tagme"
# (surrounding whitespace stripped).
with open("tagme", 'r') as file:
    token = file.readline().strip()
    
# Hand the token to the tagme library via its module-level GCUBE_TOKEN setting.
tagme.GCUBE_TOKEN = token

# Get relatedness between a pair of entities specified by title.
# rels = tagme.relatedness_title(("Barack Obama", "Italy"))
# print("Obama and italy have a semantic relation of", rels.relatedness[0].rel)

In [None]:
import csv

# now that table has been recreated with only disambiguated entities
# let the magic happen
# extract all the triples and features

def addTripleCSV(d, file, mode):
    """Write one triple row to a CSV file.

    Args:
        d: Mapping with the keys ``id``, ``subject``, ``predicate`` and
            ``object``; written as a single CSV row.
        file: Path of the CSV file to open.
        mode: Mode passed to ``open``. The header row is written first only
            when this is exactly ``'w'``.
    """
    columns = ['id', 'subject', 'predicate','object']
    starting_fresh = (mode == 'w')

    with open(file, mode, newline='') as out:
        triple_writer = csv.DictWriter(out, fieldnames=columns)

        # Emit the header only when the file is being (re)created.
        if starting_fresh:
            triple_writer.writeheader()

        triple_writer.writerow(d)