In [2]:
import json

relations = json.load(open('relations_representations.json'))
relations_middle = json.load(open('relations_representations_middle.json'))
relations_examples_text = json.load(open('relations_representations_text.json'))

In [3]:
relations.keys()

dict_keys(['adjoins', 'author', 'capital', 'contains', 'film_performance', 'founders', 'genre', 'has_sibling', 'has_spouse', 'is_a', 'nationality', 'parents', 'place_of_birth', 'place_of_death', 'profession', 'worked_at'])

In [4]:
def print_number_observations(relations_examples_dict:dict):
    print('Number of observations for relation:\n')
    for rel in relations_examples_dict:
        i = 0
        for ex in relations_examples_dict[rel]:
            i += len(relations_examples_dict[rel][ex])
        print(f'- {rel}: {i}')
        
print_number_observations(relations)

Number of observations for relation:

- adjoins: 1414
- author: 1450
- capital: 298
- contains: 7123
- film_performance: 2074
- founders: 1023
- genre: 297
- has_sibling: 1653
- has_spouse: 1986
- is_a: 1243
- nationality: 940
- parents: 885
- place_of_birth: 692
- place_of_death: 514
- profession: 676
- worked_at: 829


In [5]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

def _generate_data_for_relation(relations_examples_dict:dict):
    X = []
    Y = []
    for rel in relations_examples_dict:
        for ex in relations_examples_dict[rel]:
            X.extend(relations_examples_dict[rel][ex])
            Y.append(rel)
    X = np.array(X)
    Y = np.array(Y)
    return X, Y

def generate_splits(X:np.array, y:np.array, test_size=float):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    lb = preprocessing.LabelBinarizer()
    y_train_bin = lb.fit_transform(y_train)
    y_test_bin = lb.fit_transform(y_test)
    return X_train, X_test, y_train_bin, y_test_bin, lb.classes_

def _generate_logic_features(relations_examples_dict_text:dict):
    X_text = []
    for key in relations_examples_dict_text:
        text = list(map(lambda x:relations_examples_dict_text[key][x], 
                        relations_examples_dict_text[key].keys()))
        X_text.extend(text)

    X_logic = []
    for i in range(len(X_text)):
        and_ = 0
        or_ = 0
        but_ = 0
        for j in range(len(X_text[i])):
            if X_text[i][j] == 'and':
                and_ += 1
            elif X_text[i][j] == 'or':
                or_ += 1
            elif X_text[i][j] == 'not':
                but_ += 1
        X_logic.append([and_, or_, but_])
        
    return np.array(X_logic)

def _generate_random_features(relations_examples_dict_text:dict):
    X_text = []
    for key in relations_examples_dict_text:
        text = list(map(lambda x:relations_examples_dict_text[key][x], 
                        relations_examples_dict_text[key].keys()))
        X_text.extend(text)

    X_logic = []
    for i in range(len(X_text)):
        and_ = 0
        or_ = 0
        but_ = 0
        for j in range(len(X_text[i])):
            if X_text[i][j] == ',':
                and_ += 1
            elif X_text[i][j] == 'km':
                or_ += 1
            elif X_text[i][j] == 'She':
                but_ += 1
        X_logic.append([and_, or_, but_])
        
    return np.array(X_logic)

In [6]:
all_text = [relations_examples_text[i][j] for i in relations_examples_text
            for j in relations_examples_text[i]]

In [7]:
from nltk import word_tokenize

tokens = []
for i in all_text:
    for j in i:
        tokens.extend(word_tokenize(j))
        
from nltk import FreqDist

frequencies = FreqDist(tokens)

In [8]:
print(frequencies['and'])
print(frequencies['or'])
print(frequencies['not'])

27290
472
119


In [362]:
frequencies.most_common(200)

[(',', 53475),
 ('and', 27290),
 ('the', 18076),
 ('of', 12768),
 ('in', 10781),
 ('.', 6885),
 ('(', 5942),
 ('by', 4897),
 ('to', 4695),
 ('is', 3723),
 (')', 3638),
 ('a', 3329),
 ("'s", 2474),
 ('from', 2437),
 ('was', 2077),
 ('at', 1921),
 ('on', 1766),
 ('as', 1683),
 ('with', 1560),
 ('his', 1551),
 ('It', 1428),
 ('The', 1254),
 ('s', 997),
 ('film', 904),
 ('wife', 890),
 ('north', 873),
 ('located', 871),
 ('[', 858),
 (']', 834),
 (';', 814),
 ('south', 784),
 ('for', 755),
 ('west', 706),
 ('``', 678),
 ('son', 671),
 ('an', 669),
 ('born', 659),
 ('who', 639),
 ('east', 626),
 ('‘', 590),
 ('city', 587),
 ('capital', 583),
 (':', 565),
 ('-', 563),
 ('’', 559),
 ('between', 559),
 ('her', 541),
 ('near', 537),
 ('km', 527),
 ('after', 502),
 ('northern', 487),
 ('or', 472),
 ('which', 470),
 ('brother', 464),
 ('including', 463),
 ('southern', 459),
 ('first', 459),
 ('also', 455),
 ('it', 455),
 ('are', 434),
 ('its', 424),
 ('he', 395),
 ('that', 392),
 ('family', 380),

In [358]:
frequencies.keys()

dict_keys([',', 'Sweden', 'nor', 'Ireland', 'and', '[', '26', ']', 'northern', 'extending', 'south', 'into', 'Athens', 'the', 'United', 'States', 'Italy', 'Kingdom', 'until', 'amnesty', 'of', '1847', '.', 'He', 'then', 'returned', 'to', 'Mediterranean', 'countries', 'like', 'Germany', 'England', 'from', 'Hundred', 'Years', '’', 'War', 'Britain', 'French-speaking', 'world', 'Russia', '67,785', '(', '10.33', '%', ')', 'USA', '63,275', '9.64', 'Belgium', 'Catalonia', 'which', 'lies', 'in', 'eastern', '45', 'production', 'also', 'Portugal', 'Loterías', 'y', 'Apuestas', 'del', 'Estado', 'Poland', 'Greece', 'back', 'Holland', 'loup-garou', 'lycanthropos', 'Benelux', 'Israel', 'accounting', 'for', 'PCF', 'but', 'Liber-T', 'system', 'Telepass', 'Netherlands', 'US', 'China', 'allies', 'with', 'UK', 'Senegal', 'EU', ';', 'on', 'many', 'other', 'places', 'times', 'ETA', 'is', 'considered', 'by', 'Romania', 'i', 'north', 'Switzerland', 'obscure', 'Britonia', 'what', 'now', 'Galicia', 'or', 'edit',

In [22]:
len(relations_middle['adjoins']["('France', 'Spain')"][0])

3072

In [376]:
X, y = _generate_data_for_relation(relations_middle)
X_logic = _generate_logic_features(relations_examples_text)
X_random = _generate_random_features(relations_examples_text)
X_logic_rel = np.append(X, X_logic, axis=1)
X_random_rel = np.append(X, X_random, axis=1)
X_train, X_test, y_train, y_test, classes = generate_splits(X, y, 0.2)
X_train_random, X_test_random, y_train_random, y_test_random, classes = generate_splits(X_random, y, 0.2)
X_train_logic, X_test_logic, y_train_logic, y_test_logic, classes = generate_splits(X_logic, y, 0.2)
X_train_random_rel, X_test_random_rel, y_train_random_rel, y_test_random_rel, classes = generate_splits(X_random_rel, y, 0.2)
X_train_logic_rel, X_test_logic_rel, y_train_logic_rel, y_test_logic_rel, classes = generate_splits(X_logic_rel, y, 0.2)

In [307]:
# from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as logit
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

class Model:
    def __init__(self, train_data, test_data, classes, featurizer=None, parameters=None):
        self.classifier = []
        self.featurizer = featurizer
        self.X_train = train_data[0]
        self.y_train = train_data[1]
        self.X_test = test_data[0]
        self.y_test = test_data[1]
        self.parameters = parameters
        self.predictions = []
        self.metrics = {}
        self.classes = classes
        
    def fit(self, algorithm):
        algorithm = algorithm()
        for i in range(len(self.classes)):
            classifier = algorithm.fit(self.X_train, self.y_train[:,i])
            self.classifier.append(classifier)
            y_predict = classifier.predict(self.X_test)
            mets = metrics.classification_report(y_predict, self.y_test[:,i], output_dict=True)
#            print(metrics.classification_report(y_predict, self.y_test[:,i]))
            self.metrics[self.classes[i]] = mets
        print('Done!')
    
    def generate_metrics(self):
        for i in range(len(self.classes)):
            y_predict = self.classifier[i].predict(self.X_test)
            print(self.classes[i])
            print(np.sum(self.y_test[:,i]))
            mets = metrics.classification_report(y_predict, self.y_test[:,i], output_dict=True)
            print(metrics.classification_report(y_predict, self.y_test[:,i]))
            self.metrics[self.classes[i]] = mets
        print('Done!')

In [377]:
#mdl = Model((X_train, y_train), (X_test, y_test), classes)
mdlLogic = Model((X_train_logic, y_train_logic), (X_test_logic, y_test_logic), classes)
mdlRandom = Model((X_train_logic, y_train_logic), (X_test_logic, y_test_logic), classes)
#mdlRandomRel = Model((X_train_random_rel, y_train_random_rel), (X_test_random_rel, y_test_random_rel), classes)
#mdlLogicRel = Model((X_train_logic_rel, y_train_logic_rel), (X_test_logic_rel, y_test_logic_rel), classes)

In [326]:
mdl.fit(logit)



Done!


In [327]:
print(_precision(mdl))
print(_f1(mdl))
_print_scores(mdl)

0.7202818982191337
0.7435506339543294
adjoins 	 	 precision: 0.748670466395682 	 	 f1: 0.7814925675068181
author 	 	 precision: 0.9059095741566752 	 	 f1: 0.9312601671469949
capital 	 	 precision: 0.5196959617262806 	 	 f1: 0.5228278338036199
contains 	 	 precision: 0.8566609010789885 	 	 f1: 0.8590247509339974
film_performance 	 	 precision: 0.8964526484658006 	 	 f1: 0.9004292681124562
founders 	 	 precision: 0.8475239859053937 	 	 f1: 0.8661451542807475
genre 	 	 precision: 0.6282060641456173 	 	 f1: 0.6699461772067952
has_sibling 	 	 precision: 0.6953350571809813 	 	 f1: 0.7409164792224211
has_spouse 	 	 precision: 0.7284848909766529 	 	 f1: 0.773019773019773
is_a 	 	 precision: 0.6372461480958511 	 	 f1: 0.6460002245942795
nationality 	 	 precision: 0.7178678297199639 	 	 f1: 0.7341388887483622
parents 	 	 precision: 0.8352269699258187 	 	 f1: 0.86565881224858
place_of_birth 	 	 precision: 0.6752228421893577 	 	 f1: 0.7126117780363814
place_of_death 	 	 precision: 0.59726131344867

In [378]:
mdlLogic.fit(logit)

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Done!


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [379]:
print(_precision(mdlLogic))
print(_f1(mdlLogic))
_print_scores(mdlLogic)

0.5037767751737507
0.4898635005990504
adjoins 	 	 precision: 0.5607829288967682 	 	 f1: 0.5915684129223004
author 	 	 precision: 0.5 	 	 f1: 0.48379888268156424
capital 	 	 precision: 0.5 	 	 f1: 0.4967868423918963
contains 	 	 precision: 0.5 	 	 f1: 0.40708418891170434
film_performance 	 	 precision: 0.5 	 	 f1: 0.4755363832444091
founders 	 	 precision: 0.5 	 	 f1: 0.48922056384742946
genre 	 	 precision: 0.5 	 	 f1: 0.49738903394255873
has_sibling 	 	 precision: 0.5 	 	 f1: 0.48246891452895707
has_spouse 	 	 precision: 0.4996454738832427 	 	 f1: 0.4778481012658228
is_a 	 	 precision: 0.5 	 	 f1: 0.4861528194861528
nationality 	 	 precision: 0.5 	 	 f1: 0.49096518289995594
parents 	 	 precision: 0.5 	 	 f1: 0.48910759703638174
place_of_birth 	 	 precision: 0.5 	 	 f1: 0.49230769230769234
place_of_death 	 	 precision: 0.5 	 	 f1: 0.4943635766663019
profession 	 	 precision: 0.5 	 	 f1: 0.4920844327176781
worked_at 	 	 precision: 0.5 	 	 f1: 0.49113338473400153


In [317]:
mdlLogicRel.fit(logit)



Done!


In [324]:
print(_precision(mdlLogicRel))
print(_f1(mdlLogicRel))
_print_scores(mdlLogicRel)

0.7265346025141752
0.748537385240936
adjoins 	 	 precision: 0.7901063337956542 	 	 f1: 0.8171479310407133
author 	 	 precision: 0.9048229293365888 	 	 f1: 0.9253366476822793
capital 	 	 precision: 0.5249306468097532 	 	 f1: 0.5319401445467357
contains 	 	 precision: 0.8740831802224934 	 	 f1: 0.8751889350298847
film_performance 	 	 precision: 0.8983120942427052 	 	 f1: 0.9029599895683187
founders 	 	 precision: 0.8576448479851198 	 	 f1: 0.8787791241434822
genre 	 	 precision: 0.6311663191395962 	 	 f1: 0.6736931321084865
has_sibling 	 	 precision: 0.685835372453061 	 	 f1: 0.7293546910755149
has_spouse 	 	 precision: 0.7452256944444444 	 	 f1: 0.7804408168643358
is_a 	 	 precision: 0.6593226811094101 	 	 f1: 0.6734854482889694
nationality 	 	 precision: 0.6991248559262662 	 	 f1: 0.7162879745692302
parents 	 	 precision: 0.8121513643901703 	 	 f1: 0.8472554427646612
place_of_birth 	 	 precision: 0.678025135120953 	 	 f1: 0.7024190702719657
place_of_death 	 	 precision: 0.5995346381285

In [380]:
mdlRandom.fit(logit)

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Done!


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [381]:
print(_precision(mdlRandom))
print(_f1(mdlRandom))
_print_scores(mdlRandom)

0.5037767751737507
0.4898635005990504
adjoins 	 	 precision: 0.5607829288967682 	 	 f1: 0.5915684129223004
author 	 	 precision: 0.5 	 	 f1: 0.48379888268156424
capital 	 	 precision: 0.5 	 	 f1: 0.4967868423918963
contains 	 	 precision: 0.5 	 	 f1: 0.40708418891170434
film_performance 	 	 precision: 0.5 	 	 f1: 0.4755363832444091
founders 	 	 precision: 0.5 	 	 f1: 0.48922056384742946
genre 	 	 precision: 0.5 	 	 f1: 0.49738903394255873
has_sibling 	 	 precision: 0.5 	 	 f1: 0.48246891452895707
has_spouse 	 	 precision: 0.4996454738832427 	 	 f1: 0.4778481012658228
is_a 	 	 precision: 0.5 	 	 f1: 0.4861528194861528
nationality 	 	 precision: 0.5 	 	 f1: 0.49096518289995594
parents 	 	 precision: 0.5 	 	 f1: 0.48910759703638174
place_of_birth 	 	 precision: 0.5 	 	 f1: 0.49230769230769234
place_of_death 	 	 precision: 0.5 	 	 f1: 0.4943635766663019
profession 	 	 precision: 0.5 	 	 f1: 0.4920844327176781
worked_at 	 	 precision: 0.5 	 	 f1: 0.49113338473400153


In [374]:
mdlRandomRel.fit(logit)



Done!


In [375]:
print(_precision(mdlRandomRel))
print(_f1(mdlRandomRel))
_print_scores(mdlRandomRel)

0.7279009270889634
0.7481967770877074
adjoins 	 	 precision: 0.7938518574407375 	 	 f1: 0.8118238036585341
author 	 	 precision: 0.9318260869565218 	 	 f1: 0.9435319867884031
capital 	 	 precision: 0.5087474172678929 	 	 f1: 0.5109008788719411
contains 	 	 precision: 0.8646658234834299 	 	 f1: 0.8657639533199533
film_performance 	 	 precision: 0.8957833437926834 	 	 f1: 0.9046375519274183
founders 	 	 precision: 0.8354987622268217 	 	 f1: 0.8542546521394518
genre 	 	 precision: 0.6866479416850573 	 	 f1: 0.7310027740523962
has_sibling 	 	 precision: 0.6887305342385834 	 	 f1: 0.7337012824435974
has_spouse 	 	 precision: 0.738873481856104 	 	 f1: 0.7835258902787167
is_a 	 	 precision: 0.6159863420008667 	 	 f1: 0.6350613017382216
nationality 	 	 precision: 0.712183686363183 	 	 f1: 0.7244959307867145
parents 	 	 precision: 0.850565050091864 	 	 f1: 0.8690226556199135
place_of_birth 	 	 precision: 0.6624382647385985 	 	 f1: 0.6861783327911108
place_of_death 	 	 precision: 0.5930013309671

In [301]:
def _precision(model_class):
    return np.mean([model_class.metrics[i]['macro avg']['precision'] for i in model_class.metrics])
print(_precision(mdl))
print(_precision(mdlLogicRel))

0.7240426344543941
0.7345549019266707


In [318]:
def _f1(model_class):
    return np.mean([ model_class.metrics[i]['macro avg']['f1-score'] for i in model_class.metrics])
#print(_f1(mdl))
print(_f1(mdlLogicRel))
print(_f1(mdlRandomRel))

0.748537385240936
0.7510064412983555


In [303]:
def _print_scores(model_class):
    for rel in model_class.metrics:
        print(f"{rel} \t \t precision: {model_class.metrics[rel]['macro avg']['precision']} \t \t f1: {model_class.metrics[rel]['macro avg']['f1-score']}")

In [304]:
_print_scores(mdlLogicRel)

adjoins 	 	 precision: 0.7806691400801091 	 	 f1: 0.7994937963188216
author 	 	 precision: 0.9304257767548907 	 	 f1: 0.9432558522385323
capital 	 	 precision: 0.530828802782619 	 	 f1: 0.5394200290910314
contains 	 	 precision: 0.8678704157174224 	 	 f1: 0.8714264771525284
film_performance 	 	 precision: 0.8949936334356976 	 	 f1: 0.9066504713979988
founders 	 	 precision: 0.8487296241442452 	 	 f1: 0.8564233515798106
genre 	 	 precision: 0.7377024070021883 	 	 f1: 0.7500618687232024
has_sibling 	 	 precision: 0.6883593004769475 	 	 f1: 0.7378838541066144
has_spouse 	 	 precision: 0.7469529092377929 	 	 f1: 0.7928789531308775
is_a 	 	 precision: 0.6507173382173382 	 	 f1: 0.6650531624276159
nationality 	 	 precision: 0.7296214222210426 	 	 f1: 0.7446119235095613
parents 	 	 precision: 0.8657932282750914 	 	 f1: 0.8905345326716294
place_of_birth 	 	 precision: 0.6617548559946417 	 	 f1: 0.6949263489924664
place_of_death 	 	 precision: 0.5806095327675524 	 	 f1: 0.5943290247227092
profe

In [150]:
X, y = _generate_data_for_relation(relations_middle)
X_train, X_test, y_train, y_test, classes = generate_splits(X, y, 0.2)

In [151]:
mdl_middle = Model((X_train, y_train), (X_test, y_test), classes)

In [156]:
from sklearn.ensemble import GradientBoostingClassifier

mdl_middle.fit(GradientBoostingClassifier)

Done!


In [160]:
X_train.shape

(18477, 3072)

In [241]:
#relations_examples_text['adjoins']["('Poland', 'Czech_Republic')"]


In [246]:
xlog = np.array(X_logic)
xlog.shape

(23097, 3)

In [249]:
np.sum(xlog[:,0])

10855

In [244]:
xtext = np.array(X_text)
xtext.shape

(23097,)