developed by Patricia Klinger, modified by Sebastian Gampe

In [1]:
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, RelationExtractor
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation)
from cnt.io import (load_entities_from_db, load_designs, 
                    load_ocre_designs, replace_left_right)
from cnt.train_test import train_test_annotate
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import score_precision_recall, score_accuracy
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec, Doc2Vec)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
import spacy
import datetime
from itertools import product
from tqdm import tqdm_notebook

In [3]:
english_designs = load_designs()
english_designs.head()

  cursor.execute('SELECT @@tx_isolation')


Unnamed: 0,DesignID,DesignEng
0,1,Diademed head of deified Alexander the Great w...
1,2,Diademed head of deified Alexander the Great w...
2,3,"Altar entwined by serpent, head to left."
3,4,"Altar entwined by serpent, head to right."
4,5,"Altar on two levels, flaming."


In [4]:
import yaml
import_path = "../coin_workspace/data/raw/relation_annotation.yaml"
with open(import_path) as f:
    dictionary = yaml.load(f)
    d = {replace_left_right(key): value for key, value in dictionary.items()}

In [5]:
relation_counts = {}
for sentence, relations in d.items():
    for rel in relations:
        rel_name = rel[1]
        if rel_name not in relation_counts:
            relation_counts[rel_name] = 1
        else:
            relation_counts[rel_name] += 1

sorted(relation_counts.items(), key= lambda x: (-x[1], x[0]))

[('holding', 2022),
 ('wearing', 1523),
 ('resting_on', 219),
 ('seated_on', 75),
 ('standing', 63),
 ('drawing', 41),
 ('grasping', 8),
 ('stepping_on', 8),
 ('hurling', 1),
 ('lying', 1)]

In [15]:
X_list = []
y_list = []  # for each design a list of (subj, relation_class_label, obj)
for sentence, relations in d.items():
    X_list.append(sentence)
    list_of_annotations = []
    y_list.append(list_of_annotations)
    for rel in relations:
        if rel[1] != "": 
            list_of_annotations.append((rel[0], "PERSON", rel[1], rel[2], "OBJECT"))
y_list         

[[('Artemis', 'PERSON', 'wearing', 'boots', 'OBJECT'),
  ('Artemis', 'PERSON', 'holding', 'bow', 'OBJECT'),
  ('Artemis', 'PERSON', 'drawing', 'arrow', 'OBJECT'),
  ('Artemis', 'PERSON', 'wearing', 'chiton', 'OBJECT')],
 [('Athena', 'PERSON', 'wearing', 'helmet', 'OBJECT'),
  ('Athena', 'PERSON', 'holding', 'shield', 'OBJECT')],
 [('Oiskos', 'PERSON', 'holding', 'grape vine', 'OBJECT'),
  ('Oiskos', 'PERSON', 'resting_on', 'water-urn', 'OBJECT')],
 [],
 [('Genius', 'PERSON', 'holding', 'patera', 'OBJECT'),
  ('Genius', 'PERSON', 'holding', 'cornucopia', 'OBJECT')],
 [('Hygieia', 'PERSON', 'holding', 'patera', 'OBJECT')],
 [('Euridice', 'PERSON', 'wearing', 'himation', 'OBJECT'),
  ('Euridice', 'PERSON', 'wearing', 'veil', 'OBJECT'),
  ('Orpheus', 'PERSON', 'holding', 'figurine', 'OBJECT'),
  ('Hermes', 'PERSON', 'holding', 'Euridice', 'OBJECT'),
  ('Hermes', 'PERSON', 'holding', 'chlamys', 'OBJECT')],
 [('Euridice', 'PERSON', 'wearing', 'himation', 'OBJECT'),
  ('Euridice', 'PERSON', '

In [16]:
X = pd.DataFrame({"DesignEng": X_list, "y" : y_list})
X = english_designs.merge(X)
X.head()

Unnamed: 0,DesignID,DesignEng,y
0,10,"Draped bust of (youthful) Anchialos to right, ...","[(Anchialos, PERSON, wearing, taenia, OBJECT)]"
1,23,"Bare head of Antoninus Pius to right, with tra...",[]
2,24,"Bare-headed bust of Antoninus Pius to right, w...","[(Antoninus Pius, PERSON, wearing, cuirass, OB..."
3,25,Draped bust of Antoninus Pius to right.,[]
4,26,Laureate and draped bust of Antoninus Pius to ...,[]


In [8]:
classifier = LogisticRegression()
string_converter = Path2Str(pos=True)
vectorizer = CountVectorizer(ngram_range=(1,3))
feature = make_pipeline(string_converter, vectorizer)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X[["DesignID", "DesignEng"]], X[["DesignID", "y"]], test_size=0.25)

In [10]:
inner_pipeline = make_pipeline(feature, classifier)
pipeline = make_pipeline(NERTransformer(),
                         FeatureExtractor(),
                         RelationExtractor(inner_pipeline))
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

  cursor.execute('SELECT @@tx_isolation')


In [11]:
y_pred.head()

Unnamed: 0,DesignID,y
1010,1934,"[(Dionysus, PERSON, wearing, chiton, OBJECT), ..."
942,1824,"[(Zeus, PERSON, holding, patera, OBJECT), (Zeu..."
1183,2433,"[(Strymon, PERSON, resting_on, rock, OBJECT), ..."
963,1854,"[(Isis, PERSON, wearing, headdress, OBJECT), (..."
1422,2811,[]


In [12]:
cnt_pipeline_output = pd.DataFrame([(str(designid), *relation) for  _, (designid, relation_list) in y_pred.iterrows()
                    for relation in relation_list],
            columns=["DesignID", "Person", "Label_Person", "Relation", "Object", 
                     "Label_Object"])
cnt_pipeline_output.to_sql("cnt_pipeline_output", 
                           "mysql://cnt:rJnW6m7kZR@localhost:3306/thrakien_cnt", 
                           if_exists="replace", index=False)

  cursor.execute('SELECT @@tx_isolation')


In [13]:
precision, recall = score_precision_recall(y_test, y_pred)
F1 = (2*precision*recall) / (precision + recall)

In [14]:
precision

0.9066193853427896

In [15]:
recall

0.8391684901531729

In [16]:
F1

0.8715909090909091

In [17]:
ocre_designs = load_ocre_designs()
preprocessed_ocre_designs = replace_left_right(ocre_designs)

  cursor.execute('SELECT @@tx_isolation')


In [18]:
ocre_designs.head()

Unnamed: 0,DesignID,DesignEng
0,http://numismatics.org/ocre/id/ric.3.ant.868#o...,"Head of Antoninus Pius, laureate, right"
1,http://numismatics.org/ocre/id/ric.3.ant.1206A...,"Head of Antoninus Pius, laureate, right"
2,http://numismatics.org/ocre/id/ric.3.ant.1206B...,"Head of Antoninus Pius, laureate, right"
3,http://numismatics.org/ocre/id/ric.3.ant.1208A...,"Head of Antoninus Pius, laureate, right"
4,http://numismatics.org/ocre/id/ric.3.ant.1208B...,"Head of Antoninus Pius, laureate, right"


In [19]:
preprocessed_ocre_designs.head()

Unnamed: 0,DesignID,DesignEng
0,http://numismatics.org/ocre/id/ric.3.ant.868#o...,"Head of Antoninus Pius, laureate, right."
1,http://numismatics.org/ocre/id/ric.3.ant.1206A...,"Head of Antoninus Pius, laureate, right."
2,http://numismatics.org/ocre/id/ric.3.ant.1206B...,"Head of Antoninus Pius, laureate, right."
3,http://numismatics.org/ocre/id/ric.3.ant.1208A...,"Head of Antoninus Pius, laureate, right."
4,http://numismatics.org/ocre/id/ric.3.ant.1208B...,"Head of Antoninus Pius, laureate, right."


In [20]:
ocre_pred = pipeline.predict(preprocessed_ocre_designs)

In [21]:
ocre_pred

Unnamed: 0,DesignID,y
0,http://numismatics.org/ocre/id/ric.3.ant.868#o...,[]
1,http://numismatics.org/ocre/id/ric.3.ant.1206A...,[]
2,http://numismatics.org/ocre/id/ric.3.ant.1206B...,[]
3,http://numismatics.org/ocre/id/ric.3.ant.1208A...,[]
4,http://numismatics.org/ocre/id/ric.3.ant.1208B...,[]
5,http://numismatics.org/ocre/id/ric.3.ant.1211#...,[]
6,http://numismatics.org/ocre/id/ric.3.ant.1212#...,[]
7,http://numismatics.org/ocre/id/ric.3.ant.1213#...,[]
8,http://numismatics.org/ocre/id/ric.3.ant.1214#...,[]
9,http://numismatics.org/ocre/id/ric.3.ant.1215#...,[]


In [22]:
ocre_pipeline_output = pd.DataFrame([(str(designid), *relation) for  _, (designid, relation_list) in ocre_pred.iterrows()
                    for relation in relation_list],
            columns=["DesignID", "Person", "Label_Person", "Relation", "Object", 
                     "Label_Object"])
ocre_pipeline_output.to_sql("ocre_pipeline_output", 
                            "mysql://cnt:rJnW6m7kZR@localhost:3306/thrakien_cnt", 
                            if_exists="replace", index=False)

  cursor.execute('SELECT @@tx_isolation')


In [23]:
cnt_designs = load_designs()
cnt_designs.head()
cnt_pred = pipeline.predict(cnt_designs)
cnt_pipeline_output = pd.DataFrame([(str(designid), *relation) for  _, (designid, relation_list) in cnt_pred.iterrows()
                    for relation in relation_list],
            columns=["DesignID", "Person", "Label_Person", "Relation", "Object", 
                     "Label_Object"])
cnt_pipeline_output.to_sql("cnt_pipeline_output", 
                           "mysql://cnt:rJnW6m7kZR@localhost:3306/thrakien_cnt", 
                           if_exists="replace", index=False)

  cursor.execute('SELECT @@tx_isolation')
