Developed by Patricia Klinger, modified by Sebastian Gampe.

In [8]:
import pandas as pd
import random
import os
from cnt.model import DesignEstimator
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation)
from cnt.io import (load_entities_from_file, load_entities_from_db,
                    load_ocre_designs)
from cnt.train_test import train_test_annotate
from cnt.extract_relation import path
from cnt.evaluate import score_precision_recall, score_accuracy
from spacy import displacy

In [9]:
# Work on a prefix of the OCRE designs only.
# TODO: raise the limit above 5000 -- does that give better results?
N_DESIGNS = 5000
designs = load_ocre_designs()
english_designs = designs[:N_DESIGNS]
english_designs.head()

  cursor.execute('SELECT @@tx_isolation')


Unnamed: 0,DesignID,DesignEng
0,http://numismatics.org/ocre/id/ric.3.ant.868#o...,"Head of Antoninus Pius, laureate, right"
1,http://numismatics.org/ocre/id/ric.3.ant.1206A...,"Head of Antoninus Pius, laureate, right"
2,http://numismatics.org/ocre/id/ric.3.ant.1206B...,"Head of Antoninus Pius, laureate, right"
3,http://numismatics.org/ocre/id/ric.3.ant.1208A...,"Head of Antoninus Pius, laureate, right"
4,http://numismatics.org/ocre/id/ric.3.ant.1208B...,"Head of Antoninus Pius, laureate, right"


In [10]:
# Build the entity dictionary: key = label, value = entities loaded from the
# corresponding database table.
#
# The connection URL contains the database password, so it is read from the
# environment instead of being stored in the notebook, e.g.
#   export CNT_MYSQL_URL="mysql://<user>:<password>@localhost:3306/thrakien_cnt"
mysql_connection = os.environ.get("CNT_MYSQL_URL")
if not mysql_connection:
    raise RuntimeError(
        "Environment variable CNT_MYSQL_URL is not set; expected something like "
        "mysql://<user>:<password>@localhost:3306/thrakien_cnt")

# One lookup table per entity label.
entity_tables = {
    "PERSON": "nlp_list_person",
    "OBJECT": "nlp_list_obj",
    "ANIMAL": "nlp_list_animal",
    "PLANT": "nlp_list_plant",
}
entities = {
    label: load_entities_from_db(table, mysql_connection)
    for label, table in entity_tables.items()
}
# Sanity check: peek at the first few PERSON entities.
entities["PERSON"][:3]

  cursor.execute('SELECT @@tx_isolation')


['Agrippina minor', 'Agrippina maior', 'Alexander III']

In [11]:
# Annotate every design with the known entities, then keep only the designs
# in which at least one entity was found.
all_annotated = annotate_designs(entities, english_designs)
has_annotation = all_annotated.annotations.map(len) > 0
annotated_designs = all_annotated[has_annotation]
annotated_designs.head()

Unnamed: 0,DesignEng,DesignID,annotations
0,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.868#o...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
1,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.1206A...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
2,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.1206B...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
3,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.1208A...,"[(0, 4, OBJECT), (8, 22, PERSON)]"
4,"Head of Antoninus Pius, laureate, right",http://numismatics.org/ocre/id/ric.3.ant.1208B...,"[(0, 4, OBJECT), (8, 22, PERSON)]"


In [12]:
# Hold out 25% of the annotated designs for evaluation (sklearn split).
# The split is seeded so the same rows land in train and test on every run;
# without a seed each re-run shuffles differently and the scores drift.
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    annotated_designs[["DesignID", "DesignEng"]],
    annotated_designs[["DesignID", "annotations"]],
    test_size=0.25,
    random_state=SPLIT_SEED)

In [13]:
# Fit the estimator on the training split.
n_rep = 3
my_estimator = DesignEstimator(n_rep)
my_estimator.fit(X_train, y_train.annotations)

# The scoring helper is given the ground truth with its label column named "y".
label_column = {"annotations": "y"}
train_truth = y_train.rename(columns=label_column)
test_truth = y_test.rename(columns=label_column)

# Accuracy on the data the model has seen vs. the held-out data.
train_score = score_accuracy(train_truth, my_estimator.predict(X_train))
test_score = score_accuracy(test_truth, my_estimator.predict(X_test))
res = dict(n_rep=n_rep, train_score=train_score, test_score=test_score)

In [14]:
# Show the accuracy summary assembled in the previous cell.
res

{'n_rep': 3,
 'test_score': 0.9639711769415532,
 'train_score': 0.9727709556860651}

In [15]:
# Precision and recall on the held-out test split; both are also recorded in
# the running result summary together with the kind of split that was used.
test_truth = y_test.rename(columns={"annotations": "y"})
test_predictions = my_estimator.predict(X_test)
precision, recall = score_precision_recall(test_truth, test_predictions)
res.update(precision=precision, recall=recall, split="random")

In [16]:
# Show the precision measured on the test split.
precision

0.994991652754591

In [17]:
# Show the recall measured on the test split.
recall

0.9854497354497355

In [18]:
# Keep the test-split predictions in a variable so they can be inspected.
y_pred = my_estimator.predict(X_test)

In [19]:
# Preview the first few predictions for the test designs.
y_pred.head()

Unnamed: 0,DesignID,y
3807,http://numismatics.org/ocre/id/ric.3.m_aur.121...,"[(0, 4, OBJECT), (8, 23, PERSON)]"
2137,http://numismatics.org/ocre/id/ric.2.tr.654#re...,"[(9, 15, PERSON)]"
4847,http://numismatics.org/ocre/id/ric.3.ant.1308A...,"[(0, 7, PERSON), (51, 58, PERSON), (99, 105, O..."
3498,http://numismatics.org/ocre/id/ric.3.m_aur.226...,"[(0, 4, OBJECT), (8, 23, PERSON)]"
1978,http://numismatics.org/ocre/id/ric.2.tr.77#obv...,"[(0, 4, OBJECT), (8, 14, PERSON)]"


In [20]:
# Load the designs again from the OCRE database (this time without cutting
# them down to a prefix) and run the trained model on all of them.
ocre_designs = load_ocre_designs()

# predict_clear is used here instead of predict; judging by the rendered
# outputs it reports (entity text, label) pairs -- TODO confirm in cnt.model.
ocre_pred = my_estimator.predict_clear(ocre_designs)


  cursor.execute('SELECT @@tx_isolation')


In [21]:
# Preview the designs that were passed to predict_clear in the previous cell.
ocre_designs.head()

Unnamed: 0,DesignID,DesignEng
0,http://numismatics.org/ocre/id/ric.3.ant.868#o...,"Head of Antoninus Pius, laureate, right"
1,http://numismatics.org/ocre/id/ric.3.ant.1206A...,"Head of Antoninus Pius, laureate, right"
2,http://numismatics.org/ocre/id/ric.3.ant.1206B...,"Head of Antoninus Pius, laureate, right"
3,http://numismatics.org/ocre/id/ric.3.ant.1208A...,"Head of Antoninus Pius, laureate, right"
4,http://numismatics.org/ocre/id/ric.3.ant.1208B...,"Head of Antoninus Pius, laureate, right"


In [22]:
# Put each design's id and text next to the entities predicted for it.
ocre_pred_predictions_only = ocre_pred["y"]
output_columns = {
    "DesignID": ocre_designs["DesignID"],
    "X_test": ocre_designs["DesignEng"],
    "y_predict": ocre_pred_predictions_only,
}
ocre_prediction_output = pd.DataFrame(output_columns)
ocre_prediction_output.head()

Unnamed: 0,DesignID,X_test,y_predict
0,http://numismatics.org/ocre/id/ric.3.ant.868#o...,"Head of Antoninus Pius, laureate, right","[(Head, OBJECT), (Antoninus Pius, PERSON)]"
1,http://numismatics.org/ocre/id/ric.3.ant.1206A...,"Head of Antoninus Pius, laureate, right","[(Head, OBJECT), (Antoninus Pius, PERSON)]"
2,http://numismatics.org/ocre/id/ric.3.ant.1206B...,"Head of Antoninus Pius, laureate, right","[(Head, OBJECT), (Antoninus Pius, PERSON)]"
3,http://numismatics.org/ocre/id/ric.3.ant.1208A...,"Head of Antoninus Pius, laureate, right","[(Head, OBJECT), (Antoninus Pius, PERSON)]"
4,http://numismatics.org/ocre/id/ric.3.ant.1208B...,"Head of Antoninus Pius, laureate, right","[(Head, OBJECT), (Antoninus Pius, PERSON)]"


In [23]:
# Flatten the predictions into one row per recognised entity:
# (DesignID, Entity, Label_Entity). Designs without any entity yield no rows.
# itertuples(index=False, name=None) yields plain value tuples, which avoids
# building a Series per row as iterrows() does.
ner_rows = [
    (str(designid), *relation)
    for designid, relation_list in ocre_pred.itertuples(index=False, name=None)
    for relation in relation_list
]
ocre_ner_output = pd.DataFrame(
    ner_rows, columns=["DesignID", "Entity", "Label_Entity"])

# Reuse the connection URL defined with the entity lists rather than repeating
# the credentials here.
# NOTE: if_exists="replace" drops and recreates the table on every run.
ocre_ner_output.to_sql("ocre_pipeline_ner",
                       mysql_connection,
                       if_exists="replace", index=False)

  cursor.execute('SELECT @@tx_isolation')
