In [14]:
# Execute the companion notebook in this kernel's namespace.
# NOTE(review): search_wikipedia / query_wikipedia used further down are not
# defined in this notebook, so they presumably come from here — confirm.
%run wikidata_functions.ipynb

3


In [6]:
import pandas as pd
import numpy as np
import spacy
from spacy.matcher import Matcher 
import spacy_universal_sentence_encoder
import claucy   
from bs4 import BeautifulSoup

2023-12-09 07:23:36.086263: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
import re
import random
import pickle
import time
import functools

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# Property identifiers for which a pre-trained classifier is available
# (presumably Wikidata property IDs — confirm against the training notebook).
ps = ['P530', 'P463', 'P150', 'P2936']

# Load one pickled classifier per property, keyed by property ID.
# SECURITY: pickle.load can execute arbitrary code; only load model files
# that were produced by a trusted source.
extrs = {}
for p in ps:
    # Context manager guarantees the file handle is closed after loading.
    with open(f'models/logreg_{p}_2000.pickle', 'rb') as model_file:
        extrs[p] = pickle.load(model_file)

In [10]:
# Main spaCy pipeline used for tokenisation, POS tags and sentence splitting.
nlp = spacy.load('en_core_web_md')
# Register claucy's component on that pipeline (must happen after loading it).
claucy.add_to_pipe(nlp)
# Second pipeline; sentences_to_Xsents reads `.vector` from its output to get
# the sentence embedding.
use= spacy.load('en_use_md')

In [11]:
def spacy_pos_map2(pos):
    """Map a part-of-speech tag string to a coarse integer category.

    Both universal POS names (e.g. "NOUN") and Penn-Treebank-style tags
    (e.g. "NNS") are recognised.

    Categories:
        1 - nouns / pronouns / proper nouns
        2 - verbs (including modals)
        3 - adverbs and particles
        4 - adjectives
        5 - numerals
        6 - foreign words ("FW")
        7 - anything else

    Args:
        pos: tag string to classify.

    Returns:
        int in the range 1..7.
    """
    # "NNP" (singular proper noun) was missing from the original list, which
    # only had the non-standard "NND"; "NND" is kept so existing inputs still
    # map to the same category.
    if pos in ("NOUN", "PRON", "PROPN", "NN", "NND", "NNP", "NNPS", "NNS", "PRP", "PRP$"):
        return 1
    elif pos in ("VERB", "MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"):
        return 2
    elif pos in ("ADV", "RB", "RBR", "RBS", "RP"):
        return 3
    elif pos in ("ADJ", "JJ", "JJR", "JJS"):
        return 4
    elif pos in ("NUM", "CD"):
        return 5
    elif pos == "FW":
        return 6
    else:
        return 7 # everything else

In [12]:
def sentences_to_Xsents(sents, labels, nlp, use, limit=50):
    """Build a feature matrix for the sentences that fit within `limit` tokens.

    Each kept sentence becomes one row laid out as:
        [0, limit)          token codes (UTF-8 bytes of the token text read as
                            a big-endian integer; stored in a float array, so
                            long tokens are not represented exactly)
        [limit, 2*limit)    coarse POS category from spacy_pos_map2(tok.pos_)
        [2*limit, end)      512 columns filled from `use(sent).vector`
    Unused token / POS slots stay 0.

    Args:
        sents: sequence of sentence strings.
        labels: sequence of the same length as `sents`; only its length is
            checked here, the values are not used.
        nlp: callable returning an iterable of tokens for a sentence.
        use: callable returning an object with a `.vector` attribute.
        limit: maximum token count; longer sentences are skipped.

    Returns:
        (X, kept) where X is the feature matrix with one row per kept
        sentence and kept is an object array holding those sentences, in
        input order.
    """
    assert(len(sents) == len(labels))

    total = len(sents)
    # words + pos + universal sentence encoding
    width = 2 * limit + 512

    X = np.zeros((total, width))
    kept = np.empty(total, dtype=object)

    row = 0
    for sent in sents:
        doc = nlp(sent)
        if len(doc) <= limit:  # longer sentences are dropped
            features = X[row]  # view: writes land directly in X
            for k, tok in enumerate(doc):
                features[k] = int.from_bytes(str(tok).encode(), 'big')
                features[limit + k] = spacy_pos_map2(tok.pos_)

            # universal sentence encoding
            features[2*limit:] = use(sent).vector

            kept[row] = sent
            row += 1

    return X[:row], kept[:row]

In [15]:
# list of entities to query
entities = ["United States"]

# Resolve each entity to the first Wikipedia search result, if there is one.
articles = []
for e in entities:
    l = search_wikipedia(e)
    # print(l)
    if l:
        articles.append(l[0])

print(articles)

# Fetch page text. Failures are reported and skipped (best effort), but only
# ordinary exceptions are caught so Ctrl-C / kernel interrupt still works.
contents = []
for a in articles:
    try:
        page = query_wikipedia(a)
        contents.append(page.content)
    except Exception as err:
        print(f"{a} page not found ({err})")

# a little cleaning
# NOTE(review): X and sents are rebound on every iteration, so after the loop
# only the last article's features remain; with no contents they are never
# defined. Later cells read X and sents.
for c in contents:
    i = c.find("== References ==")
    # str.find returns -1 when the marker is missing; slicing with -1 would
    # silently drop the last character, so keep the full text in that case.
    c_ = c[:i] if i != -1 else c
    c_ = re.sub(r'==.*==', '', c_) # get rid of wikipedia section headers
    c_ = re.sub(r'\([^\)]*\)', '', c_) # get rid of everything between parentheses

    doc = nlp(c_)
    sents = [str(s).replace("\n", "") for s in doc.sents]

    # for s in list(sents)[:5]:
    #     doc_s = nlp(s)
    #     print(s)
    #     propositions = doc_s._.clauses[0].to_propositions(as_text=True)
    #     print(propositions)

    # Labels are placeholders: sentences_to_Xsents only checks their count.
    X, sents = sentences_to_Xsents(sents, len(sents)*[False], nlp, use, limit=50)
    # for s in sents:
        # doc_s = nlp(s)
        # ent1, ent2 = get_entities(doc_s)
        # rel = get_relation(doc_s)

        # try:
        #     Q1 = get_qid(ent1)
        #     Q2 = get_qid(ent2)
        #     P = get_pid_from_str(rel)
        # except:
        #     continue

        # # print(Q1, P, Q2)

        # if check_triple(Q1, P, Q2):
        #     x.append(s)
        #     y.append(True)
        # elif check_sp(Q1, P):
        #     # print(Q1, P, Q2)
        #     x.append(s)
        #     y.append(False)
        # else:
        #     print(Q1, Q2, P)
        #     print(ent1, ent2, rel)
        #     pass

        

['United States']


In [16]:
# Run each per-property classifier over the shared feature matrix X and keep
# its predictions keyed by property ID.
y_preds = {prop: extrs[prop].predict(X) for prop in ps}

In [19]:
# Print the first 20 kept sentences, each followed by the list of properties
# whose classifier predicted a truthy value for that sentence.
for row, sentence in enumerate(sents[:20]):
    print(sentence)
    print([prop for prop in ps if y_preds[prop][row]])

The United States of America , commonly known as the United States  or America, is a country primarily located in North America.
['P463']
It consists of 50 states, a federal district, five major unincorporated territories, and nine Minor Outlying Islands.
['P463', 'P150', 'P2936']
It includes 326 Indian reservations.
['P463']
The U.S. is the world's third-largest country by land area, and by total area.
['P150', 'P2936']
It shares land borders with Canada to its north and with Mexico to its south and has maritime borders with the Bahamas, Cuba, Russia, and other nations.
['P530', 'P463', 'P150']
With a population of over 333 million, it is the most populous country in the Americas and the third-most populous in the world.
['P463', 'P150']
The national capital of the United States is Washington, D.C., and its most populous city and principal financial center is New York City.
['P463', 'P2936']
Indigenous peoples have inhabited the Americas for thousands of years.
['P530', 'P463', 'P150'

In [71]:
# Accumulator that the next cell appends one list of per-property rates to.
# Re-running this cell discards everything collected so far.
hit_rates = []

In [89]:
# Share of kept sentences flagged by each classifier, in the order of `ps`,
# appended as one new row of hit_rates.
hr = [sum(y_preds[prop]) / len(sents) for prop in ps]
hit_rates.append(hr)

In [1]:
# Hard-coded hit rates; this assignment replaces whatever hit_rates held.
# NOTE(review): each row has four values — presumably one per property in
# `ps`, with one row per entity that was run earlier — confirm provenance.
hit_rates = [[0.4543269230769231,
  0.8533653846153846,
  0.5889423076923077,
  0.4254807692307692],
 [0.4004237288135593, 0.7521186440677966, 0.5190677966101694, 0.375],
 [0.42430703624733473,
  0.8656716417910447,
  0.6140724946695096,
  0.3837953091684435],
 [0.43410852713178294,
  0.8656330749354005,
  0.6149870801033591,
  0.3695090439276486]]

In [3]:
# Round every rate to two decimals, replacing each row of hit_rates in place.
for row_idx, row in enumerate(hit_rates):
    hit_rates[row_idx] = [round(rate, 2) for rate in row]

In [4]:
# Display the rounded hit-rate table as the cell output.
hit_rates

[[0.45, 0.85, 0.59, 0.43],
 [0.4, 0.75, 0.52, 0.38],
 [0.42, 0.87, 0.61, 0.38],
 [0.43, 0.87, 0.61, 0.37]]