In [148]:
from pprint import pprint
import json

triples = [json.loads(_)["openie"] for _ in open("data/processed/OIE_objects.jsonl")]

triples = [_[0] for _ in triples if len(_) > 0]
triples = triples[0:20]

In [149]:
# Clean, leammatise?
# Noun Phrase
# Word Sense, WordNet
# Paraphrase, PPDB
# Entity Linking
# IDF Token Overlap

ent1_list = list(set([_["subject"] for _ in triples]))
ent2_list = list(set([_["object"] for _ in triples]))

ent_list = list(set().union(ent1_list, ent2_list))
ent_list

['Bitcoin ATMs',
 'crypto law',
 'Lee Jung Hoon',
 'Ripple CEO Brad Garlinghouse',
 'alleged grift',
 'their role',
 'Bahamas Liquidators',
 'Track Stolen User Cash',
 'Lightning Network',
 'Car',
 'after U.S. jobs report',
 '$ 17K as Interest Rates Plunge',
 'US Feds',
 'cautious optimism',
 'Crypto Conglomerate DCG',
 'stablecoins',
 'New Tokens',
 'NFT Collective PROOF Signs',
 'DOJ',
 'FTX',
 'Audi Backed Startup Holoride',
 'Zero Hash',
 'Equities',
 'Congress May Restrict Stock Trading',
 'growth',
 'Morocco',
 'First Instance',
 'Crypto Bank Juno',
 'Next Session',
 'United Talent Agency',
 'definition',
 '2022',
 'Bitcoin',
 'Chains',
 'Polygon Paid Top Solana Projects Y00ts 3M',
 'Hong Kong Brokers',
 'Virtual Asset Trading Law']

In [150]:
import string
import re

def clean_message(message):
    # Be Careful of Spaces
    cleaned = message

    cleaned = cleaned.lower()

    # Specific Symbols: ampersand, greater-than, less-than
    cleaned = re.sub(r"&amp", "and", cleaned)

    # Punctuation
    cleaned = re.sub(r"\b\s’s\s\b", " ", cleaned)
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    # Remove URLs
    cleaned = re.sub(r"http\S+", "", cleaned)

    # Remove Symbols (Bullets, Punctuations, != Alphanumeric)
    cleaned = re.sub(r'[^\w]', ' ', cleaned)
    cleaned = re.sub("<[^>]*>", "", cleaned)
    cleaned = re.sub('[*”".?!,:;•/&+]', ' ', cleaned)
    cleaned = re.sub("([\(\[]).*?([\)\]])", "", cleaned)
    cleaned = re.sub(r'--'," ", cleaned)

    # ?
    cleaned = re.sub(r'(?<=[.,])(?=[^\s])', r' ', cleaned)
    cleaned = re.sub(r"@[^A-Za-z0-9*]+", "", cleaned)
    cleaned = re.sub(' +', ' ', cleaned)
    cleaned = re.sub(r'(\d+),(\d+),?(\d*)', " ", cleaned)

    # Clean spaces
    cleaned = " ".join(cleaned.split())

    return cleaned

ent_map = {}
for id, ent in enumerate(ent_list):
    ent_map[id] = {}
    ent_map[id]["text"] = clean_message(ent)
    ent_map[id]["wn_mask"] = [0] * len(ent_map[id]["text"].split())
    #ent_map[id]["noun_chunks"] = []

ent_map

{0: {'text': 'bitcoin atms', 'wn_mask': [0, 0]},
 1: {'text': 'crypto law', 'wn_mask': [0, 0]},
 2: {'text': 'lee jung hoon', 'wn_mask': [0, 0, 0]},
 3: {'text': 'ripple ceo brad garlinghouse', 'wn_mask': [0, 0, 0, 0]},
 4: {'text': 'alleged grift', 'wn_mask': [0, 0]},
 5: {'text': 'their role', 'wn_mask': [0, 0]},
 6: {'text': 'bahamas liquidators', 'wn_mask': [0, 0]},
 7: {'text': 'track stolen user cash', 'wn_mask': [0, 0, 0, 0]},
 8: {'text': 'lightning network', 'wn_mask': [0, 0]},
 9: {'text': 'car', 'wn_mask': [0]},
 10: {'text': 'after us jobs report', 'wn_mask': [0, 0, 0, 0]},
 11: {'text': '17k as interest rates plunge', 'wn_mask': [0, 0, 0, 0, 0]},
 12: {'text': 'us feds', 'wn_mask': [0, 0]},
 13: {'text': 'cautious optimism', 'wn_mask': [0, 0]},
 14: {'text': 'crypto conglomerate dcg', 'wn_mask': [0, 0, 0]},
 15: {'text': 'stablecoins', 'wn_mask': [0]},
 16: {'text': 'new tokens', 'wn_mask': [0, 0]},
 17: {'text': 'nft collective proof signs', 'wn_mask': [0, 0, 0, 0]},
 18:

In [151]:
import random
from nltk.wsd import lesk

for _ in ent_map:
    sent = ent_map[_]["text"]
    tokens = sent.split()

    for tok in range(0, len(tokens)):
        res = lesk(sent, tokens[tok])

        # TODO: Make '98' Robust to change
        if len(dir(res)) == 98:
            ent_map[_]["wn_mask"][tok] = res.lemma_names()[0]

In [152]:
ent_map

{0: {'text': 'bitcoin atms', 'wn_mask': [0, 'standard_atmosphere']},
 1: {'text': 'crypto law', 'wn_mask': [0, 'law']},
 2: {'text': 'lee jung hoon', 'wn_mask': ['lee', 'Jung', 0]},
 3: {'text': 'ripple ceo brad garlinghouse',
  'wn_mask': ['ripple', 'chief_executive_officer', 'brad', 0]},
 4: {'text': 'alleged grift', 'wn_mask': ['alleged', 0]},
 5: {'text': 'their role', 'wn_mask': [0, 'role']},
 6: {'text': 'bahamas liquidators', 'wn_mask': ['Bahamas', 'murderer']},
 7: {'text': 'track stolen user cash',
  'wn_mask': ['track', 'steal', 'user', 'cash']},
 8: {'text': 'lightning network', 'wn_mask': ['lightning', 'network']},
 9: {'text': 'car', 'wn_mask': ['car']},
 10: {'text': 'after us jobs report',
  'wn_mask': ['subsequently', 'uranium', 'speculate', 'reputation']},
 11: {'text': '17k as interest rates plunge',
  'wn_mask': [0, 'deoxyadenosine_monophosphate', 'sake', 'rates', 'plunge']},
 12: {'text': 'us feds', 'wn_mask': ['uranium', 'Federal_Reserve_System']},
 13: {'text': 'c

In [153]:
# Entity Link
# Stanza Pipeline
import stanza
nlp = stanza.Pipeline(lang="en", processors="tokenize, mwt, pos, ner")

2023-01-12 13:31:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-01-12 13:31:12 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| ner       | ontonotes |

2023-01-12 13:31:12 INFO: Use device: cpu
2023-01-12 13:31:12 INFO: Loading: tokenize
2023-01-12 13:31:12 INFO: Loading: pos
2023-01-12 13:31:13 INFO: Loading: ner
2023-01-12 13:31:13 INFO: Done loading processors!


In [154]:
for _ in range(0, len(ent_map) - 1):
    sent = ent_map[_]["text"]
    ent_map[_]["ner_mask"] = [0] * len(sent.split())

    doc = nlp(sent)

    for id, sent in enumerate(doc.sentences):
        for ent in sent.ents:
            ent_map[_]["ner_mask"][id] = (ent.text, ent.type)

In [155]:
ent_map

{0: {'text': 'bitcoin atms',
  'wn_mask': [0, 'standard_atmosphere'],
  'ner_mask': [0, 0]},
 1: {'text': 'crypto law', 'wn_mask': [0, 'law'], 'ner_mask': [0, 0]},
 2: {'text': 'lee jung hoon',
  'wn_mask': ['lee', 'Jung', 0],
  'ner_mask': [0, 0, 0]},
 3: {'text': 'ripple ceo brad garlinghouse',
  'wn_mask': ['ripple', 'chief_executive_officer', 'brad', 0],
  'ner_mask': [0, 0, 0, 0]},
 4: {'text': 'alleged grift', 'wn_mask': ['alleged', 0], 'ner_mask': [0, 0]},
 5: {'text': 'their role', 'wn_mask': [0, 'role'], 'ner_mask': [0, 0]},
 6: {'text': 'bahamas liquidators',
  'wn_mask': ['Bahamas', 'murderer'],
  'ner_mask': [0, 0]},
 7: {'text': 'track stolen user cash',
  'wn_mask': ['track', 'steal', 'user', 'cash'],
  'ner_mask': [0, 0, 0, 0]},
 8: {'text': 'lightning network',
  'wn_mask': ['lightning', 'network'],
  'ner_mask': [0, 0]},
 9: {'text': 'car', 'wn_mask': ['car'], 'ner_mask': [0]},
 10: {'text': 'after us jobs report',
  'wn_mask': ['subsequently', 'uranium', 'speculate', 

In [156]:
# Spotlight
import requests
from IPython.core.display import display, HTML

spotlight_url = "http://api.dbpedia-spotlight.org/en/annotate"

headers = {
    "accept": "application/json"
}

NEL = {}

for _ in range(0, len(ent_map)):
    NEL[_] = {}

    params = {
        "text": ent_map[_]["text"],
        "confidence": 0.25
    }
    try:
        res = requests.get(spotlight_url, params=params, headers=headers)
        NEL[_]["spotlight"] = res.json()

    except:
        NEL[_]["spotlight"] = "NIL"
        continue

  from IPython.core.display import display, HTML


In [157]:
pprint(NEL)

{0: {'spotlight': {'@confidence': '0.25',
                   '@policy': 'whitelist',
                   '@sparql': '',
                   '@support': '0',
                   '@text': 'bitcoin atms',
                   '@types': '',
                   'Resources': [{'@URI': 'http://dbpedia.org/resource/Bitcoin_ATM',
                                  '@offset': '0',
                                  '@percentageOfSecondRank': '0.0',
                                  '@similarityScore': '0.9999999999999964',
                                  '@support': '17',
                                  '@surfaceForm': 'bitcoin atms',
                                  '@types': ''}]}},
 1: {'spotlight': {'@confidence': '0.25',
                   '@policy': 'whitelist',
                   '@sparql': '',
                   '@support': '0',
                   '@text': 'crypto law',
                   '@types': '',
                   'Resources': [{'@URI': 'http://dbpedia.org/resource/Cryptography',
   

In [160]:
# TODO: Mask
# TODO: Advanced Meta Data

def locate(text, surface_form):
    items = text.split()
    len_ = len(surface_form.split())

    for idx, _ in enumerate(items):
        if len_ > 1:
            surface_form = surface_form.split()[0]

        if _.startswith(surface_form):
            return (idx, idx + len_)

        else: None

count = 0
for _ in range(0, len(ent_map)):
    count += 1

    if NEL[_]["spotlight"] == "NIL":
        continue

    else:

        link_text = ent_map[_]["text"].split()

        if "Resources" in NEL[_]["spotlight"]:
            for ent in NEL[_]["spotlight"]["Resources"]:
                surface_form = ent["@surfaceForm"]
                uri = ent["@URI"]

                try:
                    start, finish = locate(ent_map[_]["text"], surface_form)

                except:
                    continue

                if start != finish:
                    # Slice Assignment:
                    link_text[start: finish] = [f"[{surface_form}, {uri}]"]

                elif start == finish:
                    link_text[start] = f"[{surface_form}, {uri}]"

            disambiguated = " ".join(_ for _ in link_text)
            ent_map[_]["NEL_text"] = disambiguated
            print(ent_map[_]["text"])
            print(disambiguated, "\n")

        else: continue

bitcoin atms
[bitcoin atms, http://dbpedia.org/resource/Bitcoin_ATM] 

crypto law
[crypto, http://dbpedia.org/resource/Cryptography] [law, http://dbpedia.org/resource/Law] 

lee jung hoon
[lee jung, http://dbpedia.org/resource/Lee_Jung] [jung hoon, http://dbpedia.org/resource/Jung_Hoon] 

ripple ceo brad garlinghouse
[ripple, http://dbpedia.org/resource/Ripple_marks] [ceo, http://dbpedia.org/resource/Chief_executive_officer] [brad garlinghouse, http://dbpedia.org/resource/Brad_Garlinghouse] 

alleged grift
[alleged, http://dbpedia.org/resource/Allegation] [grift, http://dbpedia.org/resource/Confidence_trick] 

bahamas liquidators
[bahamas, http://dbpedia.org/resource/The_Bahamas] [liquidators, http://dbpedia.org/resource/Liquidation] 

track stolen user cash
[track, http://dbpedia.org/resource/Track_and_field] [stolen, http://dbpedia.org/resource/Stolen_base] [user, http://dbpedia.org/resource/Wikipedia] [cash, http://dbpedia.org/resource/Cash] 

lightning network
[lightning, http://db

In [161]:
ent_map

{0: {'text': 'bitcoin atms',
  'wn_mask': [0, 'standard_atmosphere'],
  'ner_mask': [0, 0],
  'NEL_text': '[bitcoin atms, http://dbpedia.org/resource/Bitcoin_ATM]'},
 1: {'text': 'crypto law',
  'wn_mask': [0, 'law'],
  'ner_mask': [0, 0],
  'NEL_text': '[crypto, http://dbpedia.org/resource/Cryptography] [law, http://dbpedia.org/resource/Law]'},
 2: {'text': 'lee jung hoon',
  'wn_mask': ['lee', 'Jung', 0],
  'ner_mask': [0, 0, 0],
  'NEL_text': '[lee jung, http://dbpedia.org/resource/Lee_Jung] [jung hoon, http://dbpedia.org/resource/Jung_Hoon]'},
 3: {'text': 'ripple ceo brad garlinghouse',
  'wn_mask': ['ripple', 'chief_executive_officer', 'brad', 0],
  'ner_mask': [0, 0, 0, 0],
  'NEL_text': '[ripple, http://dbpedia.org/resource/Ripple_marks] [ceo, http://dbpedia.org/resource/Chief_executive_officer] [brad garlinghouse, http://dbpedia.org/resource/Brad_Garlinghouse]'},
 4: {'text': 'alleged grift',
  'wn_mask': ['alleged', 0],
  'ner_mask': [0, 0],
  'NEL_text': '[alleged, http://db

In [129]:
# import sparql
# import ssl
#
# # INIT SSL
# if hasattr(ssl, '_create_unverified_context'):
#     ssl._create_default_https_context = ssl._create_unverified_context
#
# #entityTypes = ['PERSON','NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW']
# #[tok, tok, tok]
#
# def wikification(entities):
#     s = sparql.Service("http://dbpedia.org/sparql", "GET")
#
#     found_num = 0
#     wikified = [0] * len(entities)
#
#     for _ in range(0, len(entities)):
#         entity = entities[_]
#
#         if entity != 0:
#             searchText = entities[_].replace('"', '')
#             curQuery = 'SELECT DISTINCT * WHERE '
#             curQuery = curQuery + '{ ?url rdfs:label "' + searchText + '"@en . '
#             curQuery = curQuery + 'FILTER(STRSTARTS(str(?url), "http://dbpedia.org/resource/"))}'
#             results = s.query(curQuery).fetchall()
#
#             if len(results) > 0:
#                 found_num = found_num + 1
#                 for result in results:
#                     resourceName = sparql.unpack_row(result)[0]
#
#                     if 'Category:' not in resourceName and 'property' not in resourceName:
#                         print(resourceName)
#                         wikified[_] = resourceName
#
#             return wikified

In [130]:
# # Entity Link
#
# for _ in range(0, len(ent_map) - 1):
#     ner_mask = ent_map[_]["ner_mask"]
#     wn_mask = ent_map[_]["wn_mask"]
#
#     combined_ents = [0] * len(ner_mask)
#     for i, (ner_mask, wn_tok) in enumerate(zip(ner_mask, wn_mask)):
#         if type(ner_mask) == tuple:
#             ner_tok, type_ = ner_mask
#
#         else: ner_tok = 0
#
#         if ner_tok == 0 and wn_tok == 0:
#             combined_ents[i] = 0
#
#         elif ner_tok != 0 and wn_tok == 0:
#             combined_ents[i] = str(ner_tok)
#
#         else:
#             combined_ents[i] = str(wn_tok)
#
#
#     ent_map[_]["wiki_link"] = wikification(combined_ents)

http://dbpedia.org/resource/Bahamas
http://dbpedia.org/resource/17


In [62]:
# import spacy
# nlp = spacy.load("en_core_web_sm")
#
# def noun_chunks(text):
#     doc = nlp(text)
#
#     chunks = []
#     for chunk in doc.noun_chunks:
#         chunks.append(chunk.text)
#
#     return " ".join(_ for _ in chunks)
#
# for k, v in ent_map.items():
#     ent_map[k]["noun_chunks"].append(noun_chunks(v["text"]))
#
# pprint(ent_map)

{0: {'noun_chunks': ['bitcoin atms'],
     'text': 'bitcoin atms',
     'wn_mask': [0, 0]},
 1: {'noun_chunks': ['crypto law'], 'text': 'crypto law', 'wn_mask': [0, 0]},
 2: {'noun_chunks': ['lee jung hoon'],
     'text': 'lee jung hoon',
     'wn_mask': [0, 0, 0]},
 3: {'noun_chunks': ['ripple ceo brad garlinghouse'],
     'text': 'ripple ceo brad garlinghouse',
     'wn_mask': [0, 0, 0, 0]},
 4: {'noun_chunks': ['alleged grift'],
     'text': 'alleged grift',
     'wn_mask': [0, 0]},
 5: {'noun_chunks': ['their role'], 'text': 'their role', 'wn_mask': [0, 0]},
 6: {'noun_chunks': ['bahamas liquidators'],
     'text': 'bahamas liquidators',
     'wn_mask': [0, 0]},
 7: {'noun_chunks': ['stolen user cash'],
     'text': 'track stolen user cash',
     'wn_mask': [0, 0, 0, 0]},
 8: {'noun_chunks': ['lightning network'],
     'text': 'lightning network',
     'wn_mask': [0, 0]},
 9: {'noun_chunks': ['car'], 'text': 'car', 'wn_mask': [0]},
 10: {'noun_chunks': ['us jobs'],
      'text': 'a

In [None]:
# sample = random.randint(1, len(ent_map))
#
# sent = ent_map[sample]["text"].split()
# ent_map["ner_mask"] = [0] * len(sent)
# doc = nlp(ent_map[sample]["text"])
#
# ents = []
# for sent in doc.sentences:
#     for ent in sent.ents:
#         ents.append(ent.text, ent.type)

In [4]:
# TODO: More Noun Patterns
# # Noun Phrase Chunking
# import nltk
# from nltk.chunk import RegexpParser
# from nltk import word_tokenize, pos_tag
#
# def reg_chunker(sent, expression = r'NP: {<NN.?>+<NN.?>}'):
#     sent = pos_tag(word_tokenize(sent))
#     cp = nltk.RegexpParser(expression)
#     chunked = cp.parse(sent)
#
#     chunks = []
#     for chunk in chunked.subtrees(filter=lambda t: t.label() == "NP"):
#         chunks.append(str(chunk))
#
#     return " ".join(_ for _ in chunks)

In [None]:
# # Word Sense Dismbiguation
# from nltk.wsd import lesk
#
# sample = ent_map[0]
# sent = sample["text"]
#
# def word_sense(sent):
#     doc = nlp(sent)
#
#     sysnets = []
#     for chunk in doc.noun_chunks:
#         res = lesk(sent, chunk.text)
#
#     return " ".join(_ for _ in sysnets)
#
# word_sense(sent)