In [1]:
import os
import json

# Move the working directory up one level, presumably so the relative
# 'data/...' paths used in later cells resolve from the project root.
# NOTE(review): the path is relative to the current directory, so re-running
# this cell moves up one more level each time.
os.chdir('..')
# Show which seed files are available under data/seeds.
os.listdir('data/seeds')

['adversary-org.jsonl']

### Read data 

In [2]:
# Load the seed annotations: every line of the file is one UTF-8 encoded JSON document.
with open('data/seeds/adversary-org.jsonl', 'rb') as f:
    lines = [json.loads(raw.decode('utf-8')) for raw in f]
# Keep the first record around for quick inspection.
line = lines[0]

In [3]:
# Size of the first parsed record (for a JSON object this is its number of keys).
len(line)

3

### Convert into span format

In [4]:
import torch
from transformers import AdamW

from src.data import Data_Handler
from src.loss import intra_doc_loss
from src.model import Sparta_Ent

# Project helpers: annotation handler, span-encoding model and its optimizer.
handler = Data_Handler()
model = Sparta_Ent()
optimizer = AdamW(params=model.parameters(), lr=5e-5)
optimizer.zero_grad()

# Flatten the span references produced for every annotated record.
# The loop variable is deliberately not named `line`, so the `line = lines[0]`
# binding from the loading cell is not overwritten by this cell.
references = []
for record in lines:
    references.extend(handler.process_spacy_annot(record))

# De-duplicate the document texts while keeping first-seen order.
# list(set(...)) would order the strings differently between kernel sessions
# (string hashing is randomized), making unique_texts[0] and the training
# order non-reproducible; dict.fromkeys preserves insertion order.
unique_texts = list(dict.fromkeys(reference.doc.text for reference in references))
references

[<src.data.Doc_Span at 0x1da07e4ae08>,
 <src.data.Doc_Span at 0x1da07e4ae88>,
 <src.data.Doc_Span at 0x1da06b4bd88>,
 <src.data.Doc_Span at 0x1da06b4b7c8>,
 <src.data.Doc_Span at 0x1da06b4b348>,
 <src.data.Doc_Span at 0x1da07b507c8>,
 <src.data.Doc_Span at 0x1da06b53808>,
 <src.data.Doc_Span at 0x1da09b631c8>,
 <src.data.Doc_Span at 0x1da09b63648>]

### Encode the answer spans

In [6]:
# Pick the first unique document text for inspection.
# NOTE(review): the training loop in the next cell rebinds `answer_text` on every iteration.
answer_text = unique_texts[0]

In [7]:
# Fine-tune the span encoder: for each epoch, take one optimizer step per unique
# document text, treating that document's spans as answers and all spans from
# the other documents as references.
N_EPOCHS = 4
# Normalization constant for the loss (square of the total number of reference spans).
loss_scale = len(references) ** 2

for i in range(N_EPOCHS):
    for answer_text in unique_texts:
        # Gradients are cleared before every step so none leak between documents.
        optimizer.zero_grad()

        answer_spans = [reference for reference in references if reference.doc.text == answer_text]
        reference_spans = [reference for reference in references if reference.doc.text != answer_text]

        loss = intra_doc_loss(model, answer_spans, reference_spans, max_span_width=4)
        # The original statement `loss/(len(references)**2)` discarded its result,
        # so the loss was never actually normalized; assign it back before backprop.
        loss = loss / loss_scale
        loss.backward()

        optimizer.step()

# Leave the optimizer with cleared gradients, as the original loop did.
optimizer.zero_grad()

### Ad-hoc experiments

In [14]:
{"text": "The activity of the advanced hacker group the researchers call Silence has increased significantly over the past year. Victims in the financial sector are scattered across more than 30 countries and financial losses have quintupled.\n The group started timidly in 2016, learning the ropes by following the path beaten by other hackers. Since then, it managed to steal at least $4.2 million, initially from banks in the former Soviet Union, then from victims in Europe, Latin America, Africa, and Asia.\n Researchers at Group-IB, Singapore-based cybersecurity company specializing in attack prevention, tracked Silence early on and judged its members to be familiar with white-hat security activity.\n A report last year details the roles of Silence hackers, their skills, failures, and successful bank heists.", "meta": {"source": "bleeping-628-0"}, "spans": [{"start": 134, "end": 150, "label": "ENT", "token_start": 22, "token_end": 24, "_private_string": "financial sector"}]}

{'text': 'The activity of the advanced hacker group the researchers call Silence has increased significantly over the past year. Victims in the financial sector are scattered across more than 30 countries and financial losses have quintupled.\n The group started timidly in 2016, learning the ropes by following the path beaten by other hackers. Since then, it managed to steal at least $4.2 million, initially from banks in the former Soviet Union, then from victims in Europe, Latin America, Africa, and Asia.\n Researchers at Group-IB, Singapore-based cybersecurity company specializing in attack prevention, tracked Silence early on and judged its members to be familiar with white-hat security activity.\n A report last year\xa0details the roles of Silence hackers, their skills, failures, and successful bank heists.',
 'meta': {'source': 'bleeping-628-0'},
 'spans': [{'start': 134,
   'end': 150,
   'label': 'ENT',
   'token_start': 22,
   'token_end': 24,
   '_private_string': 'financial s

In [8]:
# Hand-written reference sentence used as the query-by-example anchor.
ref_text = "The APT10 group (also known as Red Tears) is responsible and is linked to the North Korean nexus, and is charged with discharging the CryptoMix virus, retrieving money, phone numbers, details and credentials"
# Run the sentence through the project's preprocessing to get a document object.
ref_doc = handler.process_sentence(ref_text)
# Encode the span (1, 2) of the reference document; later cells compare candidate
# spans against this encoding.
# NOTE(review): presumably (1, 2) are token indices selecting "APT10" — confirm the
# indexing convention (inclusive vs. exclusive end) used by encode_span.
reference_encoding = model.encode_span(ref_doc, (1,2))

In [21]:
{"text": "According to documents added to an amended complaint filed on January 17, a Russian intelligence-coordinated phishing attack allegedly targeted the Democratic National Committee (DNC) just a few days after the 2018 midterms.\n Moreover, as detailed in the court documents, \"On November 14, 2018, dozens of DNC email addresses were targeted in a spear-phishing campaign, although there is no evidence that the attack was successful.\"\n The documents were filed as part of a lawsuit against Russia's government, as well as the Trump campaign, for an alleged hack which led to a trove of internal DNC emails being stolen and disclosed during 2016.\n As revealed by the DNC, multiple links are connecting the actor behind the November phishing attack with a Russian hacker group known as Cozy Bear (also classified as APT29, Office Monkeys, CozyCar, The Dukes, CozyDuke, or Grizzly Steppe).\n Cozy Bear connected to attacks against U.S. targets in 2014\n Evidence found by Kaspersky's Kurt Baumgartner and CostiRaiu back in 2015 shows that Cozy Bear has been previously connected to attacks targeting both commercial and government entities from Germany, South Korea, Uzbekistan, and the USA, including the White House and the US State Department in 2014.\n", "meta": {"source": "bleeping-1436-10"}, "spans": [{"start": 663, "end": 666, "label": "ENT", "token_start": 131, "token_end": 132, "_private_string": "DNC"}]}

{'text': 'According to documents added to an amended complaint filed on January 17, a Russian intelligence-coordinated phishing attack allegedly targeted the Democratic National Committee (DNC) just a few days after the 2018\xa0midterms.\n Moreover, as detailed in the court documents, "On November 14, 2018, dozens of DNC email addresses were targeted in a spear-phishing campaign, although there is no evidence that the attack was successful."\n The documents were filed as part of a lawsuit against Russia\'s government, as well as\xa0the Trump campaign, for an alleged hack which led to a\xa0trove of internal DNC emails being stolen and disclosed during 2016.\n As revealed by the DNC, multiple links are connecting the actor behind the November phishing attack with a Russian hacker group known as\xa0Cozy Bear (also classified as APT29, Office Monkeys, CozyCar, The Dukes, CozyDuke, or Grizzly Steppe).\n Cozy Bear connected to attacks against U.S. targets in 2014\n Evidence found by Kaspersk

In [22]:
from allennlp.data.dataset_readers.dataset_utils.span_utils import enumerate_spans

text = 'We believe TEMP.Mana, a Chinese cyber espionage group, is linked to infrastructure spoofing domains of at least two U.S. chemical manufacturers. Similar activity suspected of being tied to TEMP.Mana reinforces the risk to the chemical sector and related industries.'
# text = "During a compromise of a hospitality organization, which we attribute to FIN7, the attackers deployed various malicious payloads, including CARBANAK, BABYMETAL, and the PILLOWMINT point-of-sale (POS) malware. Notably, in this incident, FIN7 leveraged a seemingly compromised email account belonging one of the victim's suppliers to send a password-protected GRIFFON document to a restaurant manager. FIN7's shift to engaging in third-party compromise attacks marks a significant evolution in the group's operations and the manner in which organizations need to protect themselves against their attacks."
text = "GREF Team is an innovative China-nexus cyber espionage operator that focuses on targeting energy, video game development studios, and high-tech companies in the U.S., South Korea, the Netherlands, Italy, France, and Japan. The group leverages a large library of malware tools, compromised digital certificates, and exploits. Besides being linked to a common supplier shared with other Chinese cyber espionage groups, GREF Team is believed to distribute its tools to other distinct China-based actors."
# text = "In 1905, a year sometimes described as his annus mirabilis ('miracle year'), Einstein published four groundbreaking papers.[12] These outlined the theory of the photoelectric effect, explained Brownian motion, introduced special relativity, and demonstrated mass-energy equivalence. Einstein thought that the laws of classical mechanics could no longer be reconciled with those of the electromagnetic field, which led him to develop his special theory of relativity. He then extended the theory to gravitational fields; he published a paper on general relativity in 1916, introducing his theory of gravitation. In 1917, he applied the general theory of relativity to model the structure of the universe."
# text = "In May 2019, FireEye devices detected and blocked a spear-phishing campaign likely targeting Ukrainian government entities distributing the ARMEDCLOUD malware attributed to TEMP.Armageddon. Additional samples and related infrastructure were subsequently uncovered. This newest version of ARMEDCLOUD contains anti-forensic features."
# text = 'A Chinese threat group was using hacking tools developed by the NSA more than a year before Shadow Brokers leaked them in April 2017, tools that were later used in highly destructive attacks such as the WannaCry ransomware\xa0campaign from May 2017.\n The Buckeye threat group (also known to researchers as Gothic Panda,\xa0TG-0110, UPS, and APT3) has been active since at least 2010, it is credited by experts for running\xa0Operation Clandestine Fox, Operation Clandestine Wolf, and Operation Double Tap\xa0[1, 2, 3], and for mainly attacking U.S. entities with a sudden switch to Hong Kong targets back in 2015.\n The indictment of three APT3 members by the U.S. government in November 2017 is the thing that really brought the group in the spotlight, with the three Chinese hackers being accused of infiltrating the computing systems of\xa0Moody’s Analytics, Siemens, and Trimble\xa0between 2011 and May 2017.\n As unearthed by Symantec, the Chinese-backed Buckeye was using NSA hacking tools 13 months before they were leaked by Shadow Brokers—the hacking group who stole them—in April 2017, together with a "previously unknown Windows zero-day vulnerability that Symantec discovered (which has since been patched by Microsoft)."\n\n Starting with March 2016,\xa0the NSA\xa0DoublePulsar backdoor was detected\xa0as part of Buckeye campaigns, while being dropped with the help of the Bemstour Trojan, a malware dropper specifically created by the group to deliver the NSA malware payload.\n Symantec discovered that\xa0the variant used by Buckeye during their attacks was newer than the one leaked by Shadow Brokers, with an extra layer of obfuscation which might indicate that the Chinese hackers customized it before deployment on their victims\' systems.\n'
text = "The activity of the advanced hacker group the researchers call Silence has increased significantly over the past year. Victims in the financial sector are scattered across more than 30 countries and financial losses have quintupled.\n The group started timidly in 2016, learning the ropes by following the path beaten by other hackers. Since then, it managed to steal at least $4.2 million, initially from banks in the former Soviet Union, then from victims in Europe, Latin America, Africa, and Asia.\n Researchers at Group-IB, Singapore-based cybersecurity company specializing in attack prevention, tracked Silence early on and judged its members to be familiar with white-hat security activity.\n A report last year\xa0details the roles of Silence hackers, their skills, failures, and successful bank heists"
text = "Security researchers have discovered an ongoing cryptojacking campaign which infects unpatched computers of businesses from all over the world with XMRig Monero miners using Equation group's leaked exploit toolkit.\n The cybercriminals behind this cryptomining campaign use the NSA-developed EternalBlue and EternalChampion SMB exploits to compromise vulnerable Windows computers, exploits which were leaked by the Shadow Brokers hacker group in April 2017.\n While Microsoft patched the security flaws these tools abused to break into Windows machines [1, 2, 3], there are still a lot of exposed computers because they haven't been updated to newer OS versions not being impacted by these very dangerous vulnerabilities.\n The campaign's targets\n \"The campaign seems to be widespread, with targets located in all regions of the world. Countries with large populations such as China and India also had the most number of organizations being targeted,\" say Trend Micro's researchers, the ones who unearthed this ongoing cryptojacking campaign targeting companies from all over the world.\n"
text = 'The Buhtrap hacking group has switched its targets from Rusian financial businesses and institutions since December 2015 when it moved into cyber-espionage operations, culminating with the use of a recently patched Windows zero-day during June 2019.\n The\xa0Windows local privilege escalation 0-day vulnerability tracked as\xa0CVE-2019-1132\xa0and abused by Buhtrap as part of its attacks was fixed by Microsoft during this month\'s Patch Tuesday and it allowed the cyber-crime group to run\xa0arbitrary code in kernel mode after successful exploitation.\n Even though actively targeting banking clients since 2014,\xa0Buhtrap\'s attacks were only detected one year later, in 2015, when it started going after more high-profile victims like financial institutions according to Group-IB and ESET researchers.\n "From August 2015 to February 2016 Buhtrap\xa0managed to conduct 13 successful attacks against\xa0Russian banks for a total amount of 1.8 billion\xa0rubles ($25.7 mln)," says a Group-IB report.\n The Windows\xa0zero-day exploited by Buhtrap\nESET\xa0researchers were able to observe how the hacker group\'s "toolset has been expanded with malware used to conduct espionage in Eastern Europe and Central Asia" in multiple\xa0targeted campaigns.\n Buhtrap\u200b\u200b\u200b\u200b\u200b\u200b\'s\xa0zero-day vulnerability exploit was used during June 2019 in an attack against a governmental institution and it is designed to\xa0abuse "a NULL pointer dereference in the win32k.sys component" on computers running older Windows versions.\n'
text = 'According to documents added to an amended complaint filed on January 17, a Russian intelligence-coordinated phishing attack allegedly targeted the Democratic National Committee (DNC) just a few days after the 2018\xa0midterms.\n Moreover, as detailed in the court documents, "On November 14, 2018, dozens of DNC email addresses were targeted in a spear-phishing campaign, although there is no evidence that the attack was successful."\n The documents were filed as part of a lawsuit against Russia\'s government, as well as\xa0the Trump campaign, for an alleged hack which led to a\xa0trove of internal DNC emails being stolen and disclosed during 2016.\n As revealed by the DNC, multiple links are connecting the actor behind the November phishing attack with a Russian hacker group known as\xa0Cozy Bear (also classified as APT29, Office Monkeys, CozyCar, The Dukes, CozyDuke, or Grizzly Steppe).\n Cozy Bear connected to attacks against U.S. targets in 2014\n Evidence found by Kaspersky\'s\xa0Kurt Baumgartner and\xa0CostiRaiu\xa0back in 2015 shows that Cozy Bear has been previously connected to attacks targeting both commercial and government entities from Germany, South Korea, Uzbekistan, and the USA, including the White House and the US State Department in 2014.\n'
# Preprocess the query text and encode every candidate span (max_span_width=4).
doc = handler.process_sentence(text)
span_tuples = enumerate_spans(doc.doc, max_span_width=4)
span_encodings = model.encode_spans(doc, span_tuples)

# Score each candidate span against the reference encoding ...
span_scores = dict(
    (span_tuple, model.encoding_sim_score(query_encoding, reference_encoding))
    for span_tuple, query_encoding in zip(span_tuples, span_encodings)
)
# ... and drop the spans whose score came out as NaN.
span_scores = {
    span_tuple: score
    for span_tuple, score in span_scores.items()
    if not torch.isnan(score)
}

# Rank the spans from highest to lowest score and key the result by the
# corresponding slice of the document so the output is readable.
ranked_spans = sorted(span_scores, key=span_scores.get, reverse=True)
sorted_span_scores = {doc[span[0]:span[1]]: span_scores[span] for span in ranked_spans}
sorted_span_scores

{Bear: tensor(0.7062, grad_fn=<MeanBackward0>),
 Bear: tensor(0.7062, grad_fn=<MeanBackward0>),
 
  Cozy Bear: tensor(0.7061, grad_fn=<MeanBackward0>),
 Cozy Bear: tensor(0.7061, grad_fn=<MeanBackward0>),
 
  Cozy: tensor(0.7061, grad_fn=<MeanBackward0>),
 Cozy: tensor(0.7061, grad_fn=<MeanBackward0>),
 Cozy Bear: tensor(0.7060, grad_fn=<MeanBackward0>),
 Cozy: tensor(0.7059, grad_fn=<MeanBackward0>),
 DNC: tensor(0.7055, grad_fn=<MeanBackward0>),
 the DNC: tensor(0.7051, grad_fn=<MeanBackward0>),
 DNC: tensor(0.7050, grad_fn=<MeanBackward0>),
 the: tensor(0.7037, grad_fn=<MeanBackward0>),
 DNC: tensor(0.7035, grad_fn=<MeanBackward0>),
 phishing: tensor(0.7025, grad_fn=<MeanBackward0>),
 phishing: tensor(0.7016, grad_fn=<MeanBackward0>),
 Bear: tensor(0.7014, grad_fn=<MeanBackward0>),
 Russian: tensor(0.6991, grad_fn=<MeanBackward0>),
  Cozy Bear: tensor(0.6990, grad_fn=<MeanBackward0>),
 Cozy Bear: tensor(0.6990, grad_fn=<MeanBackward0>),
  Cozy: tensor(0.6978, grad_fn=<MeanBackward0>

### Verifying that the cosine similarity is correct

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Encode one span from the reference document and one from the query document,
# then cross-check the similarity of their first rows with scikit-learn.
a = model.encode_span(ref_doc, (1, 2))
b = model.encode_span(doc, (23, 26))
first_rows = [encoding[[0], :].cpu().detach().numpy() for encoding in (a, b)]
cosine_similarity(*first_rows)

In [None]:
from torch.nn import Linear

In [94]:
# Square linear projection; 768 matches the last dimension of the span encoding
# `a` (a.shape is torch.Size([4, 768]) in a later cell's output).
proj = Linear(768,768)

In [95]:
proj(a)

tensor([[-0.0495, -0.6669,  1.2174,  ..., -1.2202, -0.8717,  0.1417],
        [-0.0501, -0.6667,  1.2201,  ..., -1.2154, -0.8732,  0.1476],
        [-0.0487, -0.6681,  1.2198,  ..., -1.2178, -0.8719,  0.1417],
        [-0.0429, -0.6773,  1.2194,  ..., -1.2138, -0.8789,  0.1387]],
       grad_fn=<AddmmBackward>)

In [88]:
torch.randn(128, 20).shape

torch.Size([128, 20])

In [93]:
# Shape probe: a Linear(20, 30) layer applied to a (128, 20) batch yields (128, 30).
m = torch.nn.Linear(20, 30)
m(torch.randn(128, 20)).shape

torch.Size([128, 30])

In [89]:
a.shape

torch.Size([4, 768])

In [59]:
import torch

def sim_matrix(a, b, eps=1e-8):
    """
    Pairwise cosine similarity between the rows of `a` and the rows of `b`.

    Every row is divided by its L2 norm, with the norm floored at `eps` so that
    all-zero rows produce zeros instead of NaNs. The result has one row per row
    of `a` and one column per row of `b`.

    Adapted from:
    https://stackoverflow.com/questions/50411191/how-to-compute-the-cosine-similarity-in-pytorch-for-all-rows-in-a-matrix-with-re
    """
    def _unit_rows(mat):
        # Per-row L2 norm, kept as a column so it broadcasts across the row.
        row_norms = mat.norm(dim=1, keepdim=True)
        floor = torch.full_like(row_norms, eps)
        return mat / torch.max(row_norms, floor)

    rows_a = _unit_rows(a)
    rows_b = _unit_rows(b)
    return torch.mm(rows_a, rows_b.transpose(0, 1))

sim_matrix(a,b)

tensor([[0.5674, 0.6196, 0.6027, 0.5693, 0.6706],
        [0.5965, 0.6589, 0.6245, 0.5827, 0.6389],
        [0.6002, 0.6513, 0.6368, 0.5948, 0.6508],
        [0.5797, 0.6124, 0.6188, 0.5760, 0.6335]], grad_fn=<MmBackward>)

In [22]:
# Two random (100, 120) matrices for experimenting with torch.mm.
# NOTE(review): this rebinds `a` and `b`, which other cells use for span encodings.
a,b = torch.rand(100,120), torch.rand(100,120)

In [25]:
torch.rand?

In [24]:
torch.mm(a, b.T)

tensor([[32.0635, 32.5763, 30.2766,  ..., 35.2818, 31.7165, 29.6526],
        [27.3492, 29.9627, 27.8327,  ..., 30.0390, 29.7116, 29.6280],
        [30.8267, 31.5046, 28.4552,  ..., 30.6816, 30.8437, 30.2798],
        ...,
        [27.9617, 29.6158, 25.4629,  ..., 27.6005, 27.5641, 27.2670],
        [31.6830, 31.5710, 29.7579,  ..., 33.2545, 32.4530, 31.2850],
        [29.2131, 31.7744, 28.9604,  ..., 34.6619, 32.3138, 29.6406]])

In [None]:
# NOTE(review): in the visible cells `a_norm` and `b_norm` are only assigned inside
# sim_matrix, so they would have to be defined at notebook scope before this cell can run.
torch.mm(a_norm, b_norm.transpose(0, 1))

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
import numpy as np
# Sanity check: a vector compared with its own negation should give a cosine similarity of -1.
cosine_similarity(np.array([[5,2,-1,-10]]), -1*np.array([[5,2,-1,-10]]))

array([[-1.]])