In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
import nltk
import numpy as np
from tqdm import tqdm
import torch
import pickle
import os
from sentence_transformers import SentenceTransformer, util
# from allennlp.predictors.predictor import Predictor # Uncomment if performing SRL

nltk.download('punkt')
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

Using cuda device


[nltk_data] Downloading package punkt to /home/kiran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load News Narratives Corpus

In [2]:
# news_narratives path
news_filename = '../data/event_knowledge_2.0/news_narratives.txt'
doc_to_text = {}
last_doc_id = ''
with open(news_filename) as news_narratives:
  for line in news_narratives:
    if line == '\n':
      continue
    if line[:8] == '<doc_id>':
      doc_to_text[line[9:].replace('\n', '')] = ''
      last_doc_id = line[9:].replace('\n', '')
    if line[:6] == '<word>':
      doc_to_text[last_doc_id] += line[7:].replace('\n', ' ')
print("Corpus Size:", len(doc_to_text))
print("Example:", doc_to_text['WPB_ENG_20100127.0025.1:6'])

Corpus Size: 74589
Example: Within days of his inauguration , President Barack Obama signed executive orders to close the military prison at Guantanamo Bay within one year and to end torture in interrogation . He missed the Jan. 22 deadline to close Guantanamo but reaffirmed this month that he intends to close the prison as soon as possible . Obama has maintained other elements of the previous administration 's methods to capture and hold terrorism suspects . He has kept the military commission system to try certain terrorism suspects after strengthening evidentiary rules on behalf of defendants . He also preserved the authority to capture terrorism suspects in foreign countries , a practice known as extraordinary rendition . But he tightened the rules for where those captures can be made , limiting them to countries that do not have an effective rule of law . 


In [3]:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
corpus = []
corpusidx_to_doc = {}
idx = 0
for k,v in tqdm(doc_to_text.items()):
  sentences = sent_detector.tokenize(v)
  for s in sentences:
    corpus.append(s)
    corpusidx_to_doc[idx] = k
    idx += 1

100%|████████████████████████████████████████████████████████████████████████████████████████| 74589/74589 [00:05<00:00, 12922.84it/s]


In [4]:
def corpus_idx_to_idx_in_doc(corpus_idx):
    docid = corpusidx_to_doc[corpus_idx]
    i = corpus_idx - 1
    while i >= 0:
        if corpusidx_to_doc[i] != docid:
            return corpus_idx - i - 1
        i -= 1
    return -1

# TF-IDF Retrieval

In [None]:
def tokenize(text):
  # stemmer = PorterStemmer()
  tokens = nltk.word_tokenize(text)
  # stems = [stemmer.stem(item) for item in tokens]
  return tokens

t = "Israel Israeli airstrike airstrikes"
print(tokenize(t))

In [None]:
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
vectorizer.fit_transform(doc_to_text.values())

  'stop_words.' % sorted(inconsistent))


<74589x94942 sparse matrix of type '<class 'numpy.float64'>'
	with 2772920 stored elements in Compressed Sparse Row format>

In [None]:
# 2 min
doc_to_vector = {}
for k,v in doc_to_text.items():
  doc_to_vector[k] = vectorizer.transform([v])

In [None]:
def cosine_sim(a, b):
  a = np.transpose(a)
  a = np.asarray(a)
  a = np.squeeze(a)
  b = np.transpose(b)
  b = np.asarray(b)
  b = np.squeeze(b)
  cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
  return cos_sim

In [None]:
def query_corpus(query, k):
  query_vector = vectorizer.transform([query]).toarray()
  doc_to_score = {}
  for key,value in doc_to_vector.items():
    doc_to_score[key] = cosine_sim(query_vector, value.toarray())
  return sorted(doc_to_score, key=doc_to_score.get, reverse=True)[:k]

query_corpus('Israel Israeli airstrike airstrikes', 10)

['APW_ENG_20030908.0692.22:25',
 'APW_ENG_20030909.0105.30:33',
 'LTW_ENG_20060801.0084.3:6',
 'LTW_ENG_20060828.0002.339:342',
 'LTW_ENG_20061008.0024.142:147',
 'LTW_ENG_20081231.0117.5:10',
 'APW_ENG_20071127.0339.1:4',
 'APW_ENG_20020522.1105.18:21',
 'APW_ENG_20070803.0048.28:32',
 'NYT_ENG_20020306.0162.17:20']

# Load Sentence Transformer (and embeddings)

In [5]:
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('all-mpnet-base-v2')

In [9]:
# ~343k sentences over 75k passages
# Generate Embeddings
# corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)
# os.makedirs('../data/corpus_embeddings/news_narratives/', exist_ok=True)
# torch.save(corpus_embeddings, '../data/corpus_embeddings/news_narratives/mpnet_emb.pt')
# Load Embeddings
corpus_embeddings = torch.load('../data/corpus_embeddings/news_narratives/mpnet_emb.pt', map_location=torch.device('cpu'))

# Sentence Transformer Retrieval

In [None]:
def get_docs_for_corpusids(cid_and_score, query):
  docs = {}
  for c in cid_and_score:
    docid = corpusidx_to_doc[c['corpus_id']]
    # if docid in docs:
    #   docs[docid].append(c['score'])
    # else:
    if docid not in docs:
      docs[docid] = c['score']
  # for k, v in docs.items():
  #   if len(v) > 1:
  #     score = 0
  #     for i in range(len(v)):
  #       score += (1 / (i+1)) * score
  #     docs[k] = score
  #   else:
  #     docs[k] = v[0]
  x = sorted(docs, key=docs.get, reverse=True) #[:300]
  c = 1
  for d in x:
    s = ""
    for word in query.split(" "):
      if word in doc_to_text[d].split(" "):
        s += word + " "
    print(c, d, docs[d], s)
    c += 1
  docs_and_scores = []
  for d in x:
    docs_and_scores.append([d, docs[d]])
  return docs_and_scores


In [None]:
query = 'homes destroyed in wake of hurricane'
query_embeddings = model.encode(query, convert_to_tensor=True)
query_embeddings.to(device)
corpus_embeddings.to(device)
qs = util.semantic_search(query_embeddings, corpus_embeddings, top_k=2000)
d_and_s = get_docs_for_corpusids(qs[0], 'homes destroyed hurricane')

In [None]:
sents = []
doc_list = []
for d in d_and_s:
  for s in sent_detector.tokenize(doc_to_text[d[0]]):
    sents.append(s)
    doc_list.append(d[0])
# sents = sent_detector.tokenize(doc_to_text['WPB_ENG_20100127.0025.1:6'])
pairs = util.paraphrase_mining(model, sents, top_k=3)
for p in pairs:
  print(p[0])
  print(sents[p[1]], doc_list[p[1]])
  print(sents[p[2]], doc_list[p[2]])
  print()

In [None]:
def get_docs_for_sentids(sentids, k):
  docs = {}
  for s in sentids:
    docid = s.split('|')[0]
    if docid in docs:
      docs[docid] += 1
    else:
      docs[docid] = 1
  return sorted(docs, key=docs.get, reverse=True)[:k]

def query_corpus_emb(query, k):
  # query_vector = vectorizer.transform([query]).toarray()
  query_vector = model.encode(query)
  sentid_to_score = {}
  for key,value in tqdm(sentid_to_emb.items()):
    sentid_to_score[key] = util.cos_sim(torch.tensor(query_vector).to(device), torch.tensor(value).to(device))
  top_sentids = sorted(sentid_to_score, key=sentid_to_score.get, reverse=True)[:k]
  return get_docs_for_sentids(top_sentids, 5)

query_corpus_emb('Israel Israeli airstrike airstrikes', 25)

# Sentence Embedding Clustering

In [10]:
# Initial Round of Clustering to break up corpus (due to GPU constraints)
num_clusters = 10
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(sentence_id)
os.makedirs('../data/clustering/news_narratives/', exist_ok=True)
outfile = open('../data/clustering/news_narratives/clustered_news_narratives_10.pkl','wb')
pickle.dump(clustered_sentences, outfile)
outfile.close()

In [None]:
# debug
# clustered_sentences = pickle.load(open('../data/clustered_news_narratives_10.pkl', 'rb'))
# for i, cluster in enumerate(clustered_sentences):
#     print("Cluster ", i+1)
#     print(len(cluster))
#     # print(cluster)
#     print("")

In [None]:
# debug
# clusters = pickle.load(open('../data/communities_t7_min25/cluster_0_communities.pkl', 'rb'))
# for i, cluster in enumerate(clusters):
#     print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
#     for sentence_id in cluster[0:3]:
#         print("\t", corpusidx_to_doc[clustered_sentences[0][sentence_id]])
#     print("\t", "...")
#     for sentence_id in cluster[-3:]:
#         print("\t", corpusidx_to_doc[clustered_sentences[0][sentence_id]])

In [13]:
# get top_k words in a document
def get_keywords_for_doc(doc, top_k):
  vectorizer = TfidfVectorizer(stop_words='english', use_idf=False, ngram_range =(1, 2))
  doc_tf_vector = vectorizer.fit_transform([doc])
  doc_arr = doc_tf_vector.toarray()[0]
  ind = np.argpartition(doc_arr, -top_k)[-top_k:]
  top_words_scores = {}
  for i in ind:
    top_words_scores[vectorizer.get_feature_names()[i]] = doc_arr[i]
  return sorted(top_words_scores, key=top_words_scores.get, reverse=True)

In [14]:
# Returns set of documents based on sentences in cluster
def get_docs_in_cluster(cluster):
  docs = []
  for idx in cluster:
    docs.append(corpusidx_to_doc[clustered_sentences[i][idx]])
  return set(docs)

In [12]:
# Load initial clustering results
clustered_sentences = pickle.load(open('../data/clustering/news_narratives/clustered_news_narratives_10.pkl', 'rb'))

os.makedirs('../data/clustering/news_narratives/communities_t7_min25/', exist_ok=True)
# Perform further clustering on each K-Means cluster
for i, cluster in tqdm(enumerate(clustered_sentences)):
  embeddings = torch.index_select(corpus_embeddings, 0, torch.tensor(cluster))
  communitites = util.community_detection(embeddings, threshold=0.7, min_community_size=25)
  outfile = open('../data/clustering/news_narratives/communities_t7_min25/cluster_' + str(i) + '_communities.pkl', 'wb')
  pickle.dump(communitites, outfile)
  outfile.close()

10it [00:50,  5.02s/it]


In [15]:
# Generate Clustering Summary

# Load initial clustering results
clustered_sentences = pickle.load(open('../data/clustering/news_narratives/clustered_news_narratives_10.pkl', 'rb'))

with open('../data/clustering/news_narratives/news_narrative_clustering_results.txt', 'w') as f:
  print("News Narratives Clustering Results", file=f)
  print("Embedding Model = all-mpnet-base-v2", file=f)
  print("First clustered into 10 large subclusters via KNN, then clustered by cosine similarity (https://sbert.net/examples/applications/clustering/README.html#fast-clustering)", file=f)
  print("Threshold = 0.7, Min Community Size = 25", file=f)
  print("", file=f)
  for i, cluster in tqdm(enumerate(clustered_sentences)):
    print("==="*10, file=f)
    print("", file=f)
    print("Large Cluster", i, file=f)
    cluster_group = pickle.load(open('../data/clustering/news_narratives/communities_t7_min25/cluster_' + str(i) + '_communities.pkl', 'rb'))
    for j, sub_cluster in enumerate(cluster_group):
      overall_doc = ""
      for idx in sub_cluster:
        overall_doc += doc_to_text[corpusidx_to_doc[clustered_sentences[i][idx]]] + " "
      print("\t Cluster", j, "-", get_keywords_for_doc(overall_doc, 5), file=f)
      # print("\t\t Docs: ", get_docs_in_cluster(sub_cluster), file=f) # uncomment if you want to include list of documents in each cluster
    print("", file=f)

10it [00:01,  6.45it/s]


In [16]:
# Perform further clustering on a set of events using Agglomerative Clustering
def cluster_events(events, dist_thresh=0.4):
  event_embeddings = model.encode(events, convert_to_tensor=True)
  event_embeddings = event_embeddings.to('cpu')
  event_embeddings = event_embeddings /  np.linalg.norm(event_embeddings, axis=1, keepdims=True)
  clustering_model = AgglomerativeClustering(n_clusters=None, affinity='cosine', linkage='average', distance_threshold=dist_thresh)
  clustering_model.fit(event_embeddings)
  cluster_assignment = clustering_model.labels_
  clustered_sentences = {}
  for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []
    clustered_sentences[cluster_id].append(events[sentence_id])
  clustered_sentences = dict(sorted(clustered_sentences.items()))
  print(len(clustered_sentences), "clusters for", len(events), "sentences")
  print("---"*10)
  srl_events = 0
  sent_count = 0
  for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    srls = get_srl_for_sentences(cluster)
    sent_count += len(cluster)
    for j, c in enumerate(srls):
      print("\t", j, ":", c)
      srl_events += 1
    # for c in cluster:
    #   print(c)
  print(srl_events, "Total SRL Events from", sent_count , "sentences")
  return clustered_sentences

# SRL For Clusters

In [None]:
# Load SRL Model
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz", cuda_device=0)

Plugin allennlp_models could not be loaded: No module named 'nltk.translate.meteor_score'
downloading: 100%|##########| 405972254/405972254 [00:14<00:00, 28879132.73B/s]


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Returns SRL parses for list of sentences
def get_srl_for_sentences(sentences):
    sentence_srls = []
    for s in sentences:
        sentence_srls.append(predictor.predict(sentence=s))
    parsed_results = []
    for srl in sentence_srls:
        # print(srl)
        for parse in srl['verbs']:
            # Skip Parses w/o Arguments
            found_args = False
            for j, word in enumerate(srl['words']):
                if "ARG" in parse['tags'][j]:
                    found_args = True
            if not found_args:
                continue

            # Skip Parses based around reporting verbs
            reporting_verbs = ['said', 'says']
            if parse['verb'] in reporting_verbs:
                continue

            event = ""
            for j, word in enumerate(srl['words']):
                if "ARG" in parse['tags'][j] or 'V' in parse['tags'][j]:
                    event += word + " "
            event = event[:len(event)-1]
            # print(event)
            parsed_results.append(event)
    return parsed_results

In [None]:
def get_srl_for_cluster(cluster, cluster_num, subcluster_num):
  doc_srls = []
  for subcluster_idx in cluster[subcluster_num]:
    doc_id = corpusidx_to_doc[clustered_sentences[cluster_num][subcluster_idx]]
    text = doc_to_text[doc_id]
    sentences = sent_detector.tokenize(text)
    sentence_srls = []
    for s in sentences:
      sentence_srls.append(predictor.predict(sentence=s))
    # print(sentence_srls)
    doc_srls.append(sentence_srls)
  return doc_srls

In [17]:
def get_events_before_after_anchor_no_srl(cluster, cluster_num, subcluster_num, anchor):
  before = []
  after = []
  not_found_anchor = 0
  for subcluster_idx in cluster[subcluster_num]:

    corpus_idx = clustered_sentences[cluster_num][subcluster_idx]
    doc_id = corpusidx_to_doc[corpus_idx]
    text = doc_to_text[doc_id]
    sentences = sent_detector.tokenize(text)

    # Pull Anchor from Clustered Sentence
    anchor_sent_idx = corpus_idx_to_idx_in_doc(corpus_idx)
    if anchor_sent_idx == -1:
      print("Error finding idx of clustered sent in doc")
      continue
    # Make sure anchor sentence has anchor in it
    words = [w.lower() for w in sentences[anchor_sent_idx].split(" ")]
    if anchor not in words:
      not_found_anchor += 1
      continue
    
    for i, s in enumerate(sentences):
      # only use sentences in +/- 3 sentence window
      if i < anchor_sent_idx - 3 or i > anchor_sent_idx + 3:
        continue
      if i < anchor_sent_idx:
        before.append(s)
      elif i > anchor_sent_idx:
        after.append(s)

  return set(before), set(after)

In [None]:
# Given the SRL parses for all documents in a cluster determine anchor event via TF
def get_anchor_event(cluster_srls):
  doc = ""
  for doc_srl in cluster_srls:
    for s in doc_srl:
      # print(s)
      words_from_sent = set()
      for parse in s['verbs']:
        # print(parse)
        for i, word in enumerate(s['words']):
          if ("ARG" in parse['tags'][i] or "V" in parse['tags'][i]) and 'ARGM' not in parse['tags'][i] and len(word) > 2:
            words_from_sent.add(i)
      for w in words_from_sent:
        # print(s['words'][w])  
        doc += s['words'][w] + " "
  # print(get_keywords_for_doc(doc, 5))
  return get_keywords_for_doc(doc, 1)[0]

In [None]:
def get_events_before_after_anchor(cluster_srls, anchor_event):
  before = []
  after = []
  for doc_srl in cluster_srls:
    # print(doc_srl)
    # Find first sentence where anchor event is mentioned
    anchor_sent_idx = -1
    for i in range(len(doc_srl)):
      sentence = [w.lower() for w in doc_srl[i]['words']]
      if anchor_event in sentence:
        anchor_sent_idx = i
        print(sentence)
        break
    # print(anchor_sent_idx)
    # if anchor event not in this doc, skip pulling events from it
    if anchor_sent_idx == -1:
      continue
    for i in range(len(doc_srl)):
      # print(doc_srl[i]['words'])
      for parse in doc_srl[i]['verbs']:
        event = ""
        for j, word in enumerate(doc_srl[i]['words']):
          if "ARG" in parse['tags'][j] or 'V' in parse['tags'][j]:
            event += word + " "
        event = event[:len(event)-1]
        # Skip events with 2 words or less
        if len(event.split(" ")) < 3:
          continue
        if i < anchor_sent_idx:
          before.append(event)
        elif i > anchor_sent_idx:
          after.append(event)
  return set(before), set(after)

In [19]:
# Inspect Clustering Summary to get following information
cluster_num = 2
subcluster_num = 7
anchor = 'cancer'
# load initial clustering and second stage cluster
clustered_sentences = pickle.load(open('../data/clustering/news_narratives/clustered_news_narratives_10.pkl', 'rb'))
cluster = pickle.load(open('../data/clustering/news_narratives/communities_t7_min25/cluster_' + str(cluster_num) + '_communities.pkl', 'rb'))

# Uncomment to test w/ SRL
# cluster_srls = get_srl_for_cluster(cluster, cluster_num, subcluster_num)
# anchor = get_anchor_event(cluster_srls)
# events_before, events_after = get_events_before_after_anchor(cluster_srls, anchor)

# print("Events Before (SRL):")
# for e in events_before:
#   print(e)
# print("---"*10)

# print("Events After (SRL):")
# for e in events_after:
#   print(e)
# print("---"*10)

# Uncomment to test w/o SRL
before_no_srl, after_no_srl = get_events_before_after_anchor_no_srl(cluster, cluster_num, subcluster_num, anchor)

print("Events Before:")
for e in before_no_srl:
  print(e)
print("---"*10)

print("Events After:")
for e in after_no_srl:
  print(e)
print("---"*10)

Events Before:
In September , Murphy 's doctors told her the chemo was n't doing the job .
He worked on the line at a General Motors plant for 30 years .
In 1980 , Mankiller came down with myasthenia gravis , a muscle disease .
Nancy Turek , 73 , had breast cancer four years ago .
Johnson lives nearly 400 miles away in Newberry , a bucolic community in the state 's Upper Peninsula .
Patty Holtz developed non-Hodgkin 's lymphoma , a cancer of the lymph nodes , 15 years ago .
He showed her everything he wrote , because she was astute and honest .
She supervised content for public television programs on education .
Five years later , Ahern won the 1972 LPGA Championship , one of three career victories .
In 1979 , she was severely injured in a head-on traffic accident that killed the other driver , her best friend .
Their kids came first in their lives .
She had surgery , radiation and chemotherapy .
She is a retired first-grade teacher .
Susan Kristoff got her first vaccination in January

In [20]:
# Testing further clustering on the set of events we retrieved
before_cluster = cluster_events(list(before_no_srl))
after_cluster = cluster_events(list(after_no_srl))

17 clusters for 18 sentences
------------------------------
Cluster  1


NameError: name 'get_srl_for_sentences' is not defined