In [161]:
import os
from collections import Counter
from getpass import getpass
from typing import List, Set, Tuple

import requests
import spacy
from bs4 import BeautifulSoup
from tqdm import tqdm

In [78]:
#!pip install newspaper
import newspaper

In [12]:
# Small English spaCy pipeline; in this notebook it is only used to read the
# named entities (`.ents`) of an article, so the components listed in `disable`
# are switched off at load time.
ner_model = spacy.load('en_core_web_sm', 
                       disable=["tok2vec", "tagger", "parser", "attribute_ruler"])

In [18]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK data packages this notebook relies on. Presumably: 'punkt' for
# word_tokenize, 'wordnet' for WordNetLemmatizer and 'stopwords' for
# stopwords.words('english') -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet') 
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Module-level lemmatizer; threatAssessmentPipeline reads this global when it
# normalises article tokens.
lemmatizer = WordNetLemmatizer()

In [3]:
# Load the medium English pipeline used as the embedding model. The load must
# actually run: `nlp` is referenced below and by EMB_MODEL, and with the line
# commented out a fresh kernel fails with NameError.
nlp = spacy.load('en_core_web_md')

# Sanity check: embed one word and compare it with a related word.
doc = nlp("war")
# The cell's value is the similarity score between the two processed texts.
doc.similarity(nlp('genocide'))

0.5655733557541108

In [23]:
# Anchor words: an article token counts as a threat keyword when its embedding
# is similar enough to any one of these.
KEYWORDS = ['war', 'violence', 'attack', 'death', 'casualty', 'danger']
# Similarity cut-off for the comparison above.
# NOTE(review): the pipeline calls in this notebook never pass this constant;
# they rely on the function's own default of 0.6, which happens to be equal.
COS_SIM_THRESHOLD = 0.6

In [139]:
def threatAssessmentPipeline(url: str,
                             ner_model: spacy.lang.en.English,
                             emb_model: spacy.lang.en.English,
                             keyword_anchors: List[str],
                             stopwords: Set[str],
                             article_date: str = None,
                             cos_sim_threshold: float = 0.6,
                             threat_thresholds: Tuple[float, float] = (2.0, 5.0)):
  """Download a news article and summarise how threat-related it looks.

  The article text is run through `ner_model` to collect named entities and
  through `emb_model` to find tokens that are similar to any of the
  `keyword_anchors`. The raw threat score is the percentage of (filtered,
  lemmatized) tokens that are such keywords.

  Relies on the module-level names `newspaper`, `word_tokenize` and
  `lemmatizer`.

  Args:
    url: Address of the article to analyse.
    ner_model: spaCy pipeline whose `.ents` are used for entity extraction.
    emb_model: spaCy pipeline used to embed words and compare them.
    keyword_anchors: Words that define what counts as threat vocabulary.
    stopwords: Lower-case words to ignore; any iterable of strings works.
    article_date: Optional date, copied into the result under 'Article date'.
    cos_sim_threshold: A token is a keyword when its similarity to at least
      one anchor is strictly greater than this value.
    threat_thresholds: `(low, high)` bounds on the raw score; below `low` is
      'LOW THREAT', from `low` up to (not including) `high` is 'SOME THREAT',
      anything else is 'HIGH THREAT'.

  Returns:
    dict with the keys 'URL', 'Threat message', 'Raw threat rating',
    'Possible location', 'Possible dates', 'Possible times',
    'Possible actors', 'Keywords' and, when `article_date` is truthy,
    'Article date'. The date/time/actor/keyword entries are lists of strings
    when something was found and a '... NOT FOUND' style string otherwise.

  Raises:
    AssertionError: if `threat_thresholds` does not hold exactly two values.
    Any exception raised while downloading or parsing the article propagates.
  """
  # Validate the cheap arguments before any network or model work is done.
  assert len(threat_thresholds) == 2, 'Exactly two threat thresholds must be provided'
  low, high = threat_thresholds
  # Callers may pass a list; a set keeps the per-token membership test O(1).
  stopword_set = set(stopwords)

  # Step 1: Extract article text from URL
  article = newspaper.Article(url)
  article.download()
  article.parse()
  text = article.text

  # Step 2: Perform NER on text
  named_entities = ner_model(text).ents

  # Step 3: Count entity mentions, grouped by label (label -> Counter of texts)
  ent_dict = {}
  for ent in named_entities:
    ent_dict.setdefault(str(ent.label_), Counter())[str(ent.text)] += 1

  # Get location: only the most frequently mentioned one is reported.
  # NOTE(review): only the 'LOC' label is consulted. spaCy's English models
  # appear to tag countries and cities as 'GPE' -- confirm whether those
  # should be considered here as well.
  loc_counts = ent_dict.get('LOC')
  if loc_counts:
    location = loc_counts.most_common(1)[0][0]
  else:
    location = 'LOCATION NOT FOUND'

  def getMostLikelyEntities(ent_dict: dict,
                            ent_label: str,
                            threshold: int = 3):
    """Return up to `threshold` most frequent entities of `ent_label`.

    Falls back to a '<label> NOT FOUND' string for DATE/TIME and to
    'ACTORS NOT FOUND' for every other label.
    """
    counts = ent_dict.get(ent_label)
    if counts:
      # most_common keeps first-seen order among equally frequent entities
      return [name for name, _ in counts.most_common(threshold)]
    return f'{ent_label} NOT FOUND' if ent_label in {'DATE', 'TIME'} else 'ACTORS NOT FOUND'

  # Get dates, times, and actors
  candidate_dates = getMostLikelyEntities(ent_dict, 'DATE')
  candidate_times = getMostLikelyEntities(ent_dict, 'TIME')
  candidate_actors = getMostLikelyEntities(ent_dict, 'ORG')

  # Step 4: Pull out relevant keywords
  # Keep alphanumeric, non-stopword tokens and lemmatize them.
  preprocessed_text = [lemmatizer.lemmatize(tok)
                       for tok in word_tokenize(text)
                       if tok.isalnum() and tok.lower() not in stopword_set]
  text_length = len(preprocessed_text)
  freq_dict = Counter(preprocessed_text)
  # Embed keyword anchors
  keyword_embs = [emb_model(kw) for kw in keyword_anchors]
  # Mine relevant keywords from article. A list (not a set) is used so that
  # the order, and therefore tie-breaking further down, is deterministic.
  key_terms = []
  key_term_count = 0
  for word, count in freq_dict.items():
    word_emb = emb_model(word)
    if not word_emb.vector.any():  # an all-zero vector cannot be compared
      continue
    # A word is a keyword if it is similar enough to any one anchor
    if any(word_emb.similarity(kw_emb) > cos_sim_threshold for kw_emb in keyword_embs):
      key_terms.append(word)
      key_term_count += count

  # Percentage of tokens that are keywords. The local token count is used
  # (the previous code referenced an undefined global) and an article with no
  # usable tokens scores 0 instead of dividing by zero.
  raw_threat_score = (key_term_count / text_length) * 100 if text_length else 0.0
  if raw_threat_score < low:
    warning = 'LOW THREAT'
  elif raw_threat_score < high:
    warning = 'SOME THREAT'
  else:
    warning = 'HIGH THREAT'

  # Get list of most prevalent keywords (at most five, most frequent first)
  if key_terms:
    candidate_terms = sorted(key_terms, key=freq_dict.__getitem__, reverse=True)[:5]
  else:
    candidate_terms = 'NO RELEVANT KEYWORDS FOUND'

  # Step 5: Package it all into a dictionary
  res = {'URL': url,
         'Threat message': warning,
         'Raw threat rating': raw_threat_score,
         'Possible location': location,
         'Possible dates': candidate_dates,
         'Possible times': candidate_times,
         'Possible actors': candidate_actors,
         'Keywords': candidate_terms}
  if article_date:
    # Plain assignment; the previous `res['Article date': article_date]` was a
    # slice lookup that raised instead of storing the value.
    res['Article date'] = article_date

  return res

In [140]:
# Shared arguments for the pipeline runs below.

# Example article URLs (the Guardian one is kept as an alternative).
#TEST_URL = 'https://www.theguardian.com/world/2022/mar/01/ukraine-russia-civilians-missiles-kyiv-tv-tower'
TEST_URL = 'https://www.cnn.com/2022/03/03/europe/ukraine-kharkiv-civilian-strikes-intl-cmd/index.html'
NER_MODEL = ner_model
EMB_MODEL = nlp
KEYWORD_ANCHORS = KEYWORDS
# The pipeline declares `stopwords: Set[str]` and does one membership test per
# token, so hand it a set rather than the list NLTK returns.
STOPWORDS = set(stopwords.words('english'))
THREAT_THRESHOLDS = (2.0, 4.0)

In [142]:
# Run the pipeline on a negative example: an article that is not about the
# Russia-Ukraine war or any other conflict, so a low score is expected.
# cos_sim_threshold and threat_thresholds are not passed, so the function's
# defaults apply (COS_SIM_THRESHOLD / THREAT_THRESHOLDS above are not used here).
NEG_URL = 'https://www.cnn.com/2022/03/02/tech/apple-march-event/index.html'
threatAssessmentPipeline(url=NEG_URL,
                         ner_model=NER_MODEL,
                         emb_model=EMB_MODEL,
                         keyword_anchors=KEYWORD_ANCHORS,
                         stopwords=STOPWORDS)

{'Keywords': ['invasion'],
 'Possible actors': ['Apple', 'UFC', 'AAPL'],
 'Possible dates': ['July 09, 2021', 'the year', 'Wednesday'],
 'Possible location': 'LOCATION NOT FOUND',
 'Possible times': ['10:00 a.m. PT/1:00 p.m. ET'],
 'Raw threat rating': 0.11363636363636363,
 'Threat message': 'LOW THREAT',
 'URL': 'https://www.cnn.com/2022/03/02/tech/apple-march-event/index.html'}

In [144]:
try:
  from apiclient.discovery import build
except:
  !pip install google-api-python-client
  from apiclient.discovery import build

In [145]:
# Credentials must not live in the notebook: read the key from the
# GOOGLE_API_KEY environment variable, or prompt for it without echoing.
# Any key that was ever committed in plain text should be revoked and replaced.
API_KEY = os.environ.get('GOOGLE_API_KEY') or getpass('Google Custom Search API key: ')
SEARCH_ID = '21b36bd225e5f4360' # general search engine that searches the entire web
QUERY = 'Russia-Ukraine' # search query

In [146]:
# Build the client for version v1 of the 'customsearch' service and keep its
# cse() resource, which exposes the list() search call used below.
resource = build(serviceName='customsearch', 
                 version='v1',
                 developerKey=API_KEY).cse()

In [147]:
# Trial query against a single site to inspect the response format.
# siteSearchFilter='i' presumably means "include only results from siteSearch"
# -- confirm against the Custom Search JSON API reference.
result = resource.list(q=QUERY, 
                       cx=SEARCH_ID,
                       siteSearch='www.cnn.com',
                       siteSearchFilter='i').execute()

In [160]:
# Peek at one search hit: the article URL is read from the first metatags entry
# of its pagemap ('og:url'), the same path the crawl loop below uses.
# Index 8 is presumably just an arbitrary example hit.
result['items'][8]['pagemap']['metatags'][0]['og:url']

'https://www.cnn.com/travel/article/russia-ukraine-hurt-travel-recovery-cmd/index.html'

In [171]:
# News sites to search; each entry is passed as `siteSearch` in the crawl loop.
# NOTE(review): 'www.news.google.com' and 'www.news.yahoo.com' look unusual
# (a 'www.' prefix in front of an existing subdomain) -- confirm they return hits.
DOMAINS_TO_QUERY = ['www.cnn.com', 
                    'www.msn.com',
                    'www.foxnews.com',
                    'www.nytimes.com',
                    'www.news.google.com',
                    'www.washingtonpost.com',
                    'www.nypost.com',
                    'www.cnbc.com',
                    'www.news.yahoo.com',
                    'www.dailymail.co.uk',
                    'www.bbc.com',
                    'www.usatoday.com',
                    'www.people.com',
                    'www.theguardian.com',
                    'www.nbcnews.com',
                    'www.businessinsider.com',
                    'www.forbes.com',
                    'www.huffpost.com',
                    'www.usnews.com',
                    'www.thehill.com',
                    'www.bloomberg.com']

In [None]:
# Crawl every domain: search it for QUERY, then run the threat pipeline on each
# hit. The crawl is best-effort -- a failure skips only the domain or article
# it belongs to -- but every skip is reported instead of silently swallowed.
threat_assessments = [] # list of pipeline results
for dom in tqdm(DOMAINS_TO_QUERY):
  print()
  print(f'Querying {dom} . . .')
  # A failed search (quota, network, ...) must not abort the remaining domains.
  try:
    search_results = resource.list(q=QUERY,
                                   cx=SEARCH_ID,
                                   siteSearch=dom,
                                   siteSearchFilter='i').execute()
  except Exception as err:
    print(f'Search failed for {dom}: {err!r}')
    continue

  # The 'items' key is absent when the search returned no hits.
  search_items = search_results.get('items', [])
  if not search_items:
    print(f'No search results for {dom}')
    continue

  print('Looping over search results, analyzing articles . . .')
  for res in tqdm(search_items):
    # A hit without an og:url metatag is skipped on its own; previously it
    # aborted every remaining hit of the same domain.
    try:
      url = res['pagemap']['metatags'][0]['og:url']
    except (KeyError, IndexError, TypeError):
      print('Skipping a search result without an og:url metatag')
      continue
    try:
      threat_assessment = threatAssessmentPipeline(url=url,
                                                   ner_model=NER_MODEL,
                                                   emb_model=EMB_MODEL,
                                                   keyword_anchors=KEYWORD_ANCHORS,
                                                   stopwords=STOPWORDS)
    except Exception as err:
      # Download/parse/model errors are expected for some pages; report and move on.
      print(f'Skipping {url}: {err!r}')
      continue
    threat_assessments.append(threat_assessment)

In [178]:
import json
from google.colab import drive

In [179]:
# Mount Google Drive at /content/drive/ (Colab only); the results file written
# at the end of the notebook lives under this mount point.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Preview the first few assessments rather than dumping the whole list into the
# notebook output; the complete results are written to JSON below.
threat_assessments[:5]

In [177]:
# Summary of the crawl. Note the second number is how many domains were
# queried, not how many of them actually produced an assessment.
print(f'Returned {len(threat_assessments)} threat assessments from {len(DOMAINS_TO_QUERY)} domains')

Returned 148 threat assessments from 21 domains


In [182]:
# Persist the collected assessments as a single JSON document on the mounted Drive.
results_path = "/content/drive/MyDrive/hacktech2022/threat_assessments.json"
with open(results_path, "w") as results_file:
    results_file.write(json.dumps(threat_assessments))