## Notebook Description

**Process**: 

1. Get 200 articles from the DB
2. Perform clustering analysis on these articles to identify those that are similar 
3. Manually look at these clusters to identify common traits that we can use for tagging 

IDEAS:

- Keyword tagging? It might be computationally expensive, but it could run in the background while we do other things
- Do we want to use a predefined set of tags or simply group similar articles?

## Imports


In [1]:
# For data importing and preprocessing
import pandas as pd
import json

# For clustering
from sklearn.cluster import KMeans      # k-means clustering (not to be confused with KNN, k-nearest neighbours)
from data_analysis.ClusteringTechniques import *

# For NER 
import spacy    # "pip install spacy" AND "python -m spacy download en_core_web_sm"

# For Rule Based Tagging
import re


# Custom classes
from FP_Classes.RSS_Feed import RSS_Article
from FP_Classes.RSS_DB_Connection import RSS_DB_Connection


In [1]:
from data_analysis.ClusteringTechniques import *
from FP_Classes.RSS_DB_Connection import RSS_DB_Connection
import json 
import re 

from FP_Classes.RSS_Feed import RSS_Feed, RSS_Article
from FP_Classes.Tag import Tag
from FP_Classes.Feeds.BleepingComputer import BleepingComputerRSS   # BleepingComputer
from FP_Classes.Feeds.Censys import CensysRSS                       # Censys (general)
from FP_Classes.Feeds.CensysDir import CensysDirRSS                 # Censys (director)
from FP_Classes.Feeds.DefenseDepartment import DefenseDeptRSS       # Department of Defense
from FP_Classes.Feeds.Microsoft import MicrosoftRSS                 # Microsoft 
from FP_Classes.Feeds.NationalVulnDatabase import NVD_RSS           # National Vulnerability Database
from FP_Classes.Feeds.NIST import NIST_RSS                          # NIST 
from FP_Classes.Feeds.StateDepartment import StateDeptRSS           # State Department (multiple feeds) - NOTE: This one takes a very long time so I removed it for now
from FP_Classes.Feeds.TheHackerNews import HackerNewsRSS            # Hacker News (news articles)


# Initialization of all variables
# Directory holding config.json and every file it points to.
configDir = "config/"                                   # Change if you changed the default file hierarchy 

# Read both JSON files inside `with` blocks so the file handles are closed
# as soon as parsing finishes instead of being left for the garbage collector.
with open(configDir + 'config.json') as config_file:
    config = json.load(config_file)
with open(configDir + config['db-creds-json-path']) as creds_file:
    db_creds = json.load(creds_file)

# Connection to the remote database, built from the stored credentials
dbConn:RSS_DB_Connection = RSS_DB_Connection(
                                username=db_creds['username'],
                                password=db_creds['password'],
                                host=db_creds['host']
                            )

# Update the remote DB with new tags if there are any.
# A truthy return value from newTagsFromExcel() is treated as success.
if dbConn.newTagsFromExcel(configDir + config['update-tags-filepath']):
    print("[+] NOTICE: Added new tags to remote database.")
else: 
    print("[+] ERROR: Failed to add new tags to the remote database. Moving on.")

# Initialize all Feed objects 
# NOTE(review): judging by the cell output, constructing a feed also downloads
# and preprocesses its articles, so this section can be slow - confirm.
bleepingComputerRss = BleepingComputerRSS()
#censysRss = CensysRSS()
#censysDirRss = CensysDirRSS()
defenseDeptRss = DefenseDeptRSS()
microsoftRss = MicrosoftRSS()
nvdRss = NVD_RSS()
nistRss = NIST_RSS()
#stateDeptRss = StateDeptRSS()      # Intentionally commented out - see above import statement
hackernewsRss = HackerNewsRSS()

# Create a list of all the RSS Feed objects 
allFeeds:list[RSS_Feed] = [
    bleepingComputerRss,
    #censysRss,
    #censysDirRss,
    defenseDeptRss,
    microsoftRss,
    nvdRss,
    nistRss,
    #stateDeptRss,  # Intentionally commented out - see above import statement
    hackernewsRss
]

# Every tag currently stored in the database
allTags:list[Tag] = dbConn.getAllTags()


NOTICE in RSS_DB_Connection.newTagsFromExcel(): called newTagsFromExcel() - beginning process.
NOTICE in RSS_DB_Connection.newTagsFromExcel(): excel sheet read and DB connection established successfully. Formatting query...
NOTICE in RSS_DB_Connection.newTagsFromExcel(): new tag queries formatted and executed successfully. Terminating connections and quitting.
SUCCESS.
[+] NOTICE: Added new tags to remote database.
[+] Initializing feed: BleepingComputer | https://www.bleepingcomputer.com/feed/
[+] INIT article "BleepingComputer - Amazon's AWS SSM agent can be used as post-exploitation RAT malware"
	[+] Getting article content...
	[+] Preprocessing content...

[+] INIT article "BleepingComputer - Why Every Security Practitioner Should Attend mWISE"
	[+] Getting article content...
	[+] Preprocessing content...

[+] INIT article "BleepingComputer - Hackers exploited Salesforce zero-day in Facebook phishing attack"
	[+] Getting article content...
	[+] Preprocessing content...

[+] INIT a

In [2]:
# Read the notebook configuration and the DB credentials it points to.
# `with` closes each file handle as soon as the JSON has been parsed.
with open('config/config.json') as config_file:
    config = json.load(config_file)
with open('config/' + config['db-creds-json-path']) as creds_file:
    db_creds = json.load(creds_file)

# Connection to the remote database, built from the stored credentials
dbConn:RSS_DB_Connection = RSS_DB_Connection(
                                username=db_creds['username'],
                                password=db_creds['password'],
                                host=db_creds['host']
                            )

# Pull the full tag list and the full article list used by the cells below
all_tags = dbConn.getAllTags()
all_articles = dbConn.getAllArticles()


#### Testing Gensim/LDA Clustering

In [None]:
# Build the LDA clustering helper over the articles loaded from the DB.
# NOTE(review): `limit` presumably caps how many articles are processed - confirm
# against LDA_Article_Clustering.
lda:LDA_Article_Clustering = LDA_Article_Clustering(
    all_articles,
    num_topics=20,
    limit=1000,
)

In [None]:
# Ask the LDA helper for its string rendering of all topic assignments
# and display it as the cell output.
s = lda.strAllTopicAssignments()
print(s)

In [None]:
# Concatenate the info string of every topic index (0 .. number of topics - 1)
# in a single pass, then display the combined text.
s2 = "".join(
    lda.strInfoForTopic(topic_idx)
    for topic_idx in range(len(lda.topics_dict.keys()))
)

print(s2)

### Testing NER (Named Entity Recognition)


*NOTE: not effective*

In [None]:
# Load a pre-trained spacy model
nlp_model = spacy.load('en_core_web_sm')

# Fetch the article body once and hand it straight to the preprocessing step.
# (Previously the content was fetched twice and the first result was discarded.)
article = all_articles[8]
tokens, content = article.__contentPreprocessing__(article.__getArticleContent__())

sample_article_text = content

#print(sample_article_text)

# Process the text 
doc = nlp_model(sample_article_text)

# Print every entity the model recognised together with its label
for entity in doc.ents:
    # Remove real line breaks so each entity prints on one line. The literal
    # backslash-n removal is kept because the original did it, in case the
    # preprocessed text contains escaped newlines.
    entity_text = entity.text.replace('\n', '').replace(r'\n', '')
    print("ENTITY: " + entity_text + f" | LABEL: {entity.label_}")

### Rule Based Tagging (keyword searching)

In [3]:
# Collect the literal strings to search for in article text:
#   - case-sensitive tags contribute their name exactly as stored
#   - all other tags contribute a lower-cased form followed by the stored form
candidate_names:list[str] = []

for t in all_tags: 
    if not t.caseSensitive: candidate_names.append(t.tagName.lower())
    candidate_names.append(t.tagName)

# Drop repeated entries (e.g. names that are already lower-case) while keeping
# first-seen order, so the alternation built from this list stays as small as
# possible without changing which names can match.
tag_names:list[str] = list(dict.fromkeys(candidate_names))

In [14]:
import re
def rule_based_tagging(text:str, tag_names:list[str]) -> list[str]: 
    """Return the distinct tag names that occur in ``text`` as whole terms.

    Args:
        text: The text to scan. Matching is case-sensitive, so the caller is
            responsible for supplying names in the casing it wants to find.
        tag_names: Literal names to look for. Empty strings and repeated
            names are ignored.

    Returns:
        Each matched name once, ordered by first occurrence in ``text``.
        An empty list when nothing matches or when there are no names.
    """
    # Names are literals, not patterns: escape them so characters such as
    # '+', '.', or '(' cannot break or alter the regular expression.
    alternatives = [re.escape(name) for name in dict.fromkeys(tag_names) if name]

    # An empty alternation would match the empty string at every position.
    if not alternatives:
        return []

    # Lookarounds behave like \b for names that start and end with a word
    # character, and also work for names that do not (e.g. "c++").
    tagPattern:re.Pattern = re.compile(r'(?<!\w)(' + "|".join(alternatives) + r')(?!\w)')

    # dict.fromkeys de-duplicates while preserving first-seen order, giving a
    # deterministic result (a set would not).
    return list(dict.fromkeys(tagPattern.findall(text)))

In [15]:

# Try the rule-based tagger on every article of the BleepingComputer feed:
# print the article's string summary, then the tag names found in its
# preprocessed content.
for testArticle in bleepingComputerRss.articles:
    print(testArticle.toString())
    # tag_names is the list of searchable tag strings built in an earlier cell
    foundTags = rule_based_tagging(testArticle.preprocessed_content, tag_names)
    print(foundTags)

BleepingComputer: Amazon's AWS SSM agent can be used as post-exploitation RAT malware
		Link: https://www.bleepingcomputer.com/news/security/amazons-aws-ssm-agent-can-be-used-as-post-exploitation-rat-malware/
		Publish Date: 2023-08-02
		Description: Amazon's AWS SSM agent can be used as post-exploitation RAT malware

['north korean', 'breach']
BleepingComputer: Why Every Security Practitioner Should Attend mWISE
		Link: https://www.bleepingcomputer.com/news/security/why-every-security-practitioner-should-attend-mwise/
		Publish Date: 2023-08-02
		Description: Why Every Security Practitioner Should Attend mWISE

[]
BleepingComputer: Hackers exploited Salesforce zero-day in Facebook phishing attack
		Link: https://www.bleepingcomputer.com/news/security/hackers-exploited-salesforce-zero-day-in-facebook-phishing-attack/
		Publish Date: 2023-08-02
		Description: Hackers exploited Salesforce zero-day in Facebook phishing attack

['hacked']
BleepingComputer: Hackers use new malware to breach