# Step1. Generate Data

## 抓 Linux command

In [None]:
from bs4 import BeautifulSoup
import requests
import collections
import re

section_base_url = "https://man7.org/linux/man-pages/dir_section_{}.html"

# Scrape the command names listed on a man7.org section index page
def get_commands_from_section(section):
    """Collect the command names linked from a man7.org section index page.

    Every link found inside a ``<td valign="top">`` cell is inspected. Link
    texts containing ``(1p)`` are collected as POSIX commands; texts
    containing ``(1)`` or ``(8)`` are collected as GNU/Linux commands. The
    suffix is stripped from the stored name. ``intro(1)`` and ``intro(8)``
    are skipped.

    Args:
        section: Section number substituted into ``section_base_url``.

    Returns:
        ``[posix_commands, gnu_commands, all_commands]`` where the first two
        lists are sorted and the third is their concatenation, or ``None``
        (after printing a message) when the page does not answer with
        HTTP 200.
    """
    url = section_base_url.format(section)
    # A timeout keeps the scraper from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch section {section}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    posix_commands = []
    gnu_commands = []
    # Checked in order, so "(1p)" wins over the GNU suffixes.
    suffix_targets = (
        ("(1p)", posix_commands),
        ("(1)", gnu_commands),
        ("(8)", gnu_commands),
    )

    for cell in soup.find_all('td', valign="top"):
        for link in cell.find_all('a'):
            text = link.text
            if text in ("intro(1)", "intro(8)"):  # remove intro
                continue
            for suffix, bucket in suffix_targets:
                if suffix in text:
                    bucket.append(text.replace(suffix, ""))
                    break

    # If a command is in both the posix page and the gnu page, keep only the
    # gnu one. A set makes each membership test O(1).
    gnu_names = set(gnu_commands)
    posix_commands = sorted(name for name in posix_commands if name not in gnu_names)
    gnu_commands.sort()

    all_commands = posix_commands + gnu_commands

    return [posix_commands, gnu_commands, all_commands]

section1_commands = get_commands_from_section(1)
section8_commands = get_commands_from_section(8)

# Names listed in both sections (e.g. tcpdump) are kept in section 1 only, so
# they are filtered out of each of the section-8 lists.
same_commands = list(set(section1_commands[2]).intersection(section8_commands[2]))
for idx, names in enumerate(section8_commands):
    section8_commands[idx] = [name for name in names if name not in same_commands]

section1_total = len(section1_commands[2])
section8_total = len(section8_commands[2])
print(f"Section1: {section1_total}\n")
print(f"Section8: {section8_total}\n")
print(f"Section1 + Section8: {section1_total + section8_total}\n")

import json

# URL templates for a single man page. The three placeholders are filled, in
# order, with the section number, the command name and the section number
# again. The POSIX template has an extra "p" after the last placeholder.
man_page_posix_url = "https://man7.org/linux/man-pages/man{}/{}.{}p.html"
man_page_gnu_url = "https://man7.org/linux/man-pages/man{}/{}.{}.html"

def _section_pre_text(soup, anchor_id):
    """Return the stripped text of the first <pre> following the anchor, or None."""
    anchor = soup.find('a', id=anchor_id)
    if anchor is None:
        return None
    pre = anchor.parent.find_next('pre')
    if pre is None:
        return None
    return pre.text.strip()


def get_command_description_from_command(section, command, is_posix):
    """Fetch one man page and extract its NAME and DESCRIPTION text.

    Args:
        section: Section number used to build the page URL.
        command: Command name used to build the page URL.
        is_posix: When true, ``man_page_posix_url`` is used, otherwise
            ``man_page_gnu_url``.

    Returns:
        ``{'name': ..., 'description': ...}``; either value is ``None`` when
        the corresponding anchor (or the ``<pre>`` after it) is missing.
        ``None`` is returned, after printing a message, when the page cannot
        be fetched or does not answer with HTTP 200.
    """
    import time  # local import: only needed for the retry back-off

    max_attempts = 5
    template = man_page_posix_url if is_posix else man_page_gnu_url
    url = template.format(section, command, section)

    # Bounded retries on network errors only. The previous bare ``except``
    # retried forever and also swallowed KeyboardInterrupt.
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=30)
            break
        except requests.exceptions.RequestException as err:
            print(f"Request for command {command} failed "
                  f"(attempt {attempt}/{max_attempts}): {err}")
            if attempt < max_attempts:
                time.sleep(2 * attempt)

    if response is None:
        print(f"Failed to fetch command {command}. No response after {max_attempts} attempts.")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch command {command}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): the regex below deletes newlines and runs of 2+ spaces
    # without inserting a separator, so words split only by a line break end
    # up joined -- confirm this is intended.
    name = _section_pre_text(soup, "NAME")
    if name is not None:
        name = name.replace("\n", "")
        name = re.sub(r"([\n])|([\ ]{2,})", "", name)

    description = _section_pre_text(soup, "DESCRIPTION")
    if description is not None:
        # "\u2014" / "\u2022" must be real escapes: the old raw string
        # r"\u2014" looked for a literal backslash sequence and never
        # matched the em dash.
        description = (
            description.replace("\\", "//")
            .replace("\u2014", "-")
            .replace("\u2022", ".")
        )
        description = re.sub(r"([\n])|([\ ]{2,})", "", description)

    return {'name': name, 'description': description}

def get_descriptions(section, commands, is_posix):
    """Fetch the man-page entry of every command in *commands*.

    Each command is announced on stdout before it is fetched. Commands for
    which ``get_command_description_from_command`` returns ``None`` are left
    out; every other entry is extended with ``'command'`` and ``'section'``
    keys and collected in input order.
    """
    collected = []
    for command in commands:
        print(f'current fetch command: {command}\n')
        entry = get_command_description_from_command(section, command, is_posix)
        if entry is None:
            continue
        entry.update(command=command, section=section)
        collected.append(entry)
    return collected

# Fetch POSIX and GNU pages for sections 1 and 8
res1 = get_descriptions(1, section1_commands[0], True)
res2 = get_descriptions(1, section1_commands[1], False)
res3 = get_descriptions(8, section8_commands[0], True)
res4 = get_descriptions(8, section8_commands[1], False)

res = res1 + res2 + res3 + res4
# Keep the serialized text in its own name: assigning it to ``json`` would
# rebind the module name and break every later ``json.*`` call.
result_json = json.dumps(res, sort_keys=True, indent=4)
print(result_json)

# The context manager closes the file even if the write fails.
with open("result.json", "w") as f:
    f.write(result_json)

## 讀取檔案
* 把抓到的 command（result.json）放到 Google 雲端硬碟的 root
* 把 [github](https://github.com/mitre/cti/tree/master/enterprise-attack/x-mitre-tactic) 的 14 個 tactic JSON 檔放到 Google 雲端硬碟 root/tactic

讀取 Linux command

In [None]:
import os
import json
import pathlib
import pandas as pd
import re

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
dirPath = r"drive/MyDrive/"
# The context manager closes the file once the JSON has been parsed.
with open(dirPath + 'result.json') as f:
    data = json.load(f)

# Use pd.json_normalize to convert the JSON to a DataFrame
commond = pd.json_normalize(data, meta=['command', 'description', 'name', 'section'])

# Rename the columns for clarity
commond.columns = ['command', 'description', 'name', 'section']

# Display the DataFrame
print(commond)

# Drop rows with a missing value. This must operate on ``commond`` itself;
# the previous code assigned ``technique.dropna()`` here, a different frame
# that is only created in a later cell.
commond = commond.dropna()
commond.reset_index(drop=True, inplace=True)
commond

讀取 tactic

In [None]:
dirPath = r"drive/MyDrive/tactic"

# One [name, description] pair per *.json file, taken from the first entry
# of the file's 'objects' list.
tmp = []
# sorted(): os.listdir returns entries in arbitrary order, and the row order
# of ``tatic`` is later used as a positional index.
for f in sorted(os.listdir(dirPath)):
    if pathlib.Path(f).suffix == ".json":
        # Close every file as soon as it has been parsed.
        with open(os.path.join(dirPath, f)) as tactic_file:
            data = json.load(tactic_file)
        tmp.append([data['objects'][0]['name'], data['objects'][0]['description']])

# Passing ``columns`` to the constructor also works when no file was found.
tatic = pd.DataFrame(tmp, columns=['name', 'description'])
print(tatic)

## 抓 Technique

In [None]:
import pandas as pd
url = "https://attack.mitre.org/techniques/enterprise/"

# read_html returns one DataFrame per table on the page; the first is used.
tables = pd.read_html(url)

# Lower-case the two column names used later, drop incomplete rows and
# renumber the index from 0.
technique = (
    pd.DataFrame(tables[0])
    .rename(columns={"Description": "description", "Name": "name"})
    .dropna()
    .reset_index(drop=True)
)
technique

## Technique map to Tactic

In [None]:
import pandas as pd
url = "https://attack.mitre.org/matrices/enterprise/"

# read_html returns one DataFrame per table on the page; the first is kept.
tables = pd.read_html(url)
matrix = pd.DataFrame(tables[0])

# Step2. Pre-Processing(SnowballStemmer)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import numpy as np

# Let pandas show up to 30 columns when a DataFrame is displayed.
pd.options.display.max_columns = 30
# IPython magic (not plain Python): use matplotlib's inline backend.
%matplotlib inline

In [None]:
def decontracted(phrase):
    """Expand English contractions and pad a few symbols with spaces.

    The rules are applied one after another, in the listed order, each to
    the result of the previous one.
    """
    rewrite_rules = (
        # specific contractions, handled before the general "n't" rule
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general contractions
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        # spacing fixes
        (r"early access review", "early access review "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"/10", "/10 "),
        (r"10/", " 10/"),
    )
    for pattern, replacement in rewrite_rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
stemmer = SnowballStemmer("english")


def stemming_tokenizer(str_input):
    """Lower-case *str_input*, split it on whitespace and stem every token.

    Runs of two or more non-letter characters are first replaced by a single
    space. The stemmed tokens are returned joined by single spaces.
    """
    normalized = re.sub(r"[^a-zA-Z]{2,}", " ", str_input).lower()
    return " ".join(stemmer.stem(token) for token in normalized.split())

In [None]:
def clean_reviews(lst):
    """Strip URLs and hashtags from every text in *lst*.

    Args:
        lst: Sequence of strings.

    Returns:
        A NumPy array of cleaned strings with one entry per input text.
    """
    # np.vectorize cannot be called on size-0 input without explicit otypes.
    if len(lst) == 0:
        return np.array([], dtype=str)

    # remove URL links (httpxxx)
    lst = np.vectorize(remove_pattern)(lst, "https?://[A-Za-z0-9./]*")
    # np.char.replace is the public name of np.core.defchararray.replace.
    # It replaces the *literal* substring "[^a-zA-Z]", not a regex match.
    # NOTE(review): confirm whether stripping non-letters was intended here;
    # doing so would also remove the apostrophes decontracted() looks for.
    lst = np.char.replace(lst, "[^a-zA-Z]", " ")
    # replace "amp" with "and"
    # NOTE(review): the pattern has no word boundary, so "amp" inside longer
    # words is rewritten as well -- confirm.
    lst = np.vectorize(replace_pattern)(lst, "amp", "and")
    # remove hashtags (raw string avoids the invalid "\w" escape warning)
    lst = np.vectorize(remove_pattern)(lst, "#[A-Za-z0-9]+")
    lst = np.vectorize(remove_pattern)(lst, r"#[\w]*")
    return lst
def remove_pattern(input_txt, pattern):
    """Return *input_txt* with every match of the regex *pattern* removed.

    The pattern is applied directly. Re-using each matched text as a new
    regex (the previous approach) raised ``re.error`` on matches containing
    metacharacters and could leave residue when one match was a prefix of
    another.
    """
    return re.sub(pattern, '', input_txt)
def replace_pattern(input_txt, pattern, replace_text):
    """Return *input_txt* with every match of the regex *pattern* replaced.

    Each match is substituted exactly once by *replace_text*. The previous
    approach re-ran one substitution per match, using the matched text as a
    regex, which failed on metacharacters and substituted repeatedly when
    the replacement itself contained the pattern.
    """
    return re.sub(pattern, replace_text, input_txt)

In [None]:
# Show the description stored at row label 4 (notebook display only).
commond.loc[4, 'description']

In [None]:
# Turn the 'description' column of a DataFrame into cleaned, stemmed text
def preprocessing(df):
    """Return one normalized string per row of ``df['description']``.

    Each description is cast to ``str`` and passed through ``clean_reviews``.
    It is then lower-cased, every non-ASCII character is replaced by a space,
    and the result goes through ``decontracted`` and ``stemming_tokenizer``.
    """
    def to_ascii(text):
        # Characters with a code point of 128 or above become a space.
        return ''.join([ch if ord(ch) < 128 else ' ' for ch in text])

    cleaned = clean_reviews(list(df['description'].astype('str')))
    return [
        stemming_tokenizer(decontracted(to_ascii(raw.lower())))
        for raw in cleaned
    ]

In [None]:
# Show the description stored at row label 4 again (notebook display only).
commond.loc[4, 'description']

In [None]:
# Normalized description texts, one string per row of each frame.
fixedCommond = preprocessing(commond)
fixedTatic = preprocessing(tatic)
#fixedTechnique = preprocessing(technique)

# Step3. Mapping (Bag of Words、TF-IDF)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
url = "https://attack.mitre.org/tactics/enterprise/"

tables = pd.read_html(url)

# First table on the page, without incomplete rows, index renumbered from 0.
matrix = pd.DataFrame(tables[0]).dropna().reset_index(drop=True)

# technique name -> names of the tactics whose page lists that technique
dic = {name: [] for name in technique['name']}

url = "https://attack.mitre.org/tactics/"
for tactic_id, tactic_name in zip(matrix['ID'], matrix['Name']):
    # Each tactic has its own page; its first table is read the same way.
    tactic_table = pd.DataFrame(pd.read_html(url + tactic_id + '/')[0]).dropna()
    for technique_name in tactic_table['Name']:
        # setdefault: a name that appears on a tactic page but not in
        # ``technique`` gets its own entry instead of raising KeyError.
        dic.setdefault(technique_name, []).append(tactic_name)

print(dic)
json2 = json.dumps(dic)
print(json2)

# The context manager closes the file even if the write fails.
with open("mapTech2Tactic.json", "w") as f:
    f.write(json2)

In [None]:
# Compute the cosine similarity between documents
def similarity(vec):
    """Map every command to its most similar tactic and save the mapping.

    Args:
        vec: Document-term matrix whose first ``len(fixedTatic)`` rows are
            the tactic texts and whose remaining rows are the command texts,
            in the row order of ``commond`` (this is how the matrix is built
            from ``fixedTatic + fixedCommond``).

    Raises:
        ValueError: If the number of rows in *vec* does not equal the number
            of tactics plus the number of commands.

    Side effects:
        Prints one line per command, prints the mapping as JSON and writes
        it to ``sample.json``. A command whose best score is not positive is
        mapped to ``''``. Ties go to the first tactic with the best score.
    """
    n_tactics = len(fixedTatic)
    command_names = commond['name'].tolist()
    expected_rows = n_tactics + len(command_names)
    # Derive the row count from the data instead of a hard-coded total, and
    # fail loudly when the matrix and the frames do not line up.
    if vec.shape[0] != expected_rows:
        raise ValueError(
            f"expected {expected_rows} rows ({n_tactics} tactics + "
            f"{len(command_names)} commands), got {vec.shape[0]}"
        )

    # One call scores every command row against every tactic row:
    # scores[r, c] compares command r (matrix row n_tactics + r) with tactic c.
    scores = cosine_similarity(vec[n_tactics:], vec[:n_tactics])
    best_ids = scores.argmax(axis=1)

    mapping = {}
    for row, full_name in enumerate(command_names):
        # Key on the first space-separated token of the NAME text.
        key = full_name.split(' ')[0] if full_name else full_name
        best_id = best_ids[row]
        if scores[row, best_id] > 0:
            # Columns of ``scores`` index tactic rows, so the winner is
            # looked up in ``tatic`` (not in ``technique``).
            tactic_name = tatic.loc[best_id, 'name']
            print(key, ': is mapping to', tactic_name)
            mapping[key] = tactic_name
        else:
            print(key, ': is not mapping to any technique')
            mapping[key] = ''

    json2 = json.dumps(mapping)
    print(json2)

    # The context manager closes the file even if the write fails.
    with open("sample.json", "w") as f:
        f.write(json2)

## Bag of Words

In [None]:
# Bag-of-words counts over the tactic texts followed by the command texts,
# with the vectorizer's English stop-word list applied.
count_vect = CountVectorizer(analyzer='word', stop_words = "english")
countdf_user_review = count_vect.fit_transform(fixedTatic + fixedCommond)
print("All tags are:")
# Vocabulary learned by the vectorizer
print(count_vect.get_feature_names_out())
print("Matrix looks like")
print(countdf_user_review.shape)
# NOTE(review): toarray() builds the full dense matrix just to print it;
# this may be memory-heavy for a large vocabulary.
print(countdf_user_review.toarray())

In [None]:
# Run the mapping on the bag-of-words matrix.
similarity(countdf_user_review)

## BoW for N-gram

In [None]:
# Count Vectorizer for N-grams (2- and 3-word sequences), fitted on the
# tactic texts followed by the command texts
count_vect2 = CountVectorizer(analyzer='word', ngram_range=(2,3), stop_words = "english")
countdf_user_review2= count_vect2.fit_transform(fixedTatic + fixedCommond)
print("All tags are:")
# Vocabulary learned by the vectorizer
print(count_vect2.get_feature_names_out())
print("Matrix looks like")
print(countdf_user_review2.shape)
# NOTE(review): toarray() builds the full dense matrix just to print it;
# this may be memory-heavy for a large vocabulary.
print(countdf_user_review2.toarray())

In [None]:
# Run the mapping on the n-gram bag-of-words matrix.
similarity(countdf_user_review2)

## TF-IDF

In [None]:
# Word level Tf-Idf, fitted on the tactic texts followed by the command
# texts, with the vectorizer's English stop-word list applied
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words = "english")
tfidf_user_review = tfidf_vect.fit_transform(fixedTatic + fixedCommond)
print("All tags are:")
# Vocabulary learned by the vectorizer
print(tfidf_vect.get_feature_names_out())
print("Matrix looks like")
print(tfidf_user_review.shape)
# NOTE(review): toarray() builds the full dense matrix just to print it;
# this may be memory-heavy for a large vocabulary.
print(tfidf_user_review.toarray())

In [None]:
# Run the mapping on the word-level Tf-Idf matrix.
similarity(tfidf_user_review)

## Tf-Idf for N-grams

In [None]:
# Tf-Idf for N-grams (2- and 3-word sequences), fitted on the tactic texts
# followed by the command texts
tfidf_vect2 = TfidfVectorizer(analyzer='word', ngram_range=(2,3), stop_words = "english")
tfidf_user_review2 = tfidf_vect2.fit_transform(fixedTatic + fixedCommond)
print("All tags are:")
# Vocabulary learned by the vectorizer
print(tfidf_vect2.get_feature_names_out())
print("Matrix looks like")
print(tfidf_user_review2.shape)
# NOTE(review): toarray() builds the full dense matrix just to print it;
# this may be memory-heavy for a large vocabulary.
print(tfidf_user_review2.toarray())

In [None]:
# Run the mapping on the n-gram Tf-Idf matrix.
similarity(tfidf_user_review2)