# Step1. Generate Data

## 抓 linux command

In [None]:
from bs4 import BeautifulSoup
import requests
import collections
import re

section_base_url = "https://man7.org/linux/man-pages/dir_section_{}.html"

# Extract the command names listed on a man7.org section index page
def get_commands_from_section(section):
    """Collect the command names linked from one man-page section index.

    Args:
        section: Section number substituted into ``section_base_url``.

    Returns:
        ``[posix_commands, gnu_commands, all_commands]`` where each item is a
        list of names with the ``(1p)``/``(1)``/``(8)`` suffix removed, the
        first two are sorted and the third is their concatenation; ``None``
        when the index page does not answer with HTTP 200.
    """
    url = section_base_url.format(section)
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to fetch section {section}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    posix_commands = []
    gnu_commands = []
    for entry in soup.find_all('td', valign="top"):
        for link in entry.find_all('a'):
            text = link.text
            # The section introductions are not commands
            if text in ("intro(1)", "intro(8)"):
                continue
            if "(1p)" in text:  # POSIX page
                posix_commands.append(text.replace("(1p)", ""))
            elif "(1)" in text:
                gnu_commands.append(text.replace("(1)", ""))
            elif "(8)" in text:
                gnu_commands.append(text.replace("(8)", ""))

    # If a command has both a POSIX page and a GNU page, keep only the GNU one.
    # A set makes the membership test O(1) instead of scanning a list.
    gnu_names = set(gnu_commands)
    posix_commands = sorted(name for name in posix_commands if name not in gnu_names)
    gnu_commands.sort()

    return [posix_commands, gnu_commands, posix_commands + gnu_commands]

section1_commands = get_commands_from_section(1)
section8_commands = get_commands_from_section(8)

# Drop from section 8 every command already listed in section 1
# (the original note names tcpdump as the case in point)
same_commands = list(set(section1_commands[2]) & set(section8_commands[2]))
section8_commands = [
    [name for name in bucket if name not in same_commands]
    for bucket in section8_commands
]

section1_total = len(section1_commands[2])
section8_total = len(section8_commands[2])
print(f"Section1: {section1_total}\n")
print(f"Section8: {section8_total}\n")
print(f"Section1 + Section8: {section1_total + section8_total}\n")

import json

man_page_posix_url = "https://man7.org/linux/man-pages/man{}/{}.{}p.html"
man_page_gnu_url = "https://man7.org/linux/man-pages/man{}/{}.{}.html"


def _extract_section_text(soup, anchor_id):
    """Return the stripped text of the first <pre> following the <a id=anchor_id> heading, or None."""
    anchor = soup.find('a', id=anchor_id)
    if anchor is None:
        return None
    pre = anchor.parent.find_next('pre')
    if pre is None:
        return None
    return pre.text.strip()


def _collapse_whitespace(text):
    """Turn every line break (with its indentation) and every run of spaces into one space."""
    # Deleting these runs outright would glue the last word of one line to
    # the first word of the next, so they are replaced by a single space.
    return re.sub(r"\s*\n\s*|[ ]{2,}", " ", text)


def get_command_description_from_command(section, command, is_posix,
                                         max_retries=5, timeout=30):
    """Download one man page and extract its NAME and DESCRIPTION text.

    Args:
        section: Manual section number used in the URL.
        command: Command name used in the URL.
        is_posix: Selects ``man_page_posix_url`` when true, otherwise
            ``man_page_gnu_url``.
        max_retries: Number of attempts made before giving up on
            connection-level errors.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        ``{'name': ..., 'description': ...}`` (either value is ``None`` when
        that part of the page is missing), or ``None`` when the page could
        not be fetched or did not answer with HTTP 200.
    """
    import time

    template = man_page_posix_url if is_posix else man_page_gnu_url
    url = template.format(section, command, section)

    response = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            break
        except requests.RequestException as err:
            # Bounded retry: a permanently unreachable host must not spin
            # forever, and KeyboardInterrupt must still stop the crawl.
            print(f"Request for {command} failed (attempt {attempt}/{max_retries}): {err}")
            time.sleep(1)

    if response is None:
        print(f"Failed to fetch command {command}. No response after {max_retries} attempts.")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch command {command}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    name = _extract_section_text(soup, "NAME")
    if name is not None:
        name = _collapse_whitespace(name)

    description = _extract_section_text(soup, "DESCRIPTION")
    if description is not None:
        # Backslashes become "//"; the em dash (U+2014) and the bullet
        # (U+2022) are mapped to ASCII so the dumped JSON stays readable.
        description = (
            description.replace("\\", "//")
            .replace("\u2014", "-")
            .replace("\u2022", ".")
        )
        description = _collapse_whitespace(description)

    return {'name': name, 'description': description}

def get_descriptions(section, commands, is_posix):
    """Fetch the man-page record of every command in *commands*.

    Each record returned by ``get_command_description_from_command`` is
    extended with its ``command`` and ``section``; commands for which that
    call returns ``None`` are left out.
    """
    records = []
    for command in commands:
        print(f'current fetch command: {command}\n')
        record = get_command_description_from_command(section, command, is_posix)
        if record is None:
            continue
        record['command'] = command
        record['section'] = section
        records.append(record)
    return records

# Crawl the POSIX and non-POSIX command lists of sections 1 and 8
res1 = get_descriptions(1, section1_commands[0], True)
res2 = get_descriptions(1, section1_commands[1], False)
res3 = get_descriptions(8, section8_commands[0], True)
res4 = get_descriptions(8, section8_commands[1], False)

res = res1 + res2 + res3 + res4
# Keep the serialized text under its own name: rebinding `json` would hide
# the json module from every statement that runs afterwards.
result_json = json.dumps(res, sort_keys=True, indent=4)
print(result_json)

with open("result.json", "w+") as f:
    f.write(result_json)

## 讀取檔案
* 把抓到的 command(result.json)放到 Google 雲端硬碟的 root
* 把 [github](https://github.com/mitre/cti/tree/master/enterprise-attack/x-mitre-tactic) 的 14 個 tactic.json 放到 Google 雲端硬碟 root/tactic

讀取 linux command

In [None]:
import os
import json
import pathlib
import pandas as pd
import re

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data files read below live there
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
dirPath = r"drive/MyDrive/"
# Read the crawled man-page records; the context manager closes the file
with open(dirPath + 'result.json') as f:
    data = json.load(f)

# Use pd.json_normalize to convert the JSON records to a DataFrame
commond = pd.json_normalize(data, meta=['command', 'description', 'name', 'section'])

# Select the columns by name instead of relabelling them by position, so a
# different key order in the JSON can never attach a label to the wrong data
commond = commond[['command', 'description', 'name', 'section']]

# Display the DataFrame
print(commond)
commond = pd.DataFrame(commond)

      command                                        description  \
0       admin  The admin utility shall create new SCCS files ...   
1       alias  The alias utility shall create or redefine ali...   
2         asa  The asa utility shall write its input files to...   
3          at  The at utility shall read commands from standa...   
4         awk  The awk utility shall execute programs written...   
...       ...                                                ...   
2626  yum2dnf                                               None   
2627    yumdb  This command is used to query and alter the yu...   
2628    zdump  The zdump program prints the current time in e...   
2629      zic  The zic program reads text from the file(s) na...   
2630  zramctl  zramctl is used to quickly set up zram device ...   

                                                   name  section  
0     admin — create and administer SCCS files (DEVE...        1  
1                     alias — define or display a

讀取 tactic

In [None]:
dirPath = r"drive/MyDrive/tactic"

# Collect [name, description] of the first object in every tactic JSON file
tmp = []
for f in os.listdir(dirPath):
  if pathlib.Path(f).suffix == ".json":
    # Close each file as soon as it is parsed instead of leaking the handle
    with open(os.path.join(dirPath, f)) as tactic_file:
      data = json.load(tactic_file)
    tmp.append([data['objects'][0]['name'], data['objects'][0]['description']])

# Passing the column names here also works when no JSON file was found
tatic = pd.DataFrame(tmp, columns=['name', 'description'])
print(tatic)

                    name                                        description
0      Credential Access  The adversary is trying to steal account names...
1              Execution  The adversary is trying to run malicious code....
2                 Impact  The adversary is trying to manipulate, interru...
3            Persistence  The adversary is trying to maintain their foot...
4   Privilege Escalation  The adversary is trying to gain higher-level p...
5       Lateral Movement  The adversary is trying to move through your e...
6        Defense Evasion  The adversary is trying to avoid being detecte...
7           Exfiltration  The adversary is trying to steal data.\n\nExfi...
8              Discovery  The adversary is trying to figure out your env...
9   Resource Development  The adversary is trying to establish resources...
10        Reconnaissance  The adversary is trying to gather information ...
11   Command and Control  The adversary is trying to communicate with co...
12        In

## 抓 Technique

In [None]:
import pandas as pd
url = "https://attack.mitre.org/techniques/enterprise/"

# Parse every HTML table on the page; the first one is used as the technique list
tables = pd.read_html(url)

technique = (
    pd.DataFrame(
        tables[0].rename(columns={"Description": "description", "Name": "name"})
    )
    .dropna()                  # discard rows that contain a missing value
    .reset_index(drop=True)    # renumber the remaining rows from 0
)
technique

Unnamed: 0,ID,ID.1,name,description
0,T1548,T1548,Abuse Elevation Control Mechanism,Adversaries may circumvent mechanisms designed...
1,T1134,T1134,Access Token Manipulation,Adversaries may modify access tokens to operat...
2,T1531,T1531,Account Access Removal,Adversaries may interrupt availability of syst...
3,T1087,T1087,Account Discovery,Adversaries may attempt to get a listing of va...
4,T1098,T1098,Account Manipulation,Adversaries may manipulate accounts to maintai...
...,...,...,...,...
196,T1497,T1497,Virtualization/Sandbox Evasion,Adversaries may employ various means to detect...
197,T1600,T1600,Weaken Encryption,Adversaries may compromise a network device’s ...
198,T1102,T1102,Web Service,"Adversaries may use an existing, legitimate ex..."
199,T1047,T1047,Windows Management Instrumentation,Adversaries may abuse Windows Management Instr...


## Technique map to Tactic

In [None]:
import pandas as pd
url = "https://attack.mitre.org/matrices/enterprise/"

# Parse every HTML table on the page and keep the first one as the matrix
tables = pd.read_html(url)
matrix = pd.DataFrame(tables[0])

# Step2. Pre-Processing(SnowballStemmer)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import numpy as np

# Let pandas print up to 30 columns before truncating the output
pd.options.display.max_columns = 30
# IPython magic (not plain Python): draw matplotlib figures inside the notebook
%matplotlib inline

In [None]:
def decontracted(phrase):
    """Expand English contractions and pad a few symbols with spaces.

    The substitutions run in the listed order: the two irregular forms come
    first so the generic ``n't`` rule does not mangle them.
    """
    substitutions = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        # spacing fixes
        (r"early access review", "early access review "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"/10", "/10 "),
        (r"10/", " 10/"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
# One shared English Snowball stemmer, reused by every call below
stemmer = SnowballStemmer("english")
def stemming_tokenizer(str_input):
  """Lower-case *str_input*, split it on whitespace, stem each token and re-join with spaces."""
  # NOTE(review): only runs of two or more non-letters are replaced by a space;
  # a single digit or punctuation mark stays inside its token - confirm this is intended.
  words = re.sub(r"[^a-zA-Z]{2,}", " ", str_input).lower().split()
  words = [stemmer.stem(word) for word in words]
  return " ".join(words)

In [None]:
def clean_reviews(lst):
    """Strip URLs and hashtags from each text and turn a stray "amp" into "and".

    Args:
        lst: Sequence (or array) of strings.

    Returns:
        A NumPy string array with the same shape as ``lst`` holding the
        cleaned texts; an empty input yields an empty array.
    """
    texts = np.asarray(lst, dtype=str)
    cleaned = []
    for text in texts.ravel():
        text = str(text)
        # remove URL links (httpxxx)
        text = re.sub(r"https?://[A-Za-z0-9./]*", "", text)
        # NOTE(review): this has always been a literal substring replacement,
        # not a regex, so it only touches the exact text "[^a-zA-Z]". It is
        # kept as-is: stripping every non-letter here would also delete the
        # apostrophes that decontracted() needs later in the pipeline.
        text = text.replace("[^a-zA-Z]", " ")
        # replace a standalone "amp" (left over from "&amp;") with "and";
        # the word boundaries keep words such as "example" or "campaign" intact
        text = re.sub(r"&?\bamp\b;?", "and", text)
        # remove hashtags
        text = re.sub(r"#[A-Za-z0-9]+", "", text)
        text = re.sub(r"#\w*", "", text)
        cleaned.append(text)
    return np.array(cleaned, dtype=str).reshape(texts.shape)
def remove_pattern(input_txt, pattern):
    """Return *input_txt* with every match of the regex *pattern* deleted.

    The substitution is done with the pattern itself. Re-using each matched
    substring as a new, unescaped regex breaks on text containing regex
    metacharacters and lets a short match eat part of a longer one.
    """
    return re.sub(pattern, '', input_txt)
def replace_pattern(input_txt, pattern, replace_text):
    """Return *input_txt* with every match of the regex *pattern* replaced by *replace_text*.

    The substitution is done with the pattern itself, so text that merely
    resembles a match when read as a regex (``a.b`` vs ``axb``) is untouched.
    """
    return re.sub(pattern, replace_text, input_txt)

In [None]:
# Run the full text pre-processing pipeline over a table's descriptions
def preprocessing(df):
    """Return the cleaned, lower-cased, ASCII-only, decontracted and stemmed
    ``description`` column of *df* as a list of strings."""
    cleaned = clean_reviews(list(df['description'].astype('str')))
    processed = []
    for text in cleaned:
        text = text.lower()
        # every non-ASCII character becomes a space
        text = ''.join(ch if ord(ch) < 128 else ' ' for ch in text)
        processed.append(stemming_tokenizer(decontracted(text)))
    return processed

In [None]:
# Pre-process the description column of the command, tactic and technique tables
fixedCommond = preprocessing(commond)
fixedTatic = preprocessing(tatic)
fixedTechnique = preprocessing(technique)

# Step3. Mapping (Bag of Words、TF-IDF)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
url = "https://attack.mitre.org/tactics/enterprise/"

# The first table on the page lists the tactics (ID and Name columns are used below)
tables = pd.read_html(url)
matrix = pd.DataFrame(tables[0]).dropna()
matrix.reset_index(drop=True, inplace=True)

# Start every known technique with an empty tactic list
dic = {name: [] for name in technique['name']}

url = "https://attack.mitre.org/tactics/"
for _, tactic_row in matrix.iterrows():
  # Each tactic has its own page whose first table lists its techniques
  tables2 = pd.read_html(url + tactic_row['ID'] + '/')
  matrix2 = pd.DataFrame(tables2[0]).dropna()
  matrix2.reset_index(drop=True, inplace=True)
  for technique_name in matrix2['Name']:
    # setdefault: a technique that is missing from `technique` is added
    # instead of aborting the whole crawl with a KeyError
    dic.setdefault(technique_name, []).append(tactic_row['Name'])

print(dic)
json2 = json.dumps(dic)
print(json2)

with open("mapTech2Tactic.json", "w+") as f:
  f.write(json2)

{'Abuse Elevation Control Mechanism': ['Privilege Escalation', 'Defense Evasion'], 'Access Token Manipulation': ['Privilege Escalation', 'Defense Evasion'], 'Account Access Removal': ['Impact'], 'Account Discovery': ['Discovery'], 'Account Manipulation': ['Persistence', 'Privilege Escalation'], 'Acquire Access': ['Resource Development'], 'Acquire Infrastructure': ['Resource Development'], 'Active Scanning': ['Reconnaissance'], 'Adversary-in-the-Middle': ['Credential Access', 'Collection'], 'Application Layer Protocol': ['Command and Control'], 'Application Window Discovery': ['Discovery'], 'Archive Collected Data': ['Collection'], 'Audio Capture': ['Collection'], 'Automated Collection': ['Collection'], 'Automated Exfiltration': ['Exfiltration'], 'BITS Jobs': ['Persistence', 'Defense Evasion'], 'Boot or Logon Autostart Execution': ['Persistence', 'Privilege Escalation'], 'Boot or Logon Initialization Scripts': ['Persistence', 'Privilege Escalation'], 'Browser Extensions': ['Persistence'

In [None]:
# Compute the document-to-document cosine similarity
def similarity(vec, x, y):
  """Map every row of *y* to the most similar row of *x*.

  Args:
    vec: Document-term matrix whose first ``len(x)`` rows belong to *x* and
      whose next ``len(y)`` rows belong to *y*.
    x: DataFrame of candidate targets with a ``name`` column.
    y: DataFrame of items to classify with a ``name`` column; the first
      space-separated token of each name is used as the result key.

  Returns:
    Dict from that key to the ``name`` of the best-scoring target, or ``''``
    when no target has a similarity above zero. The same mapping is printed
    as JSON and written to ``sample.json``.
  """
  n_targets = len(x)
  n_items = len(y)

  if n_targets and n_items:
    # One call scores all pairs; argmax returns the first maximum, which
    # matches a scan that only replaces the best score on a strict improvement
    scores = cosine_similarity(vec[n_targets:n_targets + n_items], vec[:n_targets])
    best_ids = scores.argmax(axis=1)
    best_scores = scores.max(axis=1)
  else:
    best_ids = [0] * n_items
    best_scores = [0] * n_items

  mapping = {}
  for i in range(n_items):
    # Read names from the `y` argument (callers pass `commond`) rather than
    # from the module-level table, so the function honours its parameters
    raw_name = y['name'].iloc[i]
    if isinstance(raw_name, str) and raw_name:
      key = raw_name.split(' ')[0]
    else:
      key = raw_name  # missing name: keep the raw value as the key
    if best_scores[i] > 0:
      mapping[key] = x['name'].iloc[int(best_ids[i])]
    else:
      mapping[key] = ''

  json2 = json.dumps(mapping)
  print(json2)

  with open("sample.json", "w") as f:
    f.write(json2)
  return mapping

## Bag of Words

In [None]:
# Word-level bag of words (English stop words removed) fitted on the tactic
# texts followed by the command texts, in that row order
count_vect = CountVectorizer(analyzer='word', stop_words = "english")
countdf_user_review = count_vect.fit_transform(fixedTatic + fixedCommond)

In [None]:
# Map every command to its most similar tactic using the bag-of-words matrix
x= similarity(countdf_user_review, tatic, commond)

{"admin": "Resource Development", "alias": "Discovery", "asa": "Command and Control", "at": "Initial Access", "awk": "Persistence", "batch": "Initial Access", "bc": "Collection", "bg": "Execution", "break": "Command and Control", "c99": "Execution", "cd": "Discovery", "cflow": "Collection", "colon": "Command and Control", "command": "Command and Control", "compress": "Persistence", "continue": "Initial Access", "ctags": "Resource Development", "cxref": "Collection", "delta": "Initial Access", "dot": "Command and Control", "ed": "Command and Control", "eval": "Command and Control", "ex": "Resource Development", "exec": "Command and Control", "exit": "Discovery", "export": "Resource Development", "fc": "Command and Control", "fg": "Discovery", "fort77": "Privilege Escalation", "gencat": "Collection", "get": "Privilege Escalation", "getconf": "Persistence", "getopts": "Discovery", "hash": "Command and Control", "jobs": "Discovery", "lex": "Persistence", "m4": "Exfiltration", "mailx": "Res

In [None]:
import re
# Ground truth: each line is "<command>:<tactic>[,<tactic>...]"; keep the first tactic
y = {}
with open("linux_commond.txt", "r") as f:
  for line in f:
    tac = line.strip().split(':')
    if len(tac) < 2:
      continue  # blank or malformed line
    # strip() removes the newline that would otherwise stay attached to a
    # single-tactic entry and make it compare unequal to every prediction
    y[tac[0]] = tac[1].split(',')[0].strip()

In [None]:
# Accuracy over the labelled commands that also received a prediction
total = 0
correct = 0
for word in y:
  if word not in x:
    continue  # no prediction for this command: not counted
  total += 1
  if y[word] == x[word]:
    correct += 1
# Guard against an empty overlap instead of dividing by zero
print(correct / total * 100 if total else 0.0)
print(total)

6.748466257668712
163


In [None]:
# Word-level bag of words fitted on the technique texts followed by the command texts
count_vect = CountVectorizer(analyzer='word', stop_words = "english")
countdf_user_review = count_vect.fit_transform(fixedTechnique + fixedCommond)
# Show the learned vocabulary, then the shape and the dense form of the matrix
print("All tags are:")
print(count_vect.get_feature_names_out())
print("Matrix looks like")
print(countdf_user_review.shape)
print(countdf_user_review.toarray())

All tags are:
['a0c' 'a2ec4e' 'a4ab' ... 'zxproxi' 'zz' 'zzthre']
Matrix looks like
(2832, 32727)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Map every command to its most similar technique using the bag-of-words matrix
x= similarity(countdf_user_review, technique, commond)

{"admin": "Data from Local System", "alias": "Command and Scripting Interpreter", "asa": "File and Directory Permissions Modification", "at": "File and Directory Permissions Modification", "awk": "Indirect Command Execution", "batch": "File and Directory Permissions Modification", "bc": "Input Capture", "bg": "BITS Jobs", "break": "Indirect Command Execution", "c99": "Data from Local System", "cd": "File and Directory Permissions Modification", "cflow": "Data from Local System", "colon": "Indirect Command Execution", "command": "Indirect Command Execution", "compress": "Direct Volume Access", "continue": "Trusted Developer Utilities Proxy Execution", "ctags": "Trusted Developer Utilities Proxy Execution", "cxref": "Container and Resource Discovery", "delta": "Data from Local System", "dot": "Data from Network Shared Drive", "ed": "Indirect Command Execution", "eval": "Indirect Command Execution", "ex": "Indirect Command Execution", "exec": "Command and Scripting Interpreter", "exit": "

In [None]:
# command -> technique, as written by the last similarity() call
with open('sample.json') as f:
  data1 = json.load(f)
# technique -> list of tactics
with open('mapTech2Tactic.json') as f:
  data2 = json.load(f)

# Keep only the commands whose technique has a tactic entry (an unmatched
# command carries '' as its technique and is dropped here)
data3 = {word: data2[tech] for word, tech in data1.items() if tech in data2}
x = data3
print(x)

{'admin': ['Collection'], 'alias': ['Execution'], 'asa': ['Defense Evasion'], 'at': ['Defense Evasion'], 'awk': ['Defense Evasion'], 'batch': ['Defense Evasion'], 'bc': ['Credential Access', 'Collection'], 'bg': ['Persistence', 'Defense Evasion'], 'break': ['Defense Evasion'], 'c99': ['Collection'], 'cd': ['Defense Evasion'], 'cflow': ['Collection'], 'colon': ['Defense Evasion'], 'command': ['Defense Evasion'], 'compress': ['Defense Evasion'], 'continue': ['Defense Evasion'], 'ctags': ['Defense Evasion'], 'cxref': ['Discovery'], 'delta': ['Collection'], 'dot': ['Collection'], 'ed': ['Defense Evasion'], 'eval': ['Defense Evasion'], 'ex': ['Defense Evasion'], 'exec': ['Execution'], 'exit': ['Defense Evasion'], 'export': ['Discovery'], 'fc': ['Defense Evasion'], 'fg': ['Persistence', 'Defense Evasion'], 'fort77': ['Collection'], 'gencat': ['Impact'], 'get': ['Defense Evasion'], 'getconf': ['Defense Evasion'], 'getopts': ['Execution'], 'hash': ['Defense Evasion'], 'jobs': ['Privilege Escal

In [None]:
# A command counts as correct when its labelled tactic is among the tactics
# of the predicted technique
total = 0
correct = 0
y_true = []
y_pred = []
for word in y:
  if word not in x:
    continue
  predicted = x[word]
  total += 1
  y_true.append(y[word])
  if y[word] in predicted:
    # counted once per command, even if the tactic list repeats an entry
    correct += 1
    y_pred.append(y[word])
  else:
    print(word, predicted, data1[word])
    # record one label per command so y_true and y_pred stay aligned
    y_pred.append(predicted[0] if predicted else 'null')
print(correct / total * 100 if total else 0.0)
print(total)

chown ['Discovery'] Permission Groups Discovery
cmp ['Defense Evasion'] Plist File Modification
cut ['Impact'] Data Destruction
ln ['Execution'] User Execution
less ['Defense Evasion'] Indirect Command Execution
locate ['Collection'] Data from Local System
lsattr ['Defense Evasion'] File and Directory Permissions Modification
mktemp ['Defense Evasion'] File and Directory Permissions Modification
mv ['Collection'] Data from Local System
od ['Exfiltration'] Data Transfer Size Limits
paste ['Collection'] Data from Local System
patch ['Collection'] Data from Local System
split ['Defense Evasion'] Plist File Modification
tee ['Defense Evasion'] File and Directory Permissions Modification
touch ['Discovery'] System Time Discovery
umask ['Lateral Movement'] Lateral Tool Transfer
cp ['Collection'] Data from Local System
scp ['Collection'] Data from Network Shared Drive
awk ['Defense Evasion'] Indirect Command Execution
read ['Defense Evasion'] Indirect Command Execution
col ['Command and Contr

## BoW for N-gram

In [None]:
# Count Vectorizer for N-grams
# Bag of words over 2- and 3-grams, fitted on the tactic texts followed by the command texts
count_vect2 = CountVectorizer(analyzer='word', ngram_range=(2,3), stop_words = "english")
countdf_user_review2= count_vect2.fit_transform(fixedTatic + fixedCommond)

In [None]:
# Map every command to its most similar tactic using the n-gram bag-of-words matrix
x = similarity(countdf_user_review2, tatic, commond)

In [None]:
import re
# Ground truth: each line is "<command>:<tactic>[,<tactic>...]"; keep the first tactic
y = {}
with open("linux_commond.txt", "r") as f:
  for line in f:
    tac = line.strip().split(':')
    if len(tac) < 2:
      continue  # blank or malformed line
    # strip() keeps a trailing newline out of single-tactic labels
    y[tac[0]] = tac[1].split(',')[0].strip()

In [None]:
# Accuracy over the labelled commands that also received a prediction
total = 0
correct = 0
for word in y:
  if word not in x:
    continue  # no prediction for this command: not counted
  total += 1
  if y[word] == x[word]:
    correct += 1
# Guard against an empty overlap instead of dividing by zero
print(correct / total * 100 if total else 0.0)
print(total)

0.0
163


In [None]:
# Count Vectorizer for N-grams
# Bag of words over 2- and 3-grams, fitted on the technique texts followed by the command texts
count_vect2 = CountVectorizer(analyzer='word', ngram_range=(2,3), stop_words = "english")
countdf_user_review2= count_vect2.fit_transform(fixedTechnique + fixedCommond)

In [None]:
# Map every command to its most similar technique using the n-gram bag-of-words matrix
x = similarity(countdf_user_review2, technique, commond)

{"admin": "Reflective Code Loading", "alias": "Data Staged", "asa": "", "at": "Inhibit System Recovery", "awk": "Boot or Logon Autostart Execution", "batch": "Inhibit System Recovery", "bc": "", "bg": "Rogue Domain Controller", "break": "Indirect Command Execution", "c99": "Obfuscated Files or Information", "cd": "Remote Services", "cflow": "Data from Local System", "colon": "Communication Through Removable Media", "command": "Dynamic Resolution", "compress": "", "continue": "", "ctags": "Data from Local System", "cxref": "", "delta": "Hide Artifacts", "dot": "Container Administration Command", "ed": "", "eval": "", "ex": "", "exec": "Traffic Signaling", "exit": "Pre-OS Boot", "export": "Container Administration Command", "fc": "System Time Discovery", "fg": "Rogue Domain Controller", "fort77": "Obfuscated Files or Information", "gencat": "Exfiltration Over Web Service", "get": "", "getconf": "", "getopts": "", "hash": "", "jobs": "", "lex": "", "m4": "", "mailx": "User Execution", "pa

In [None]:
# command -> technique, as written by the last similarity() call
with open('sample.json') as f:
  data1 = json.load(f)
# technique -> list of tactics
with open('mapTech2Tactic.json') as f:
  data2 = json.load(f)

# Replace each technique by its tactic list in place; a command whose
# technique has no entry (for example '') keeps its original string value
for word in data1:
  technique_name = data1[word]
  if technique_name in data2:
    data1[word] = data2[technique_name]
x2 = data1

In [None]:
# A command counts as correct when its labelled tactic is among the tactics
# of the predicted technique
total = 0
correct = 0
y_true = []
y_pred = []
for word in y:
  if word not in x2:
    continue
  # x2 holds a tactic list for mapped commands and a plain string (possibly
  # empty) for commands whose technique had no tactic entry
  predicted = x2[word] if isinstance(x2[word], list) else []
  total += 1
  y_true.append(y[word])
  if y[word] in predicted:
    # counted once per command, even if the tactic list repeats an entry
    correct += 1
    y_pred.append(y[word])
  else:
    # always record a label so y_true and y_pred stay aligned
    y_pred.append(predicted[0] if predicted else 'null')
print(correct / total * 100 if total else 0.0)
print(total)

11.65644171779141
163


## TF-IDF

In [None]:
# Word level Tf-Idf
# Word-level TF-IDF fitted on the tactic texts followed by the command texts
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words = "english")
tfidf_user_review = tfidf_vect.fit_transform(fixedTatic + fixedCommond)

In [None]:
# Map every command to its most similar tactic using the TF-IDF matrix
x = similarity(tfidf_user_review, tatic, commond)

{"admin": "Privilege Escalation", "alias": "Discovery", "asa": "Command and Control", "at": "Lateral Movement", "awk": "Persistence", "batch": "Initial Access", "bc": "Collection", "bg": "Discovery", "break": "Discovery", "c99": "Execution", "cd": "Discovery", "cflow": "Collection", "colon": "Command and Control", "command": "Command and Control", "compress": "Persistence", "continue": "Initial Access", "ctags": "Collection", "cxref": "Collection", "delta": "Persistence", "dot": "Command and Control", "ed": "Discovery", "eval": "Command and Control", "ex": "Discovery", "exec": "Command and Control", "exit": "Discovery", "export": "Execution", "fc": "Privilege Escalation", "fg": "Discovery", "fort77": "Privilege Escalation", "gencat": "Collection", "get": "Reconnaissance", "getconf": "Privilege Escalation", "getopts": "Command and Control", "hash": "Command and Control", "jobs": "Discovery", "lex": "Execution", "m4": "Exfiltration", "mailx": "Collection", "pax": "Exfiltration", "prs": "

In [None]:
import re
# Ground truth: each line is "<command>:<tactic>[,<tactic>...]"; keep the first tactic
y = {}
with open("linux_commond.txt", "r") as f:
  for line in f:
    tac = line.strip().split(':')
    if len(tac) < 2:
      continue  # blank or malformed line
    # strip() keeps a trailing newline out of single-tactic labels
    y[tac[0]] = tac[1].split(',')[0].strip()

In [None]:
# Accuracy over the labelled commands that also received a prediction
total = 0
correct = 0
for word in y:
  if word not in x:
    continue  # no prediction for this command: not counted
  total += 1
  if y[word] == x[word]:
    correct += 1
# Guard against an empty overlap instead of dividing by zero
print(correct / total * 100 if total else 0.0)
print(total)

15.337423312883436
163


In [None]:
# Word level Tf-Idf
# Word-level TF-IDF fitted on the technique texts followed by the command texts
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words = "english")
tfidf_user_review = tfidf_vect.fit_transform(fixedTechnique + fixedCommond)

In [None]:
# Map every command to its most similar technique using the TF-IDF matrix
x = similarity(tfidf_user_review, technique, commond)

{"admin": "File and Directory Permissions Modification", "alias": "Command and Scripting Interpreter", "asa": "File and Directory Permissions Modification", "at": "BITS Jobs", "awk": "Indirect Command Execution", "batch": "File and Directory Permissions Modification", "bc": "Input Capture", "bg": "BITS Jobs", "break": "Indirect Command Execution", "c99": "Reflective Code Loading", "cd": "File and Directory Permissions Modification", "cflow": "Obfuscated Files or Information", "colon": "Indirect Command Execution", "command": "Indirect Command Execution", "compress": "Archive Collected Data", "continue": "Reflective Code Loading", "ctags": "Trusted Developer Utilities Proxy Execution", "cxref": "Data Staged", "delta": "Hardware Additions", "dot": "Command and Scripting Interpreter", "ed": "Indirect Command Execution", "eval": "Indirect Command Execution", "ex": "Gather Victim Host Information", "exec": "Command and Scripting Interpreter", "exit": "Indirect Command Execution", "export": 

In [None]:
# command -> technique, as written by the last similarity() call
with open('sample.json') as f:
  data1 = json.load(f)
# technique -> list of tactics
with open('mapTech2Tactic.json') as f:
  data2 = json.load(f)

# Keep only the commands whose technique has a tactic entry (an unmatched
# command carries '' as its technique and is dropped here)
data3 = {word: data2[tech] for word, tech in data1.items() if tech in data2}
x = data3
print(x)

{'admin': ['Defense Evasion'], 'alias': ['Execution'], 'asa': ['Defense Evasion'], 'at': ['Persistence', 'Defense Evasion'], 'awk': ['Defense Evasion'], 'batch': ['Defense Evasion'], 'bc': ['Credential Access', 'Collection'], 'bg': ['Persistence', 'Defense Evasion'], 'break': ['Defense Evasion'], 'c99': ['Defense Evasion'], 'cd': ['Defense Evasion'], 'cflow': ['Defense Evasion'], 'colon': ['Defense Evasion'], 'command': ['Defense Evasion'], 'compress': ['Collection'], 'continue': ['Defense Evasion'], 'ctags': ['Defense Evasion'], 'cxref': ['Collection'], 'delta': ['Initial Access'], 'dot': ['Execution'], 'ed': ['Defense Evasion'], 'eval': ['Defense Evasion'], 'ex': ['Reconnaissance'], 'exec': ['Execution'], 'exit': ['Defense Evasion'], 'export': ['Discovery'], 'fc': ['Defense Evasion'], 'fg': ['Persistence', 'Defense Evasion'], 'fort77': ['Defense Evasion'], 'gencat': ['Exfiltration'], 'get': ['Credential Access'], 'getconf': ['Defense Evasion'], 'getopts': ['Execution'], 'hash': ['Lat

In [None]:
# A command counts as correct when its labelled tactic is among the tactics
# of the predicted technique
total = 0
correct = 0
y_true = []
y_pred = []
for word in y:
  if word not in x:
    continue
  predicted = x[word]
  total += 1
  y_true.append(y[word])
  if y[word] in predicted:
    # counted once per command, even if the tactic list repeats an entry
    correct += 1
    y_pred.append(y[word])
  else:
    print(word, predicted, data1[word])
    # an empty tactic list still gets a label so y_true and y_pred stay aligned
    y_pred.append(predicted[0] if predicted else 'null')
print(correct / total * 100 if total else 0.0)
print(total)

cat ['Defense Evasion'] Direct Volume Access
chgrp ['Discovery'] Permission Groups Discovery
chmod ['Discovery'] Permission Groups Discovery
chown ['Discovery'] Permission Groups Discovery
cksum ['Command and Control'] Encrypted Channel
cmp ['Credential Access', 'Collection'] Input Capture
diff ['Defense Evasion'] File and Directory Permissions Modification
file ['Command and Control'] Data Encoding
find ['Defense Evasion'] Template Injection
indent ['Discovery'] Process Discovery
cut ['Reconnaissance'] Gather Victim Network Information
ln ['Execution'] User Execution
less ['Defense Evasion'] Indirect Command Execution
locate ['Exfiltration'] Scheduled Transfer
lsattr ['Defense Evasion'] File and Directory Permissions Modification
mktemp ['Defense Evasion'] Template Injection
more ['Collection'] Screen Capture
mv ['Exfiltration'] Transfer Data to Cloud Account
od ['Exfiltration'] Data Transfer Size Limits
paste ['Defense Evasion'] Direct Volume Access
patch ['Discovery'] System Informa

## Tf-Idf for N-grams

In [None]:
# Tf-Idf for N-grams
# TF-IDF over 2- and 3-grams, fitted on the tactic texts followed by the command texts
tfidf_vect2 = TfidfVectorizer(analyzer='word', ngram_range=(2,3), stop_words = "english")
tfidf_user_review2 = tfidf_vect2.fit_transform(fixedTatic + fixedCommond)

In [None]:
# Map every command to its most similar tactic using the n-gram TF-IDF matrix
x = similarity(tfidf_user_review2, tatic, commond)

{"admin": "", "alias": "", "asa": "", "at": "", "awk": "", "batch": "", "bc": "", "bg": "", "break": "", "c99": "", "cd": "", "cflow": "", "colon": "", "command": "", "compress": "", "continue": "", "ctags": "", "cxref": "", "delta": "", "dot": "", "ed": "", "eval": "", "ex": "", "exec": "", "exit": "", "export": "", "fc": "", "fg": "", "fort77": "", "gencat": "", "get": "", "getconf": "", "getopts": "", "hash": "", "jobs": "", "lex": "", "m4": "", "mailx": "", "pax": "", "prs": "", "qalter": "", "qdel": "", "qhold": "", "qmove": "", "qmsg": "", "qrerun": "", "qrls": "", "qselect": "", "qsig": "", "qstat": "", "qsub": "", "read": "", "readonly": "", "return": "", "rmdel": "", "sact": "", "sccs": "", "set": "", "sh": "", "shift": "", "talk": "Privilege Escalation", "times": "", "trap": "", "type": "", "ulimit": "", "umask": "", "unalias": "", "uncompress": "", "unget": "", "unset": "", "uucp": "Execution", "uudecode": "", "uuencode": "", "uustat": "", "uux": "", "val": "", "vi": "", "wa

In [None]:
import re
# Ground truth: each line is "<command>:<tactic>[,<tactic>...]"; keep the first tactic
y = {}
with open("linux_commond.txt", "r") as f:
  for line in f:
    tac = line.strip().split(':')
    if len(tac) < 2:
      continue  # blank or malformed line
    # strip() keeps a trailing newline out of single-tactic labels
    y[tac[0]] = tac[1].split(',')[0].strip()

In [None]:
# Accuracy over the labelled commands that also received a prediction
total = 0
correct = 0
for word in y:
  if word not in x:
    continue  # no prediction for this command: not counted
  total += 1
  if y[word] == x[word]:
    correct += 1
# Guard against an empty overlap instead of dividing by zero
print(correct / total * 100 if total else 0.0)
print(total)

0.0
163


In [None]:
# Tf-Idf for N-grams
# TF-IDF over 2- and 3-grams, fitted on the technique texts followed by the command texts
tfidf_vect2 = TfidfVectorizer(analyzer='word', ngram_range=(2,3), stop_words = "english")
tfidf_user_review2 = tfidf_vect2.fit_transform(fixedTechnique + fixedCommond)

In [None]:
# Use the n-gram matrix fitted in this section (tfidf_user_review2); passing
# the word-level tfidf_user_review here would just repeat the previous experiment
x = similarity(tfidf_user_review2, technique, commond)

{"admin": "Scheduled Task/Job", "alias": "Command and Scripting Interpreter", "asa": "File and Directory Permissions Modification", "at": "BITS Jobs", "awk": "Indirect Command Execution", "batch": "File and Directory Permissions Modification", "bc": "Input Capture", "bg": "BITS Jobs", "break": "Indirect Command Execution", "c99": "Reflective Code Loading", "cd": "File and Directory Permissions Modification", "cflow": "Obfuscated Files or Information", "colon": "Indirect Command Execution", "command": "Indirect Command Execution", "compress": "Archive Collected Data", "continue": "Reflective Code Loading", "ctags": "Trusted Developer Utilities Proxy Execution", "cxref": "Data Staged", "delta": "Hardware Additions", "dot": "Command and Scripting Interpreter", "ed": "Indirect Command Execution", "eval": "Indirect Command Execution", "ex": "Gather Victim Host Information", "exec": "Traffic Signaling", "exit": "Indirect Command Execution", "export": "System Time Discovery", "fc": "Indirect 

In [None]:
# command -> technique, as written by the last similarity() call
with open('sample.json') as f:
  data1 = json.load(f)
# technique -> list of tactics
with open('mapTech2Tactic.json') as f:
  data2 = json.load(f)

# Replace each technique by its tactic list in place; a command whose
# technique has no entry (for example '') keeps its original string value
for word in data1:
  technique_name = data1[word]
  if technique_name in data2:
    data1[word] = data2[technique_name]
x2 = data1

In [None]:
# A command counts as correct when its labelled tactic is among the tactics
# of the predicted technique
total = 0
correct = 0
y_true = []
y_pred = []
print(y)
for word in y:
  if word in x2:
    # x2 holds a tactic list for mapped commands and a plain string for the
    # rest; a string carries no tactic prediction
    predicted = x2[word] if isinstance(x2[word], list) else []
    if y[word] in predicted:
      # counted once per command, even if the tactic list repeats an entry
      correct += 1
    total += 1
# Guard against an empty overlap instead of dividing by zero
print(correct / total * 100 if total else 0.0)
print(total)

{'cat': 'Discovery', 'chattr': 'Privilege Escalation', 'chgrp': 'Privilege Escalation', 'chmod': 'Privilege Escalation', 'chown': 'Privilege Escalation', 'cksum': 'Discovery', 'cmp': 'Discovery', 'diff': 'Discovery', 'diffstat': 'Discovery', 'file': 'Discovery', 'find': 'Discovery', 'git': 'Discovery', 'gitview': 'Discovery', 'indent': 'Impact', 'cut': 'Discovery', 'ln': 'Defense Evasion', 'less': 'Discovery', 'locate': 'Discovery', 'lsattr': 'Discovery', 'mattrib': 'Discovery', 'mc': 'Discovery', 'mdel': 'Impact', 'mdir': 'Discovery', 'mktemp': 'Lateral Movement', 'more': 'Discovery', 'mmove': 'Lateral Movement', 'mread': 'Lateral Movement', 'mren': 'Lateral Movement', 'mtools': 'Impact', 'mtoolstest': 'Discovery', 'mv': 'Lateral Movement', 'od': 'Discovery', 'paste': 'Lateral Movement', 'patch': 'Impact', 'rcp': 'Exfiltration', 'rm': 'Impact', 'slocate': 'Collection', 'split': 'Impact', 'tee': 'Exfiltration', 'tmpwatch': 'Impact', 'touch': 'Impact', 'umask': 'Collection', 'which': 'C