In [2]:
import os
import requests

def download_files_from_directory(directory_url, save_directory, headers=None):
    # Get the contents of the directory
    directory_contents = requests.get(directory_url, headers=headers).json()

    # Loop through each item in the directory contents
    for file_item in directory_contents:
        if file_item['type'] == 'file':
            # Download the file
            file_url = file_item['download_url']
            file_name = file_item['name']

            print(f"Downloading: {file_name}")

            response = requests.get(file_url, headers=headers)

            if response.status_code == 200:
                # Save the file
                with open(os.path.join(save_directory, file_name), 'wb') as file:
                    file.write(response.content)
                print(f"File saved to: {file_name}")
            else:
                print(f"Failed to download: {file_name}. Status code: {response.status_code}")
        elif file_item['type'] == 'dir':
            # Recursively download files from subdirectory
            subdir_url = file_item['url']
            subdir_name = file_item['name']
            subdir_save_directory = os.path.join(save_directory, subdir_name)

            print(f"Entering subdirectory: {subdir_name}")

            # Create subdirectory if it doesn't exist
            os.makedirs(subdir_save_directory, exist_ok=True)

            download_files_from_directory(subdir_url, subdir_save_directory, headers=headers)

            print(f"Exiting subdirectory: {subdir_name}")

# GitHub repository URL
repository_url = 'https://api.github.com/repos/Azure/Azure-Sentinel/contents/Hunting%20Queries/Microsoft%20365%20Defender'


headers = {'Authorization': 'Bearer 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'}

# Make a request to get the contents of the repository
response = requests.get(repository_url, headers=headers)

if response.status_code == 200:
    # Parse the JSON response
    contents = response.json()

    # Create a directory to save files
    save_directory = 'downloaded_files'
    os.makedirs(save_directory, exist_ok=True)

    # Loop through each item in the contents
    for item in contents:
        if item['type'] == 'file':
            # Download the file
            file_url = item['download_url']
            file_name = item['name']

            print(f"Downloading: {file_name}")

            response = requests.get(file_url, headers=headers)

            if response.status_code == 200:
                # Save the file
                with open(os.path.join(save_directory, file_name), 'wb') as file:
                    file.write(response.content)
                print(f"File saved to: {file_name}")
            else:
                print(f"Failed to download: {file_name}. Status code: {response.status_code}")
        elif item['type'] == 'dir':
            # Recursively download files from subdirectory
            subdir_url = item['url']
            subdir_name = item['name']
            subdir_save_directory = os.path.join(save_directory, subdir_name)

            print(f"Entering directory: {subdir_name}")

            # Create subdirectory if it doesn't exist
            os.makedirs(subdir_save_directory, exist_ok=True)

            download_files_from_directory(subdir_url, subdir_save_directory, headers=headers)

            print(f"Exiting directory: {subdir_name}")

else:
    print(f"Failed to access repository. Status code: {response.status_code}")


Entering directory: ASR rules
Downloading: ASR-rules-categorized-detection-graph.yaml
File saved to: ASR-rules-categorized-detection-graph.yaml
Exiting directory: ASR rules
Entering directory: Campaigns
Downloading: APT Baby Shark.yaml
File saved to: APT Baby Shark.yaml
Downloading: APT29 thinktanks.yaml
File saved to: APT29 thinktanks.yaml
Downloading: Abuse.ch Recent Threat Feed (1).yaml
File saved to: Abuse.ch Recent Threat Feed (1).yaml
Downloading: Abuse.ch Recent Threat Feed.yaml
File saved to: Abuse.ch Recent Threat Feed.yaml
Downloading: Abusing settingcontent-ms.yaml
File saved to: Abusing settingcontent-ms.yaml
Entering subdirectory: Bazacall
Downloading: Bazacall Emails.yaml
File saved to: Bazacall Emails.yaml
Downloading: Cobalt Strike Lateral Movement.yaml
File saved to: Cobalt Strike Lateral Movement.yaml
Downloading: Dropping payload via certutil.yaml
File saved to: Dropping payload via certutil.yaml
Downloading: Excel Macro Execution.yaml
File saved to: Excel Macro Exec

In [4]:
import os
import yaml
import shutil
import time

def parse_yaml_and_organize(directory_path, output_directory):
    for root, dirs, files in os.walk(directory_path):
        for file_name in files:
            if file_name.endswith(".yaml"):
                file_path = os.path.join(root, file_name)

                print(f"Parsing YAML file: {file_path}")

                with open(file_path, 'r', encoding='utf-8') as file:
                    try:
                        yaml_data = yaml.safe_load(file)

                        # Check if requiredDataConnectors is a list
                        required_data_connectors = yaml_data.get('requiredDataConnectors', [])
                        if not isinstance(required_data_connectors, list):
                            required_data_connectors = [required_data_connectors]

                        for connector in required_data_connectors:
                            data_types = connector.get('dataTypes', [])

                            for data_type in data_types:
                                data_type_folder = os.path.join(output_directory, data_type)

                                # Create a folder for the data type if it doesn't exist
                                os.makedirs(data_type_folder, exist_ok=True)

                                # Copy the YAML file to the corresponding data type folder
                                new_file_path = os.path.join(data_type_folder, file_name)

                                # Add a delay before attempting to copy the file
                                time.sleep(0.1)

                                shutil.copy(file_path, new_file_path)

                                print(f"Copied {file_name} to {data_type_folder}")

                    except yaml.YAMLError as e:
                        print(f"Error parsing YAML file {file_path}: {e}")

# Path to the directory where files are downloaded
downloaded_directory = 'downloaded_files'
# Output directory for organized files
output_directory = 'files_parsed_by_nahed'

# Ensure the output directory exists and set permissions
os.makedirs(output_directory, exist_ok=True)
os.chmod(output_directory, 0o777)

# Parse YAML files and organize based on data types
parse_yaml_and_organize(downloaded_directory, output_directory)


Parsing YAML file: downloaded_files\ASR rules\ASR-rules-categorized-detection-graph.yaml
Copied ASR-rules-categorized-detection-graph.yaml to files_parsed_by_nahed\DeviceEvents
Parsing YAML file: downloaded_files\Campaigns\Abuse.ch Recent Threat Feed (1).yaml
Copied Abuse.ch Recent Threat Feed (1).yaml to files_parsed_by_nahed\DeviceProcessEvents
Copied Abuse.ch Recent Threat Feed (1).yaml to files_parsed_by_nahed\DeviceFileEvents
Copied Abuse.ch Recent Threat Feed (1).yaml to files_parsed_by_nahed\DeviceImageLoadEvents
Parsing YAML file: downloaded_files\Campaigns\Abuse.ch Recent Threat Feed.yaml
Copied Abuse.ch Recent Threat Feed.yaml to files_parsed_by_nahed\DeviceProcessEvents
Copied Abuse.ch Recent Threat Feed.yaml to files_parsed_by_nahed\DeviceFileEvents
Copied Abuse.ch Recent Threat Feed.yaml to files_parsed_by_nahed\DeviceImageLoadEvents
Parsing YAML file: downloaded_files\Campaigns\Abusing settingcontent-ms.yaml
Copied Abusing settingcontent-ms.yaml to files_parsed_by_nahed\D

In [3]:
!pip install pymongo





[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['myGithubBase']  # Replace with your MongoDB database name
collection = db['threadCollection']  # Replace with your MongoDB collection name

In [6]:
import os
import yaml

def store_yaml_files_in_mongodb(directory_path):
    for root, dirs, files in os.walk(directory_path):
        for file_name in files:
            if file_name.endswith(".yaml"):
                file_path = os.path.join(root, file_name)

                print(f"Storing YAML file in MongoDB: {file_path}")

                with open(file_path, 'r', encoding='utf-8') as file:
                    try:
                        yaml_data = yaml.safe_load(file)

                        # Insert the YAML data into MongoDB
                        collection.insert_one(yaml_data)

                        print(f"YAML data stored in MongoDB: {file_path}")

                    except yaml.YAMLError as e:
                        print(f"Error parsing YAML file {file_path}: {e}")

# Path to the directory where YAML files are organized
organized_directory = 'files_parsed_by_nahed'

# Store YAML files in MongoDB
store_yaml_files_in_mongodb(organized_directory)

# Close MongoDB connection
client.close()


Storing YAML file in MongoDB: files_parsed_by_nahed\AADSignInEventsBeta\EncodedDomainURL [Nobelium].yaml


ServerSelectionTimeoutError: localhost:27017: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 65defec2d0bbced38d10c446, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [5]:
import os
import yaml
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['githubbase2']  # Replace with your MongoDB database name

def store_yaml_files_in_mongodb(directory_path):
    for root, dirs, files in os.walk(directory_path):
        for file_name in files:
            if file_name.endswith(".yaml"):
                file_path = os.path.join(root, file_name)

                print(f"Storing YAML file in MongoDB: {file_path}")

                with open(file_path, 'r', encoding='utf-8') as file:
                    try:
                        yaml_data = yaml.safe_load(file)

                        # Extract data type from the file path (assumes subdirectories represent data types)
                        data_type = os.path.relpath(root, directory_path)

                        # Create a collection for the data type if it doesn't exist
                        collection_name = f'{data_type}_collection'
                        collection = db[collection_name]

                        # Insert the YAML data into the corresponding collection
                        collection.insert_one(yaml_data)

                        print(f"YAML data stored in MongoDB collection {collection_name}: {file_path}")

                    except yaml.YAMLError as e:
                        print(f"Error parsing YAML file {file_path}: {e}")

# Path to the directory where YAML files are organized
organized_directory = 'files_parsed_by_nahed'

# Store YAML files in MongoDB
store_yaml_files_in_mongodb(organized_directory)

# Close MongoDB connection
client.close()


Storing YAML file in MongoDB: files_parsed_by_nahed\AADSignInEventsBeta\EncodedDomainURL [Nobelium].yaml
YAML data stored in MongoDB collection AADSignInEventsBeta_collection: files_parsed_by_nahed\AADSignInEventsBeta\EncodedDomainURL [Nobelium].yaml
Storing YAML file in MongoDB: files_parsed_by_nahed\AADSignInEventsBeta\riskySignInToDeviceRegistration.yaml
YAML data stored in MongoDB collection AADSignInEventsBeta_collection: files_parsed_by_nahed\AADSignInEventsBeta\riskySignInToDeviceRegistration.yaml
Storing YAML file in MongoDB: files_parsed_by_nahed\AADSignInEventsBeta\riskySignInToElevateAccess.yaml
YAML data stored in MongoDB collection AADSignInEventsBeta_collection: files_parsed_by_nahed\AADSignInEventsBeta\riskySignInToElevateAccess.yaml
Storing YAML file in MongoDB: files_parsed_by_nahed\AADSignInEventsBeta\riskySignInToNewMFAMethod.yaml
YAML data stored in MongoDB collection AADSignInEventsBeta_collection: files_parsed_by_nahed\AADSignInEventsBeta\riskySignInToNewMFAMethod

In [13]:
!pip install requests beautifulsoup4




In [16]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.1-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy<2,>=1.26.0 (from pandas)
  Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ------------------- ------------------ 30.7/61.0 kB 435.7 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.0 kB 650.2 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.0 kB 650.2 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.0 kB 650.2 kB/s eta 0:00:01
     -------------------------------------- 61.0/61.0 kB 232.0 kB/s eta 0:00:00
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.1-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? 

In [18]:
!pip install requests



In [20]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.18.1-py3-none-any.whl.metadata (6.9 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.24.0-py3-none-any.whl.metadata (4.9 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Downloading selenium-4.18.1-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/10.0 MB 1.7 MB/s

In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL to scrape
url = "https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-schema-tables?view=o365-worldwide"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all tables on the page
    tables = soup.find_all('table')

    # Initialize a list to store all table data
    all_table_data = []

    # Iterate through each table and extract schema information
    for table in tables:
        # Extract column headers (schema)
        headers = [th.text.strip() for th in table.find('thead').find_all('th')]

        # Extract data from each row in the body
        rows = table.find('tbody').find_all('tr')

        # Initialize a list to store table data
        table_data = []

        for row in rows:
            # Extract columns from each row
            columns = row.find_all('td')

            # Extract data from each column
            column_data = [td.text.strip() for td in columns]

            # Create a dictionary representing a row of data
            table_row = {}
            for header, data in zip(headers, column_data):
                table_row[header] = data

            # Append the row to the list
            table_data.append(table_row)

        # Add table data to the main list
        all_table_data.extend(table_data)

    # Create a DataFrame from the collected data
    df = pd.DataFrame(all_table_data)

    # Display the DataFrame
    print(df)
else:
    print(f"Failed to retrieve content. Status code: {response.status_code}")


                                  Table name  \
0                        AADSignInEventsBeta   
1                     AADSpnSignInEventsBeta   
2                              AlertEvidence   
3                                  AlertInfo   
4                           BehaviorEntities   
5                               BehaviorInfo   
6                             CloudAppEvents   
7                               DeviceEvents   
8                  DeviceFileCertificateInfo   
9                           DeviceFileEvents   
10                     DeviceImageLoadEvents   
11                                DeviceInfo   
12                         DeviceLogonEvents   
13                       DeviceNetworkEvents   
14                         DeviceNetworkInfo   
15                       DeviceProcessEvents   
16                      DeviceRegistryEvents   
17                 DeviceTvmHardwareFirmware   
18                    DeviceTvmInfoGathering   
19                  DeviceTvmInfoGatheri

In [25]:
import requests
from bs4 import BeautifulSoup

def extract_table_info(table_url):
    response = requests.get(table_url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the table name
        table_name = soup.find('h1').text.strip()

        # Find the table on the page
        table = soup.find('table')

        # Extract column headers (schema)
        headers = [th.text.strip() for th in table.find('thead').find_all('th')]

        # Extract data from each row in the body
        rows = table.find('tbody').find_all('tr')

        # Initialize a list to store table data
        table_data = []

        for row in rows:
            # Extract columns from each row
            columns = row.find_all('td')

            # Extract data from each column
            column_data = [td.text.strip() for td in columns]

            # Create a dictionary representing a row of data
            row_data = {'Table name': table_name}
            for header, data in zip(headers, column_data):
                row_data[header] = data

            # Append the row to the list
            table_data.append(row_data)

        return table_data
    else:
        print(f"Failed to retrieve content from {table_url}. Status code: {response.status_code}")
        return None

main_url = "https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-schema-tables?view=o365-worldwide"
response = requests.get(main_url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    table_links = soup.find_all('a', class_='hx_short-link')

    # Initialize lists to store all table data
    all_table_names = []
    all_column_names = []
    all_data_types = []
    all_descriptions = []

    for table_link in table_links:
        table_url = table_link['href']
        table_data = extract_table_info(table_url)

        if table_data:
            for row in table_data:
                all_table_names.append(row['Table name'])
                all_column_names.append(row.get('Column name', ''))
                all_data_types.append(row.get('Data type', ''))
                all_descriptions.append(row.get('Description', ''))

    # Print the collected data
    for table_name, column_name, data_type, description in zip(all_table_names, all_column_names, all_data_types, all_descriptions):
        print(f"Table name: {table_name}, Column name: {column_name}, Data type: {data_type}, Description: {description}")

else:
    print(f"Failed to retrieve content. Status code: {response.status_code}")


In [26]:
import requests
from bs4 import BeautifulSoup

def extract_table_content(table_url):
    response = requests.get(table_url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the table on the page
        table = soup.find('table')

        # Extract column headers (schema)
        headers = [th.text.strip() for th in table.find('thead').find_all('th')]

        # Extract data from each row in the body
        rows = table.find('tbody').find_all('tr')

        # Initialize a list to store table data
        table_data = []

        for row in rows:
            # Extract columns from each row
            columns = row.find_all('td')

            # Extract data from each column
            column_data = [td.text.strip() for td in columns]

            # Create a dictionary representing a row of data
            row_data = {header: data for header, data in zip(headers, column_data)}

            # Append the row to the list
            table_data.append(row_data)

        return table_data
    else:
        print(f"Failed to retrieve content from {table_url}. Status code: {response.status_code}")
        return None

# List of URLs for the 35 tables
table_urls = [
    "https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadsignineventsbeta-table?view=o365-worldwide",
    "https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadspnsignineventsbeta-table?view=o365-worldwide",
    # ... add other URLs here
]

# Iterate through the list of URLs and extract content for each table
for table_url in table_urls:
    table_content = extract_table_content(table_url)

    # Print or store the collected data as needed
    if table_content:
        print(f"Content for {table_url}:")
        for row in table_content:
            print(row)
        print("\n")


Content for https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadsignineventsbeta-table?view=o365-worldwide:
{'Column name': 'Timestamp', 'Data type': 'datetime', 'Description': 'Date and time when the record was generated'}
{'Column name': 'Application', 'Data type': 'string', 'Description': 'Application that performed the recorded action'}
{'Column name': 'ApplicationId', 'Data type': 'string', 'Description': 'Unique identifier for the application'}
{'Column name': 'LogonType', 'Data type': 'string', 'Description': 'Type of logon session, specifically interactive, remote interactive (RDP), network, batch, and service'}
{'Column name': 'ErrorCode', 'Data type': 'int', 'Description': 'Contains the error code if a sign-in error occurs. To find a description of a specific error code, visit https://aka.ms/AADsigninsErrorCodes.'}
{'Column name': 'CorrelationId', 'Data type': 'string', 'Description': 'Unique identifier of the sign-in event'}
{'Column name': 

In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_table_content(table_url):
    response = requests.get(table_url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the table on the page
        table = soup.find('table')

        # Extract column headers (schema)
        headers = [th.text.strip() for th in table.find('thead').find_all('th')]

        # Extract data from each row in the body
        rows = table.find('tbody').find_all('tr')

        # Initialize a list to store table data
        table_data = []

        for row in rows:
            # Extract columns from each row
            columns = row.find_all('td')

            # Extract data from each column
            column_data = [td.text.strip() for td in columns]

            # Create a dictionary representing a row of data
            row_data = {header: data for header, data in zip(headers, column_data)}

            # Append the row to the list
            table_data.append(row_data)

        # Convert the list of dictionaries to a Pandas DataFrame
        df = pd.DataFrame(table_data)
        return df
    else:
        print(f"Failed to retrieve content from {table_url}. Status code: {response.status_code}")
        return None

# List of URLs for the 35 tables
table_urls = [
    "https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadsignineventsbeta-table?view=o365-worldwide",
    "https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadspnsignineventsbeta-table?view=o365-worldwide",
    "learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadspnsignineventsbeta-table",
]

# Initialize an empty DataFrame to store the combined data
combined_df = pd.DataFrame()

# Iterate through the list of URLs and extract content for each table
for table_url in table_urls:
    table_content = extract_table_content(table_url)

    # Print or store the collected data as needed
    if table_content is not None:
        print(f"Content for {table_url}:")
        print(table_content)
        print("\n")

        # Concatenate the current table's data to the combined DataFrame
        combined_df = pd.concat([combined_df, table_content], ignore_index=True)

# Print the combined DataFrame
print("Combined DataFrame:")
print(combined_df)


Content for https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadsignineventsbeta-table?view=o365-worldwide:
                        Column name Data type  \
0                         Timestamp  datetime   
1                       Application    string   
2                     ApplicationId    string   
3                         LogonType    string   
4                         ErrorCode       int   
5                     CorrelationId    string   
6                         SessionId    string   
7                AccountDisplayName    string   
8                   AccountObjectId    string   
9                        AccountUpn    string   
10                   IsExternalUser       int   
11                      IsGuestUser   boolean   
12              AlternateSignInName    string   
13      LastPasswordChangeTimestamp  datetime   
14              ResourceDisplayName    string   
15                       ResourceId    string   
16                 Resour

MissingSchema: Invalid URL 'learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadspnsignineventsbeta-table': No scheme supplied. Perhaps you meant https://learn.microsoft.com/en-us/microsoft-365/security/defender/advanced-hunting-aadspnsignineventsbeta-table?