In [66]:
import os
import re
import glob
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer  # Import CountVectorizer

In [28]:
# Root directory that is walked recursively for Dockerfiles; the path is
# relative to the notebook's working directory.
base_directory="../Data/docker_files"

## Extract templates

In [29]:
def get_dockerfiles(base_path):
    """Recursively collect the paths of all Dockerfiles below ``base_path``.

    A file counts as a Dockerfile when its name starts with ``"Dockerfile"``
    (for example ``Dockerfile`` or ``Dockerfile.dev``).

    Args:
        base_path: Directory to search, including all of its subdirectories.

    Returns:
        list[str]: Matching file paths (``base_path`` joined with the relative
        location), sorted lexicographically. The list is empty when nothing
        matches.
    """
    dockerfiles = [
        os.path.join(root, name)
        for root, _, files in os.walk(base_path)
        for name in files
        if name.startswith("Dockerfile")  # Adjust based on your naming pattern
    ]
    # os.walk yields entries in an arbitrary, filesystem-dependent order.
    # Sorting keeps the document order (and anything derived from it)
    # stable between runs and machines.
    return sorted(dockerfiles)

In [60]:
def custom_tokenizer(text):
    """Break ``text`` into a list of purely alphanumeric tokens.

    Before the tokens are extracted, spaces are inserted at lowercase-to-
    uppercase boundaries (camelCase) and at letter-to-digit boundaries, and
    the separators ``- _ / . = : \\`` are turned into spaces. A digit followed
    by a letter is *not* split (``"123def"`` stays one token).

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Runs of ASCII letters/digits in order of appearance, with
        their original casing.
    """
    substitutions = (
        (r"([a-z])([A-Z])", r"\1 \2"),      # camelCase boundary
        (r"([a-zA-Z])([0-9])", r"\1 \2"),   # letter followed by digit
        (r"[-_/.=:\\]", " "),               # path / assignment separators
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return re.findall(r"[a-zA-Z0-9]+", text)

In [73]:
# Sanity check: a path-like string is split into its individual components.
print(custom_tokenizer("/usr/share/zoneinfo/Asia/Shanghai"))

['usr', 'share', 'zoneinfo', 'Asia', 'Shanghai']


In [61]:
def read_dockerfiles(file_paths):
    """Read every file in ``file_paths`` and return the texts in the same order.

    Args:
        file_paths: Iterable of paths to text files.

    Returns:
        list[str]: One string per input path, positionally aligned with
        ``file_paths``. Bytes that are not valid UTF-8 appear as U+FFFD.

    Raises:
        OSError: If a path cannot be opened or read.
    """
    contents = []
    for path in file_paths:
        # errors="replace": a single mis-encoded file should not abort the
        # whole run; undecodable bytes become U+FFFD and the result stays
        # aligned with the list of paths.
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            contents.append(f.read())
    return contents

In [62]:
# Collect the paths of all Dockerfiles below base_directory
dockerfile_paths = get_dockerfiles(base_directory)
print(f"Found {len(dockerfile_paths)} Dockerfiles")
# Read their contents; dockerfile_texts[i] is the text of dockerfile_paths[i]
dockerfile_texts = read_dockerfiles(dockerfile_paths)

Found 1807 Dockerfiles


In [63]:
# Topic keywords are n-grams of 2 to 10 tokens produced by custom_tokenizer.
# token_pattern=None: the default pattern is unused once a tokenizer is
# supplied, and scikit-learn emits a warning about it otherwise.
# NOTE(review): CountVectorizer lowercases text before tokenizing by default
# (lowercase=True), so the camelCase split in custom_tokenizer cannot fire
# here — confirm whether that is intended.
vectorizer_model = CountVectorizer(
    ngram_range=(2, 10),
    tokenizer=custom_tokenizer,
    token_pattern=None,
)
# NOTE(review): no random seed is fixed, so topic ids and keywords may differ
# between runs — TODO confirm and pin a seed if reproducibility matters.
topic_model = BERTopic(vectorizer_model=vectorizer_model)
topics, probabilities = topic_model.fit_transform(dockerfile_texts)
for topic_id, topic in topic_model.get_topics().items():
    print(f"Topic {topic_id}: {topic}")

Topic -1: [('run aptget', 0.0019360824298264962), ('mkdir p', 0.0015258452530712814), ('run mkdir', 0.0014469084735200968), ('aptget install', 0.0014214354922665042), ('run apk', 0.0013306687163630274), ('install y', 0.0013263301759591144), ('from openjdk', 0.001280831079716349), ('run npm', 0.0012402657564220597), ('aptget update', 0.0012236297010962238), ('apk add', 0.0012069867227495622)]
Topic 0: [('run go', 0.008427489021484196), ('from golang', 0.006485604752909112), ('go build', 0.005636391065929736), ('go install', 0.004376480970262609), ('go mod', 0.0036917114407994684), ('run apk', 0.003567436214856117), ('run go mod', 0.003416643924556851), ('cgoenabled 0', 0.003416643924556851), ('run cgoenabled', 0.003136997356073445), ('run cgoenabled 0', 0.003042687529289493)]
Topic 1: [('pip install', 0.004809965508641273), ('run pip', 0.004475499993203307), ('run aptget', 0.0035936131669864423), ('install y', 0.0035581271686520506), ('aptget install', 0.0035508802584644956), ('from pyt

## Store topics in the knowledge base

In [71]:
def extract_topics_to_json(topic_model, output_file="Results/knowloage_base.json"):
    """Write the keywords and representative documents of each topic to JSON.

    The outlier topic (id ``-1``) is skipped. The file maps each topic id
    (as a string) to an object with two keys: ``"keywords"`` (the words
    returned by ``topic_model.get_topic``) and ``"representative_documents"``
    (the entry for that topic from ``topic_model.get_representative_docs``).

    Args:
        topic_model: Fitted model exposing ``get_topic_info()``,
            ``get_representative_docs()`` and ``get_topic(topic_id)``.
        output_file: Destination path. Missing parent directories are
            created; an existing file is overwritten.

    Returns:
        None. The result is written to ``output_file`` and a confirmation
        line is printed.
    """
    # Get topic information
    topic_info = topic_model.get_topic_info()

    # Get representative documents for each topic
    representative_docs = topic_model.get_representative_docs()

    # Create a dictionary to store topics
    topics_dict = {}

    for topic_id in topic_info["Topic"].unique():
        if topic_id == -1:  # Skip the outlier topic
            continue

        # Keep only the words; the per-word scores are dropped
        keywords = [word for word, _ in topic_model.get_topic(topic_id)]

        # Get representative documents for the topic
        docs = representative_docs[topic_id]

        # str(): JSON object keys must be strings
        topics_dict[str(topic_id)] = {
            "keywords": keywords,
            "representative_documents": docs,
        }

    # The default path points into "Results/"; create the directory so that
    # open() does not fail with FileNotFoundError on a fresh checkout.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the dictionary to a JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(topics_dict, f, indent=4)

    print(f"Topics saved to {output_file}")
    

In [72]:
# Write the fitted model's topics (keywords and representative documents)
# to the default JSON path of extract_topics_to_json
extract_topics_to_json(topic_model)    

Topics saved to knowloage_base.json
