In [2057]:
## Install
#!pip install openai
#!pip install rank-bm25
#!pip install faiss-cpu  
#!pip uninstall numpy -y
#!pip show numpy
#!pip install setuptools wheel
#!pip install --upgrade importlib-metadata
#!pip install pandas
#!pip install sentence-transformers
#!pip install import_ipynb

In [2159]:
import base64
import json
import os
import re
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
from urllib.parse import quote

import numpy as np
import openai
import pandas as pd
import requests
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2161]:
# Credentials are read from the environment so that real keys never have to be
# typed into (and saved with) the notebook. When a variable is not set, the
# value falls back to the same empty-string placeholder used previously.
# OpenAI API key: set the OPENAI_API_KEY environment variable
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
json_file_path = "files/dockerfile_generation_metadata.json"  # Path to JSON file
knowledge_base_path = "Results/knowloage_base.json"  # Path to the knowledge-base JSON file
query_file_path = "files/query_template.txt"  # Path to the query template
prompt_template_with_topic_path = "files/dockerfile_prompt_topics.txt"
prompt_template_without_topic_path = "files/dockerfile_prompt.txt"
# GitHub Personal Access Token: set the GITHUB_TOKEN environment variable
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
# Path to the CSV file with GitHub URLs
csv_file = 'files/dataSet.csv'

In [2163]:

# Root endpoint of the GitHub REST API
GITHUB_API_URL = "https://api.github.com"
# Request headers sent with GitHub API calls: token authentication plus the
# v3 JSON media type
headers = dict(
    Authorization=f"token {GITHUB_TOKEN}",
    Accept="application/vnd.github.v3+json",
)


## Step 2.1

## Retrieve metadata

In [2167]:
import pandas as pd

def read_github_urls(csv_file, start_line=0, end_line=None):
    """
    Return the values of the "URL" column for a slice of a ';'-separated CSV file.

    Parameters:
    - csv_file: Path to the CSV file (its first row is the header).
    - start_line: 0-based index of the first data row to return.
    - end_line: 0-based index of the data row to stop at (exclusive);
      None reads through to the end of the file.

    Returns:
    - A list with the "URL" value of every selected row.
    """
    # File row 0 is the header, so the data rows to drop are 1..start_line
    rows_to_skip = range(1, start_line + 1)

    if end_line is None:
        row_limit = None
    else:
        row_limit = end_line - start_line

    frame = pd.read_csv(csv_file, sep=';', skiprows=rows_to_skip, nrows=row_limit)
    return frame["URL"].tolist()

In [2169]:
def generate_excel_report(projects):
    """
    Write "dockerfile_report.xlsx" with the number of Dockerfiles per project.

    NOTE: this notebook defines `generate_excel_report` again in a later cell;
    when both cells are run in file order, the later definition replaces this one.

    Args:
        projects (dict): Maps "owner/repo" to an iterable of microservice paths.
    """
    rows = []
    for project_name, microservices in projects.items():
        repo_owner, repo_name = project_name.split("/")
        # A microservice counts when its Dockerfile can be fetched with non-empty content
        dockerfile_count = 0
        for ms in microservices:
            if fetch_file_content(repo_owner, repo_name, f"{ms}/Dockerfile"):
                dockerfile_count += 1
        rows.append({"Project": project_name, "Number of Dockerfiles": dockerfile_count})

    report = pd.DataFrame(rows)
    report.to_excel("dockerfile_report.xlsx", index=False)
    print("Excel report generated: dockerfile_report.xlsx")

In [2171]:
# Rebinds the module-level `headers` used by the request helpers below;
# this version carries only the token Authorization entry.
headers = dict(Authorization=f'token {GITHUB_TOKEN}')

def get_default_branch(owner, repo):
    """
    Fetches the default branch of a repository from the GitHub API.

    Args:
        owner (str): The owner of the repository.
        repo (str): The name of the repository.

    Returns:
        str: The default branch of the repository, or 'master' when the API
        response carries no usable 'default_branch' value or the request fails.
    """
    api_url = f'https://api.github.com/repos/{owner}/{repo}'
    print(f"in default branch {api_url}")
    try:
        # Bound the wait so one unresponsive call cannot stall a whole batch run.
        # A timeout raises a RequestException subclass and takes the fallback below.
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        repo_data = response.json()
        # `or` also covers a payload where the key is present but null/empty
        return repo_data.get('default_branch') or 'master'
    except requests.exceptions.RequestException as e:
        print(f"Error fetching default branch for {owner}/{repo}: {e}")
        return 'master'  # Fallback to 'master' if there's an error

In [2173]:
def identify_microservices(repo_url):
    """
    Identify microservices in a GitHub repository based on the presence of Dockerfiles,
    excluding Dockerfiles in folders named 'test', 'tests', '_test', or '_tests'.

    A tree entry counts as a Dockerfile when it is a blob whose lower-cased path
    ends with 'dockerfile'.

    Args:
        repo_url (str): The GitHub repository URL (e.g., https://github.com/claranet/spryker-demoshop).

    Returns:
        list: Sorted, de-duplicated microservice paths (parent folders of Dockerfiles,
        excluding test folders). A Dockerfile at the repository root yields ''.
        An empty list is returned when the URL is invalid or the request fails.
    """
    # Extract owner and repo from the GitHub repository URL
    repo_parts = repo_url.rstrip('/').split('/')
    if len(repo_parts) < 2:
        print(f"Invalid repository URL: {repo_url}")
        return []

    owner, repo = repo_parts[-2], repo_parts[-1]

    default_branch = get_default_branch(owner, repo)
    if not default_branch:
        return []

    # GitHub API URL that lists all files in the repository recursively
    tree_url = f'https://api.github.com/repos/{owner}/{repo}/git/trees/{default_branch}?recursive=1'
    print(f"tree_url {tree_url}")
    try:
        # Bounded wait; a timeout is a RequestException and is handled below
        response = requests.get(tree_url, headers=headers, timeout=30)
        response.raise_for_status()
        repo_tree = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching repository tree for {tree_url}: {e}")
        return []

    # The API flags listings that were cut short; results may then be incomplete
    if repo_tree.get('truncated'):
        print(f"Warning: repository tree for {tree_url} is truncated; some Dockerfiles may be missed")

    test_folder_names = {"test", "tests", "_test", "_tests"}
    microservices = set()  # a set avoids duplicate microservice paths

    # .get() tolerates a payload without a 'tree' key instead of raising KeyError
    for file_data in repo_tree.get('tree', []):
        if file_data['type'] != 'blob':
            continue
        if not file_data['path'].lower().endswith('dockerfile'):
            continue

        # Parent folder of the Dockerfile ('' for a Dockerfile at the root)
        microservice_path = file_data['path'].rpartition('/')[0]

        # Skip Dockerfiles that have a test folder anywhere in their parent path
        if test_folder_names.isdisjoint(microservice_path.lower().split('/')):
            microservices.add(microservice_path)

    # Sorted so repeated runs process microservices in the same order
    return sorted(microservices)

In [2175]:
def create_project_structure(project_name):
    """Ensure the folder "projects/<project_name>" exists and return its path."""
    target_dir = os.path.join("projects", project_name)
    # exist_ok makes repeated calls for the same project harmless
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

In [2177]:
def extract_repo_info(url):
    """
    Extract the owner and repository name from a GitHub repository URL.

    Accepts full URLs (https://github.com/owner/repo, with or without a trailing
    slash, extra path segments or a ".git" suffix) as well as bare "owner/repo".
    The GitHub host name is dropped, so it is never mistaken for the owner.

    Args:
        url (str): Repository URL or "owner/repo" string.

    Returns:
        tuple: (owner, repo), or (None, None) when both cannot be determined.
    """
    if not url:
        return None, None

    cleaned = url.strip()
    for scheme in ("https://", "http://"):
        if cleaned.startswith(scheme):
            cleaned = cleaned[len(scheme):]
            break

    # Empty segments come from leading, trailing or doubled slashes
    parts = [segment for segment in cleaned.split("/") if segment]

    # The host is not part of the repository identity
    if parts and parts[0].lower() in ("github.com", "www.github.com"):
        parts = parts[1:]

    if len(parts) < 2:
        return None, None

    owner, repo = parts[0], parts[1]
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    return owner, repo

In [2179]:
def check_up_to_root(project_structure, microservice_path, file_name):
    """
    Looks for an entry called `file_name`, starting in the microservice directory
    and walking up one directory at a time until the repository root.

    The nearest match wins: an entry inside the microservice directory is
    preferred over one in a parent directory, which in turn is preferred over
    one at the root. Entries are matched by exact path, whatever their type.

    Args:
        project_structure (list): Dicts with a 'path' key describing the repository tree.
        microservice_path (str): The starting path (microservice directory; '' for the root).
        file_name (str): The name (or relative path) to look for (e.g., "pom.xml").

    Returns:
        str | bool: The repository path of the nearest match, or False if none exists.
    """
    # Index the paths once so each directory level is a constant-time lookup
    known_paths = {item['path'] for item in project_structure}

    # Directory components of the starting point; [] means the repository root
    segments = [part for part in microservice_path.strip('/').split('/') if part]

    # depth == len(segments) is the microservice directory, depth == 0 the root
    for depth in range(len(segments), -1, -1):
        candidate = '/'.join(segments[:depth] + [file_name])
        if candidate in known_paths:
            return candidate

    return False

In [2181]:
def fetch_project_structure(repo_owner, repo_name):
    """
    Downloads the recursive tree listing of a repository's default branch and
    reduces every entry to its path and type.

    Args:
        repo_owner (str): The owner of the repository.
        repo_name (str): The name of the repository.

    Returns:
        list: One {'path': ..., 'type': ...} dictionary per tree entry, where
        'type' is the value reported by the API (e.g. 'blob' or 'tree').
        Empty when the request fails.
    """
    branch = get_default_branch(repo_owner, repo_name)
    tree_url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/git/trees/{branch}?recursive=1'

    try:
        response = requests.get(tree_url, headers=headers)
        response.raise_for_status()
        entries = response.json().get("tree", [])
    except requests.exceptions.RequestException as e:
        print(f"Error fetching project structure: {e}")
        return []

    # Keep only the two fields the rest of the notebook relies on
    slim_tree = []
    for entry in entries:
        slim_tree.append({"path": entry["path"], "type": entry["type"]})
    return slim_tree


In [2183]:
def fetch_microservice_structure(project_structure, microservice_path):
    """
    Lists the direct children of a microservice folder, using the already
    downloaded repository tree (no network access).

    Args:
        project_structure (list): Dicts with 'path' and 'type' for every tree entry.
        microservice_path (str): Folder of the microservice ('' means the repository root).

    Returns:
        dict: {"files": [...], "folders": [...]} holding the names of first-level
        'blob' and 'tree' entries respectively; other entry types are ignored.
    """
    base = microservice_path.rstrip('/')
    # Every child of the folder starts with this prefix; the root has none
    prefix = base + "/" if base else ""

    files, folders = [], []
    for entry in project_structure:
        path = entry["path"]
        if not path.startswith(prefix):
            continue

        child = path[len(prefix):]
        # Drop empty names and anything nested deeper than one level
        if not child or '/' in child:
            continue

        kind = entry["type"]
        if kind == "blob":
            files.append(child)
        elif kind == "tree":
            folders.append(child)

    return {"files": files, "folders": folders}

In [2185]:
# Language / build-system detection table.
# Each entry names a language, a build system and the marker files whose
# presence selects that pair. Entries are consulted in order, so earlier
# rows take precedence over later ones.
language_build_mappings = [
    {"language": language, "build_system": build_system, "files": list(marker_files)}
    for language, build_system, marker_files in (
        # Java
        ("Java", "Maven", ("pom.xml",)),
        ("Java", "Gradle", ("build.gradle", "build.gradle.kts")),
        # Python
        ("Python", "pip", ("requirements.txt",)),
        ("Python", "poetry", ("pyproject.toml",)),
        ("Python", "setuptools", ("setup.py",)),
        ("Python", "pip", (".pylintrc", ".pylint-conf")),
        # JavaScript
        ("JavaScript", "npm", ("package.json",)),
        ("JavaScript", "yarn", ("yarn.lock",)),
        # Go
        ("Go", "go mod", ("go.mod", "go.sum")),
        # Rust
        ("Rust", "Cargo", ("Cargo.toml", "Cargo.lock")),
        # Ruby
        ("Ruby", "Bundler", ("Gemfile", "Gemfile.lock")),
        # Elixir
        ("Elixir", "Mix", ("mix.exs", "mix.lock")),
        # Nix (language-agnostic)
        ("Nix", "Nix", ("flake.nix", "flake.lock")),
        # Scala
        ("Scala", "sbt", (
            "build.sbt",
            "project/build.properties",
            "project/plugins.sbt",
            "project/plugins.sbt",
            "Dependencies.scala",
        )),
        # Perl
        ("Perl", "Makefile.PL", ("Makefile.PL",)),
        ("Perl", "Build.PL", ("Build.PL",)),
        ("Perl", "cpanfile", ("cpanfile",)),
        # C#
        ("C#", "MSBuild", (".csproj",)),
        ("C#", "MSBuild", (".sln",)),
        ("C#", "dotnet", ("global.json",)),
        ("C#", "Paket", ("paket.dependencies",)),
        # PHP
        ("PHP", "Composer", ("composer.json", "composer.lock", "wp-config.php")),
    )
]

In [2187]:
def fetch_file_content(repo_owner, repo_name, file_path):
    """
    Fetches the content of a file from a GitHub repository.

    Args:
        repo_owner (str): The owner of the repository.
        repo_name (str): The name of the repository.
        file_path (str): The path to the file in the repository. A leading '/'
            is ignored, and characters that are not URL-safe are percent-encoded.

    Returns:
        str: The file content decoded as UTF-8, or None if the request fails,
        the response carries no 'content' field (for example when the path is a
        directory), or the content is not valid base64 / UTF-8 text.
    """
    # Keep '/' as the path separator but encode spaces, '#', '?' and the like
    encoded_path = quote(file_path.lstrip('/'), safe='/')
    api_url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{encoded_path}'
    try:
        # Bounded wait; a timeout is a RequestException and is handled below
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()
        file_info = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Warning: File not found at {api_url}: {e}")
        return None

    # A directory listing comes back as a JSON array rather than an object
    if not isinstance(file_info, dict) or 'content' not in file_info:
        print(f"No content found for file: {file_path}")
        return None

    try:
        return base64.b64decode(file_info['content']).decode('utf-8')
    except ValueError as e:
        # Covers both malformed base64 (binascii.Error) and non-UTF-8 bytes
        # (UnicodeDecodeError); one unreadable file must not abort the whole run
        print(f"Warning: Could not decode {file_path} as UTF-8 text: {e}")
        return None

In [2189]:
def detect_language_and_build_system(repo_owner, repo_name, microservice_path, project_structure):
    """
    Determines a microservice's language and build system from the first marker
    file of `language_build_mappings` that can be located for it.

    Marker files are tried in table order; each one is searched for with
    `check_up_to_root`. The first hit decides the result, and its content is
    downloaded and stored as the only dependency file.

    Args:
        repo_owner (str): The owner of the repository.
        repo_name (str): The name of the repository.
        microservice_path (str): The path to the microservice folder.
        project_structure (list): The list of files and directories in the repository.

    Returns:
        dict: {"language", "build_system", "dependencies"}; the first two are None
        and "dependencies" is empty when no marker file is found. "dependencies"
        maps the marker file name to its content when that content is non-empty.
    """
    result = {"language": None, "build_system": None,  "dependencies": {}}

    # Flatten the table into (mapping, marker file) pairs, preserving its order
    candidates = (
        (mapping, marker)
        for mapping in language_build_mappings
        for marker in mapping["files"]
    )

    for mapping, marker in candidates:
        located = check_up_to_root(project_structure, microservice_path, marker)
        if not located:
            continue

        print(located)
        result["language"] = mapping["language"]
        result["build_system"] = mapping["build_system"]

        # Only the file that triggered the match is downloaded
        marker_content = fetch_file_content(repo_owner, repo_name, located)
        if marker_content:
            result["dependencies"][marker] = marker_content
        break  # first match wins

    return result

In [2191]:
def fetch_files_with_extension(repo_owner, repo_name, microservice_path, extension):
    """
    Downloads the content of every file in the repository whose path ends with
    `extension`.

    The tree of the repository's default branch is listed (as the other tree
    lookups in this notebook do) rather than assuming the branch is called 'main'.

    Args:
        repo_owner (str): The owner of the repository.
        repo_name (str): The name of the repository.
        microservice_path (str): Currently unused; the whole repository is searched.
            NOTE(review): the name suggests results were meant to be limited to
            this folder. Confirm the intent before adding a filter.
        extension (str): Path suffix to match (e.g. ".csproj").

    Returns:
        list: Non-empty file contents, in tree order. Empty when the tree cannot
        be fetched.
    """
    files = []
    default_branch = get_default_branch(repo_owner, repo_name)
    tree_url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/git/trees/{default_branch}?recursive=1'
    try:
        # Bounded wait; a timeout is a RequestException and is handled below
        response = requests.get(tree_url, headers=headers, timeout=30)
        response.raise_for_status()
        repo_tree = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching repository tree for {tree_url}: {e}")
        return files

    for file_data in repo_tree.get('tree', []):
        if file_data['type'] == 'blob' and file_data['path'].endswith(extension):
            file_content = fetch_file_content(repo_owner, repo_name, file_data['path'])
            if file_content:
                files.append(file_content)
    return files

In [2193]:
def detect_framework(language, dependencies):
    """
    Detects the framework used in a microservice based on its language and dependency file contents.

    Detection is a case-insensitive substring search of each dependency file for
    the keywords listed below. Frameworks are tried in the order they are listed
    and the first match is returned, so more specific entries must come before
    entries whose keyword is contained in theirs (e.g. "Dancer2" before "Dancer").

    Args:
        language (str): The detected programming language.
        dependencies (dict): A dictionary where keys are file names and values are file contents.

    Returns:
        str: The detected framework name. If the language has framework mappings
        but nothing matches, the language name itself is returned. None is
        returned only when the language has no framework mappings at all.
    """

    # Framework mappings with keyword detection; order within a language matters
    framework_mappings = {
        "Python": {
            "Flask": ["flask"],
            "Django": ["django"],
            "FastAPI": ["fastapi"],
            "Tornado": ["tornado"],
            "Rasa": ["rasa-sdk"],
            "Sanic": ["sanic"],
        },
        "JavaScript": {
            "Express.js": ["express"],
            "Apollo GraphQL": ["@apollo/client", "@apollo/server", "graphql"],
            "NestJS": ["@nestjs"],
            "Next.js": ["next"],
            "Nuxt.js": ["nuxt"],
            "Passport.js": ["passport", "passport-auth0", "passport-github"],
            "TaskCluster": ["taskcluster-lib-api", "taskcluster-lib-app", "taskcluster-lib-config"],
            "Jasmine": ["jasmine-core"],
            "Karma": ["karma"],
            "Selenium WebDriver": ["selenium-webdriver"],
            # Kept last: "node" also appears in files of projects that use one of
            # the frameworks above (e.g. a "node app.js" start script), so it must
            # not take precedence over them.
            "Node.js": ["node"],
        },
        "Java": {
            "Spring Boot": ["spring-boot", "org.springframework"],
            "Quarkus": ["quarkus"],
            "Micronaut": ["micronaut"],
        },
        "Scala": {
            "Play Framework": ["play", "playframework", "com.typesafe.play"],
            "Akka": ["akka-http", "akka-stream", "com.typesafe.akka"],
            "Lagom": ["com.lightbend.lagom"],
            "Finatra": ["com.twitter.finatra"],
            "Finagle": ["com.twitter.finagle"],
            "ZIO": ["dev.zio", "zio-http"],
        },
        "Perl": {
            "Mojolicious": ["mojolicious"],
            # "dancer2" contains "dancer", so Dancer2 has to be tested first;
            # otherwise it could never be returned.
            "Dancer2": ["dancer2"],
            "Dancer": ["dancer"],
            "Catalyst": ["catalyst", "catalyst::runtime"],
            "CGI::Application": ["cgi::application"],
            "Plack": ["plack", "plack::middleware", "plack::request"],
        },
        "C#": {
            "ASP.NET": ["aspnetcore", "asp.net", "dotnet-aspnetcore"],
        },
        "PHP": {
            "Laravel": ["laravel"],
            "Symfony": ["symfony"],
            "CodeIgniter": ["codeigniter"],
        },
        "Ruby": {
            "Rails": ["rails"],
            "Sinatra": ["sinatra"],
        },
        "Go": {
            "Gin": ["github.com/gin-gonic/gin"],
            "Echo": ["github.com/labstack/echo"],
        },
        "Rust": {
            "Actix": ["actix-web"],
            "Rocket": ["rocket"],
        }
    }

    # Check if language is in mappings
    if language not in framework_mappings:
        return None  # No frameworks defined for this language

    perl_dependency_files = ("cpanfile", "Makefile.PL", "Build.PL", "META.json")

    # Read the dependency files and check for framework keywords
    for file_name, content in dependencies.items():
        content_lower = content.lower()

        # Perl dependency files are always checked against the Perl frameworks,
        # whatever language was passed in
        if file_name in perl_dependency_files:
            for framework, keywords in framework_mappings["Perl"].items():
                if any(keyword in content_lower for keyword in keywords):
                    return framework  # Return the first matching framework

        # Normal dependency scanning for all languages
        for framework, keywords in framework_mappings[language].items():
            if any(keyword in content_lower for keyword in keywords):
                return framework  # Return the first matching framework

    return language  # No framework keyword matched: fall back to the language name


In [2195]:
def detect_database(dependencies):
    """
    Detects the database used in a microservice by analyzing its dependencies.

    Each dependency file is searched, case-insensitively, for the keywords
    below. Databases are tried in the listed order and the first match wins.
    Keywords longer than two characters match as substrings (so "mysql" also
    finds "pymysql"); very short keywords such as "pg" must stand alone as a
    token, because as substrings they occur in unrelated words like "upgrade"
    or "jpg".

    Args:
        dependencies (dict): A dictionary where keys are file names and values are file contents.

    Returns:
        str: The detected database name, or None if no database is found.
    """

    # Database keyword mappings
    database_mappings = {
        "PostgreSQL": ["postgres", "pg", "pg-promise", "psycopg2", "pg-connection-string", "pgbouncer"],
        "MySQL": ["mysql", "mysql-connector", "mysql2"],
        "SQLite": ["sqlite", "sqlite3", "aiosqlite"],
        "MongoDB": ["mongodb", "mongoose", "pymongo"],
        "Redis": ["redis", "aioredis"],
        "Cassandra": ["cassandra-driver", "datastax"],
        "Elasticsearch": ["elasticsearch", "opensearch"],
        "DynamoDB": ["dynamodb", "boto3.dynamodb"],
        "CouchDB": ["couchdb"],
        "Neo4j": ["neo4j", "neo4j-driver"],
        "Oracle": ["cx_oracle", "oracle"],
        "MS SQL Server": ["mssql", "pyodbc", "pymssql"],
        "MariaDB": ["mariadb"],
        "Firebase": ["firebase", "firestore"],
        "DuckDB": ["duckdb"],
    }

    def _mentions(keyword, text):
        # Short keywords need a non-alphanumeric character (or the text edge)
        # on both sides; longer ones keep plain substring matching.
        if len(keyword) <= 2:
            token = rf"(?<![a-z0-9]){re.escape(keyword)}(?![a-z0-9])"
            return re.search(token, text) is not None
        return keyword in text

    # Check dependency files for database-related keywords
    for content in dependencies.values():
        content_lower = content.lower()

        for db_name, keywords in database_mappings.items():
            if any(_mentions(keyword, content_lower) for keyword in keywords):
                return db_name  # Return the first matching database

    return None  # No database detected


In [2197]:
def detect_config_files(repo_owner, repo_name, microservice_path, project_structure):
    """
    Collects the content of well-known configuration files that belong to a
    microservice.

    Every candidate name is located with `check_up_to_root` and, when found,
    downloaded with `fetch_file_content`.

    Args:
        repo_owner (str): The owner of the repository.
        repo_name (str): The name of the repository.
        microservice_path (str): The path to the microservice folder.
        project_structure (list): The list of files and directories in the repository.

    Returns:
        dict: Maps the repository path of each detected config file to its
        content. Files whose content could not be fetched (or is empty) are left out.
    """
    # Candidate names, grouped by purpose; groups are searched in this order
    config_groups = (
        # Environment files
        (".env", ".env.local", ".env.production", ".env.development", ".env.test"),
        # JSON/YAML configuration files
        ("config.json", "config.yaml", "config.yml", "settings.json", "settings.yml", "secrets.json"),
        # Application framework configurations
        ("application.yml", "application.yaml", "application.properties", "appsettings.json",
         "log4j.xml", "log4j2.xml", "logback.xml", "hibernate.cfg.xml"),
        # Docker & Kubernetes configurations
        ("docker-compose.yml", "docker-compose.override.yml", "Dockerfile",
         "kustomization.yaml", "k8s-deployment.yaml", "k8s-service.yaml", "kubernetes.yml"),
        # CI/CD configurations
        (".github/workflows/*.yml", ".gitlab-ci.yml", "Jenkinsfile", ".circleci/config.yml"),
        # Terraform & Infrastructure as Code
        ("terraform.tf", "terraform.tfvars", ".terraformrc"),
        # Cloud provider configurations
        ("aws-exports.json", "gcp-config.json", "azure-pipelines.yml"),
        # General config formats
        ("config.toml", "config.ini", "config.properties", "config.xml"),
        # Authentication & secrets
        (".secrets", "secrets.yml", "vault.yml", "service-account.json"),
        # Nginx & Apache
        ("nginx.conf", "httpd.conf", "apache2.conf"),
    )
    config_files = [name for group in config_groups for name in group]

    detected_configs = {}
    for candidate in config_files:
        location = check_up_to_root(project_structure, microservice_path, candidate)
        if not location:
            continue

        body = fetch_file_content(repo_owner, repo_name, location)
        if body:
            detected_configs[location] = body

    return detected_configs

In [2199]:
def detect_static_files(repo_owner, repo_name, microservice_path, project_structure):
    """
    Detects static files in a microservice under common static directories while ignoring Docker and Kubernetes files.

    For every known static directory name that `check_up_to_root` can locate,
    all repository entries inside that directory whose path ends with one of
    the static extensions are downloaded.

    Args:
        repo_owner (str): The owner of the repository.
        repo_name (str): The name of the repository.
        microservice_path (str): The path to the microservice folder.
        project_structure (list): The list of files and directories in the repository.

    Returns:
        dict: Maps the repository path of each detected static file to its
        content. Files whose content could not be fetched (or is empty) are left out.
    """

    # Common directories for static files
    static_dirs = ["static", "public", "resources", "assets", "media", "content", "dist", "build"]

    # Common static file extensions (tuples so they can be passed to str.endswith)
    static_extensions = (".less", ".json", ".xml", ".md", ".me")

    # Ignore Docker & Kubernetes-related files
    ignored_files = (
        "Dockerfile", "docker-compose.yml", ".dockerignore",
        "kustomization.yaml", "k8s.yaml", "kubernetes.yml",
        "k8s-deployment.yaml", "k8s-service.yaml", "k8s-ingress.yaml",
        "helm-chart/", "helm/", "charts/", "values.yaml"
    )

    detected_static_files = {}

    # Iterate through static directories and check up to the root
    for static_dir in static_dirs:
        detected_static_dir = check_up_to_root(project_structure, microservice_path, static_dir)
        if not detected_static_dir:
            continue

        # The trailing '/' restricts matches to entries inside the directory;
        # a bare prefix would also match siblings such as "staticfiles/..." or
        # "static.json".
        dir_prefix = detected_static_dir.rstrip('/') + '/'

        for item in project_structure:
            file_path = item["path"]

            if not file_path.startswith(dir_prefix):
                continue
            if not file_path.endswith(static_extensions):
                continue

            # Skip ignored Docker & Kubernetes files
            if file_path.endswith(ignored_files) or file_path.startswith(ignored_files):
                continue

            # Already collected via another static directory: no second download
            if file_path in detected_static_files:
                continue

            file_content = fetch_file_content(repo_owner, repo_name, file_path)
            if file_content:
                detected_static_files[file_path] = file_content  # Store content

    return detected_static_files  # Return dictionary with static files and their content


In [2217]:
def extract_source_code_info(repo_owner, repo_name,project_folder,microservice_path,project_structure):
    """
    Gathers source-code metadata for one microservice, saves its Dockerfile
    locally and writes the metadata to a JSON file.

    Steps:
      1. Detect language, build system and dependency file content.
      2. Create the local folder "<project_folder>/<microservice path with '/' as '_'>".
      3. Download "<microservice_path>/Dockerfile" into that folder (best effort;
         request failures are printed and the function carries on).
      4. Write the metadata and project structure via `generate_json_file`.

    Args:
        repo_owner (str): The owner of the repository.
        repo_name (str): The name of the repository.
        project_folder (str): Local folder of the project.
        microservice_path (str): Path of the microservice in the repository ('' for the root).
        project_structure (list): The list of files and directories in the repository.

    Returns:
        dict: The metadata dictionary. Only "language", "build_system" and
        "dependencies" are filled in here; the remaining keys keep their
        initial placeholder values.
    """
    source_code_info = {
        "language": None,
        "framework": None,
        "dependencies": [],
        "database": None,
        "build_system": None,
        "config_files": [],
        "static_files": None,

    }

    # Detect Language and Build System
    source_code_info.update(detect_language_and_build_system(repo_owner, repo_name, microservice_path,project_structure))

    # Repository path of the Dockerfile. For a root-level microservice ('') this
    # is just "Dockerfile", so the URL never contains an empty path segment.
    dockerfile_repo_path = f"{microservice_path.strip('/')}/Dockerfile".lstrip('/')
    dockerfile_api_url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{dockerfile_repo_path}'

    # Local folder for the microservice inside the project folder
    microservice_folder = os.path.join(project_folder, microservice_path.replace("/", "_"))
    os.makedirs(microservice_folder, exist_ok=True)
    try:
        # Fetch the Dockerfile metadata (bounded wait; a timeout is handled below)
        response = requests.get(dockerfile_api_url, headers=headers, timeout=30)
        response.raise_for_status()
        dockerfile_info = response.json()

        # Get the download URL for the Dockerfile
        download_url = dockerfile_info.get('download_url')

        if not download_url:
            print(f"No download URL found for Dockerfile in {microservice_path}")
        else:
            # Download the Dockerfile content; fail loudly on an HTTP error so an
            # error page is never saved in place of the Dockerfile
            download = requests.get(download_url, timeout=30)
            download.raise_for_status()
            dockerfile_content = download.text

            # Save the Dockerfile to the microservice folder. UTF-8 is explicit so
            # the result does not depend on the platform's default encoding.
            dockerfile_path = os.path.join(microservice_folder, 'Dockerfile')
            with open(dockerfile_path, 'w', encoding='utf-8') as f:
                f.write(dockerfile_content)

            print(f"Dockerfile downloaded and saved to: {dockerfile_path}")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading Dockerfile from {dockerfile_api_url}: {e}")
    generate_json_file(microservice_folder,source_code_info,project_structure)
    return source_code_info

In [2419]:
import os
import requests

def redownload_dockerfiles(repo_owner, repo_name, project_folder, microservice_path):
    """
    Redownloads Dockerfiles from GitHub and replaces them in existing microservice folders.

    Nothing is created: if the local microservice folder does not exist, the
    microservice is skipped. Request failures are printed, not raised.

    Args:
        repo_owner (str): The owner of the repository.
        repo_name (str): The name of the repository.
        project_folder (str): Local folder of the project.
        microservice_path (str): Path of the microservice in the repository ('' for the root).

    Returns:
        None
    """
    # Repository path of the Dockerfile. For a root-level microservice ('') this
    # is just "Dockerfile", so the URL never contains an empty path segment.
    dockerfile_repo_path = f"{microservice_path.strip('/')}/Dockerfile".lstrip('/')
    dockerfile_api_url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{dockerfile_repo_path}'

    # Define the existing folder path for the microservice
    microservice_folder = os.path.join(project_folder, microservice_path.replace("/", "_"))

    # Only folders produced by an earlier run are refreshed
    if not os.path.exists(microservice_folder):
        print(f"⚠️ Folder {microservice_folder} does not exist. Skipping...")
        return

    try:
        # Fetch the Dockerfile metadata (bounded wait; a timeout is handled below)
        response = requests.get(dockerfile_api_url, headers=headers, timeout=30)
        response.raise_for_status()
        dockerfile_info = response.json()

        # Get the download URL for the Dockerfile
        download_url = dockerfile_info.get('download_url')

        if not download_url:
            print(f"⚠️ No download URL found for Dockerfile in {microservice_path}. Skipping...")
            return

        # Download the new Dockerfile content; an HTTP error must not overwrite
        # the existing Dockerfile with an error page
        download = requests.get(download_url, timeout=30)
        download.raise_for_status()
        dockerfile_content = download.text

        # Define the path to save the new Dockerfile
        dockerfile_path = os.path.join(microservice_folder, 'Dockerfile')

        # Replace the existing Dockerfile
        with open(dockerfile_path, 'w', encoding='utf-8') as f:
            f.write(dockerfile_content)

        print(f"✅ Dockerfile updated in: {dockerfile_path}")

    except requests.exceptions.RequestException as e:
        print(f"❌ Error downloading Dockerfile from {dockerfile_api_url}: {e}")



In [2219]:
def generate_json_file(microservice_folder, source_code_info,project_structure):
    """
    Write "extracted_data.json" into `microservice_folder`.

    The file holds one JSON object with two keys, "source_code_info" and
    "project_structure", indented by four spaces. The folder must already exist.
    """
    output_path = os.path.join(microservice_folder, "extracted_data.json")
    payload = dict(source_code_info=source_code_info, project_structure=project_structure)

    with open(output_path, "w") as handle:
        json.dump(payload, handle, indent=4)

    print(f"JSON file saved at {output_path}")

In [2221]:
def generate_excel_report(projects):
    """
    Write "dockerfile_report.xlsx" with the number of Dockerfiles per project.

    A microservice is counted when "<microservice>/Dockerfile" can be fetched
    from the repository with non-empty content.

    Args:
        projects (dict): Maps "owner/repo" to an iterable of microservice paths
            ('' denotes the repository root).
    """
    data = []
    for project_name, microservices in projects.items():
        # Split once per project instead of twice per microservice
        name_parts = project_name.split("/")
        repo_owner, repo_name = name_parts[0], name_parts[1]

        num_dockerfiles = 0
        for ms in microservices:
            # A root-level microservice ('') must map to "Dockerfile", not "/Dockerfile"
            dockerfile_path = f"{ms}/Dockerfile".lstrip("/")
            if fetch_file_content(repo_owner, repo_name, dockerfile_path):
                num_dockerfiles += 1

        data.append({"Project": project_name, "Number of Dockerfiles": num_dockerfiles})

    df = pd.DataFrame(data)
    df.to_excel("dockerfile_report.xlsx", index=False)
    print("Excel report generated: dockerfile_report.xlsx")

In [2421]:
def analyze_repositories(csv_file, start,end):
    """
    Refresh the locally stored Dockerfiles of every repository listed in rows
    [start, end) of `csv_file`.

    For each repository the microservices are identified again and their
    Dockerfiles are re-downloaded into "projects_with_docker/<repo>".

    NOTE: the next cell of this notebook defines another function with the same
    name; whichever of the two cells was executed last is the one in effect.
    """
    # Read GitHub URLs from CSV
    urls = read_github_urls(csv_file, start, end)
    projects = {}

    for projects_num, url in enumerate(urls, 1):
        print(f"***************** Project {projects_num} **************")
        repo_parts = url.rstrip('/').split('/')

        if len(repo_parts) < 2:
            print(f"Invalid repository URL: {url}")
            continue  # Skip to the next repository

        owner, repo = repo_parts[-2:]
        api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/"
        print(f"Analyzing repository: {api_url}")

        # Identify microservices
        microservices = identify_microservices(url)
        print(f"Number of detected microservices: {len(microservices)}")

        # Folder produced by an earlier analysis run of this repository
        project_folder = os.path.join("projects_with_docker", repo)

        # Replace the stored Dockerfile of each microservice
        for microservice in microservices:
            redownload_dockerfiles(owner, repo, project_folder, microservice)
           

In [2223]:
def analyze_repositories(csv_file, start, end):
    """
    Extract per-microservice metadata for a slice of the GitHub URL dataset.

    For every repository URL returned by read_github_urls(csv_file, start, end):
      1. First pass: detect the microservices and collect the source-code info
         and the structure of each one, counting languages and build systems.
      2. Second pass: fill a missing language / build system with the project's
         most common one, borrow dependencies from a microservice with the same
         build system, re-run framework / database / config / static-file
         detection, and write one JSON file per microservice through
         generate_json_file.

    Parameters:
    - csv_file: Path to the CSV file with GitHub URLs.
    - start: First row to read (forwarded to read_github_urls).
    - end: Row to stop at (forwarded to read_github_urls).

    Returns:
    - True once every URL in the slice has been processed. Projects with an
      invalid URL, without microservices, or without any detected language and
      build system are skipped with a message.
    """
    # Imported locally so this cell does not depend on an import made in another cell.
    from collections import defaultdict

    # Read GitHub URLs from CSV
    urls = read_github_urls(csv_file, start, end)
    projects_num = 0

    for url in urls:
        projects_num += 1
        print(f"***************** Project {projects_num} **************")
        repo_parts = url.rstrip('/').split('/')

        if len(repo_parts) < 2:
            print(f"Invalid repository URL: {url}")
            continue  # Skip to the next repository

        owner, repo = repo_parts[-2], repo_parts[-1]
        api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/"
        print(f"Analyzing repository: {api_url}")

        # Identify microservices
        microservices = identify_microservices(url)
        print(f"Number of detected microservices: {len(microservices)}")

        # Fetch project structure
        project_folder = create_project_structure(repo)
        project_structure = fetch_project_structure(owner, repo)

        # If no microservices are detected, continue to next project
        if not microservices:
            print(f"No microservices found for {api_url}. Skipping.")
            continue

        # Initialize counters and storage
        language_counter = Counter()
        build_system_counter = Counter()
        microservice_configs = {}
        # One structure per microservice: the second pass must write each
        # microservice's own structure, not the one left over from the last
        # iteration of the first pass.
        microservice_structures = {}
        dependencies_by_build_system = defaultdict(dict)  # Store dependencies per build system

        # First Pass: Extract Source Code Info and Update Counters
        for microservice in microservices:
            microservice_structures[microservice] = fetch_microservice_structure(project_structure, microservice)
            source_code_info = extract_source_code_info(owner, repo, project_folder, microservice, project_structure)

            # Store initial language and build system
            language = source_code_info["language"]
            build_system = source_code_info["build_system"]
            dependencies = source_code_info.get("dependencies", {})

            # Keep the first dependency file seen under each name, per build system
            if build_system and dependencies:
                for file_name, content in dependencies.items():
                    if file_name not in dependencies_by_build_system[build_system]:
                        dependencies_by_build_system[build_system][file_name] = content

            # Update counters for known values
            if language:
                language_counter[language] += 1
            if build_system:
                build_system_counter[build_system] += 1

            # Store extracted info for second pass
            microservice_configs[microservice] = source_code_info

        # Check if all microservices lack language and build system
        all_missing = not language_counter and not build_system_counter
        if all_missing:
            print("Error: No language or build system detected for any microservice in this project.")
            continue  # Skip this project but continue with others

        # Determine the Most Common Build System and Corresponding Language
        most_common_build_system = build_system_counter.most_common(1)[0][0] if build_system_counter else None
        most_common_language = None

        if most_common_build_system:
            for mapping in language_build_mappings:
                if mapping["build_system"] == most_common_build_system:
                    most_common_language = mapping["language"]
                    break

        # Second Pass: Fill Missing Values
        for microservice, source_code_info in microservice_configs.items():
            if source_code_info["language"] is None and most_common_language:
                source_code_info["language"] = most_common_language
                print(f"Final correction: Assigning most common language '{most_common_language}' to {microservice}")

            if source_code_info["build_system"] is None and most_common_build_system:
                source_code_info["build_system"] = most_common_build_system
                print(f"Final correction: Assigning most common build system '{most_common_build_system}' to {microservice}")

            # If dependencies are missing, assign them from another microservice with the same build system
            build_system = source_code_info.get("build_system")
            if not source_code_info.get("dependencies") and build_system in dependencies_by_build_system:
                source_code_info["dependencies"] = dependencies_by_build_system[build_system]
                print(f"Final correction: Assigning dependencies to {microservice} from another microservice with the same build system.")
            # Re-detect framework after filling dependencies
            source_code_info["framework"] = detect_framework(source_code_info["language"], source_code_info.get("dependencies", {}))
            # Re-detect database after filling dependencies
            source_code_info["database"] = detect_database(source_code_info.get("dependencies", {}))
            source_code_info["config_files"] = detect_config_files(owner, repo, microservice, project_structure)
            source_code_info["static_files"] = detect_static_files(owner, repo, microservice, project_structure)

            # Path separators in the microservice path are flattened into a single folder name
            microservice_folder = os.path.join(project_folder, microservice.replace("/", "_"))
            generate_json_file(microservice_folder, source_code_info, microservice_structures[microservice])
    return True  # Signals that the whole slice was processed

In [2430]:
# Ask GitHub how much API quota is left for the credentials sent in `headers`.
rate_limit_url = 'https://api.github.com/rate_limit'
# requests has no default timeout; set one so this cell cannot hang forever
# when the API is unreachable.
response = requests.get(rate_limit_url, headers=headers, timeout=30)
print(response.json())

{'resources': {'core': {'limit': 5000, 'used': 1649, 'remaining': 3351, 'reset': 1741886237}, 'search': {'limit': 30, 'used': 0, 'remaining': 30, 'reset': 1741884524}, 'graphql': {'limit': 5000, 'used': 0, 'remaining': 5000, 'reset': 1741888064}, 'integration_manifest': {'limit': 5000, 'used': 0, 'remaining': 5000, 'reset': 1741888064}, 'source_import': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1741884524}, 'code_scanning_upload': {'limit': 1000, 'used': 0, 'remaining': 1000, 'reset': 1741888064}, 'code_scanning_autofix': {'limit': 10, 'used': 0, 'remaining': 10, 'reset': 1741884524}, 'actions_runner_registration': {'limit': 10000, 'used': 0, 'remaining': 10000, 'reset': 1741888064}, 'scim': {'limit': 15000, 'used': 0, 'remaining': 15000, 'reset': 1741888064}, 'dependency_snapshots': {'limit': 100, 'used': 0, 'remaining': 100, 'reset': 1741884524}, 'audit_log': {'limit': 1750, 'used': 0, 'remaining': 1750, 'reset': 1741888064}, 'audit_log_streaming': {'limit': 15, 'used': 0,

In [2427]:
# Process dataset rows 100-377: per read_github_urls' docstring the start index
# is 0-based and the end index is exclusive.
analyze_repositories(csv_file,100,378)

***************** Project 1 **************
Analyzing repository: https://api.github.com/repos/hypertrace/hypertrace-service/contents/
in default branch https://api.github.com/repos/hypertrace/hypertrace-service
tree_url https://api.github.com/repos/hypertrace/hypertrace-service/git/trees/main?recursive=1
Number of detected microservices: 1
✅ Dockerfile updated in: projects_with_docker/hypertrace-service/hypertrace-service/Dockerfile
***************** Project 2 **************
Analyzing repository: https://api.github.com/repos/moira-alert/moira/contents/
in default branch https://api.github.com/repos/moira-alert/moira
tree_url https://api.github.com/repos/moira-alert/moira/git/trees/master?recursive=1
Number of detected microservices: 0
***************** Project 3 **************
Analyzing repository: https://api.github.com/repos/HubSpot/Baragon/contents/
in default branch https://api.github.com/repos/HubSpot/Baragon
tree_url https://api.github.com/repos/HubSpot/Baragon/git/trees/master?r

### Retrieve topic

In [1535]:
def load_topics_from_json(file_path):
    """Parse the JSON file at file_path and return the decoded topics object."""
    with open(file_path, "r") as handle:
        return json.load(handle)

In [2287]:
# Step 1: Load topics from JSON
def load_topics_from_json(file_path):
    """Read file_path as JSON and hand back the decoded topics object."""
    with open(file_path, "r") as topics_file:
        return json.load(topics_file)

# Step 2: Build BM25 index for sparse retrieval
def build_bm25_index(topics_dict):
    """
    Build a BM25Okapi index over every representative document of every topic.

    Args:
        topics_dict (dict): Topic id -> topic data; each topic data must have a
            "representative_documents" entry.

    Returns:
        tuple: (bm25_index, topic_ids) where topic_ids[i] is the id of the
        topic that document i of the index belongs to.
    """
    tokenized_corpus = []
    owning_topics = []
    for topic_key, topic_entry in topics_dict.items():
        for sample in topic_entry["representative_documents"]:
            owning_topics.append(topic_key)
            # Whitespace tokenization, one token list per document
            tokenized_corpus.append(sample.split())

    return BM25Okapi(tokenized_corpus), owning_topics
# Step 3: Sparse retrieval using BM25
def sparse_retrieval(query, bm25_index, bm25_topic_ids, topics_dict):
    """
    Return the topic whose best-scoring document matches the query best.

    Args:
        query (str): Free-text query; it is split on whitespace.
        bm25_index: Object exposing get_scores(tokens) with one score per
            indexed document.
        bm25_topic_ids (list): Topic id of each indexed document, by position.
        topics_dict (dict): Topic id -> topic data.

    Returns:
        The topics_dict entry of the highest-scoring topic.
    """
    document_scores = bm25_index.get_scores(query.split())

    # Keep only the best document score seen for each topic
    best_per_topic = {}
    for position, doc_score in enumerate(document_scores):
        owner = bm25_topic_ids[position]
        if owner in best_per_topic and not doc_score > best_per_topic[owner]:
            continue
        best_per_topic[owner] = doc_score

    # First topic reaching the maximum wins
    winner, _ = max(best_per_topic.items(), key=lambda pair: pair[1])
    return topics_dict[winner]

In [2289]:


# # Example query
# query = "Create a Dockerfile for a Python web app using Flask"

# # Retrieve the most relevant topic using sparse retrieval
# relevant_topic = sparse_retrieval(query, bm25_index, bm25_topic_ids, topics_dict)

# # Print the results
# print("Relevant Topic Keywords:", relevant_topic["keywords"])
# print("Representative Dockerfiles:", relevant_topic["representative_documents"])

## Step 2.2 

In [2292]:
# Function to load topics from JSON
def load_topics_from_json(knowledge_base_path):
    """Decode the JSON knowledge base stored at knowledge_base_path and return it."""
    with open(knowledge_base_path, "r") as handle:
        topics = json.load(handle)
    return topics

In [2294]:
# Function to load extracted data from JSON file
def load_extracted_data(json_file_path):
    """Decode the JSON document stored at json_file_path and return it."""
    with open(json_file_path, "r") as handle:
        extracted = json.load(handle)
    return extracted

In [2296]:
# Function to load the prompt template from a text file
def load_prompt_template(prompt_file_path):
    """Return the full text of the prompt template stored at prompt_file_path."""
    with open(prompt_file_path, "r") as handle:
        template_text = handle.read()
    return template_text

In [2298]:
# Function to load docker data from JSON file
def load_docker_data(docker_data_path):
    """Decode the JSON document stored at docker_data_path and return it."""
    with open(docker_data_path, "r") as handle:
        docker_data = json.load(handle)
    return docker_data

In [2325]:
def format_prompt(prompt_template, retrieved_data, user_data, retrieved_topic=None):
    """
    Formats the prompt by injecting extracted data into the template.

    Args:
        prompt_template (str): The base template for the prompt; filled with
            str.format, so it may reference any subset of the placeholders
            passed below.
        retrieved_data (dict): Extracted information from the source code
            ('source_code_info' and 'project_structure' entries are read).
        user_data (dict): Additional user-provided inputs ('docker_info' is read).
        retrieved_topic (dict, optional): Retrieved topic for context. It is
            never modified: its representative documents are shortened on a copy.

    Returns:
        str: The formatted prompt.
    """
    # Safely extract source code info
    source_info = retrieved_data.get('source_code_info') or {}

    def safe_get(key, default="Unknown"):
        """Returns a safely retrieved value from source_info or a default value."""
        value = source_info.get(key, default)
        return value if isinstance(value, (str, list, dict)) else default

    def join_names(value):
        """Joins the items (or dict keys) of value with ', '; a plain string is returned as is."""
        if isinstance(value, str):
            # Joining a string would interleave ', ' between its characters
            return value
        return ", ".join(str(item) for item in value)

    # Extract and handle data safely
    language = safe_get("language")
    framework = safe_get("framework")
    dependencies = join_names(safe_get("dependencies", []))
    database = safe_get("database", "None")
    build_system = safe_get("build_system")
    config_files = join_names(safe_get("config_files", []))

    # Handle static files safely
    static_files_data = source_info.get("static_files", {})
    if not isinstance(static_files_data, dict):
        static_files_data = {}

    # Limit static file names to first 10 entries
    static_files_names = ", ".join(str(name) for name in list(static_files_data.keys())[:10])

    # Include microservice structure (fallback to empty dict if missing)
    project_structure = retrieved_data.get('project_structure', {})
    if isinstance(project_structure, list):
        paths = [item['path'] for item in project_structure]
        project_structure = paths[:100]  # Limit list to 100 elements

    # Extract Docker info safely
    docker_info = user_data.get('docker_info') or {}

    def safe_list_to_str(data, key=None):
        """Renders data (or data[key] when data is a dict) as newline-separated text."""
        if isinstance(data, dict):
            value = data.get(key, [])
        elif isinstance(data, list):
            value = data
        else:
            return ""

        if isinstance(value, list):
            # Handle nested lists (e.g., RUN commands)
            if all(isinstance(item, list) for item in value):
                return "\n".join("\n".join(str(part) for part in item) for item in value)
            # Handle dictionaries (e.g., execution commands)
            elif all(isinstance(item, dict) for item in value):
                return "\n".join(
                    f"{item['command']}: {', '.join(str(arg) for arg in item['arguments'])}"
                    for item in value
                )
            else:
                # str() so that non-string items (e.g. integer ports) do not break the join
                return "\n".join(str(item) for item in value)
        else:
            return str(value)

    # Shorten representative documents on a copy: the topic object usually
    # belongs to the shared knowledge base and must not be altered here.
    topic_for_prompt = retrieved_topic
    if retrieved_topic and "representative_documents" in retrieved_topic:
        topic_for_prompt = dict(retrieved_topic)
        topic_for_prompt["representative_documents"] = [
            doc[:100] + "..." if len(doc) > 100 else doc
            for doc in retrieved_topic.get("representative_documents", [])
        ]

    # Format the prompt
    return prompt_template.format(
        language=language,
        framework=framework,
        dependencies=dependencies,
        database=database,
        build_system=build_system,
        config_files=config_files,
        static_files=static_files_names,  # Only first 10 static files
        microservice_structure=project_structure,  # Match the placeholder name in the template
        base_image=docker_info.get('base_image', ''),
        multi_stage_build=docker_info.get('multi_stage_build', False),
        security=docker_info.get('security', ''),
        deployment_environment=docker_info.get('deployment_environment', ''),
        networking=docker_info.get('networking', ''),
        health_checks=safe_list_to_str(docker_info, 'health_checks'),
        exposed_ports=safe_list_to_str(docker_info, 'exposed_ports'),
        build_args=safe_list_to_str(docker_info, 'build_args'),
        copy_instructions=safe_list_to_str(docker_info, 'copy_instructions'),
        execution=safe_list_to_str(docker_info.get('execution', [])),  # Handle execution commands
        run_commands=safe_list_to_str(docker_info.get('run', [])),  # Handle RUN commands
        retrieved_topic=topic_for_prompt or "No additional information retrieved.",
        user=safe_list_to_str(docker_info, 'user'),
        envs=safe_list_to_str(docker_info, 'envs'),
        labels=safe_list_to_str(docker_info, 'labels'),
    )


In [2327]:
def generate_dockerfile(prompt):
    """
    Ask the chat model for a Dockerfile.

    Args:
        prompt (str): The fully formatted prompt, sent as a single user message.

    Returns:
        The message content of the first choice returned for model "gpt-4",
        requested through the module-level `client`.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model="gpt-4", messages=chat_messages)
    first_choice = completion.choices[0]
    return first_choice.message.content



In [2353]:
import re

def extract_dockerfile(text):
    """
    Extracts only the Dockerfile content from a mixed text input.

    The content is looked up in this order:
      1. A fenced block tagged "Dockerfile" in any letter case.
      2. Otherwise, the first fenced block (any or no tag) that contains a
         line starting with the FROM instruction.
      3. Otherwise the input is returned unchanged and a notice is printed.

    Args:
        text (str): The input text containing Dockerfile and extra notes.

    Returns:
        str: The extracted Dockerfile content, stripped of surrounding
        whitespace, or the original text when no Dockerfile block is found.
    """
    # Preferred form: a fence explicitly tagged as a Dockerfile (```Dockerfile, ```dockerfile, ...)
    tagged = re.search(r"```dockerfile[ \t]*\r?\n?(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if tagged:
        return tagged.group(1).strip()  # Extract only the Dockerfile content

    # Fallback: fences with another tag or none at all. Requiring an uppercase
    # FROM line avoids picking up shell or source-code snippets.
    for fenced in re.finditer(r"```[^\n`]*\n(.*?)```", text, re.DOTALL):
        candidate = fenced.group(1)
        if re.search(r"^\s*FROM\s", candidate, re.MULTILINE):
            return candidate.strip()

    print("No Dockerfile content found.")
    return text

In [2355]:
def save_dockerfile(content, file_path="Dockerfile"):
    """
    Write the Dockerfile found in a model response to disk.

    Args:
        content (str): Text that contains the Dockerfile; it is passed through
            extract_dockerfile before being written.
        file_path (str, optional): Destination path. Defaults to "Dockerfile".

    Nothing is returned. Empty content and write failures are reported with a
    printed message instead of an exception.
    """
    if not content:
        print(" No content to save.")
        return

    dockerfile_text = extract_dockerfile(content)
    try:
        with open(file_path, "w", encoding="utf-8") as target:
            # Exactly one trailing newline
            target.write(dockerfile_text.strip() + "\n")
        print(f" Dockerfile saved successfully: {file_path}")
    except Exception as e:
        print(f" Error saving Dockerfile: {e}")

## Generate Dockerfile

In [2405]:
# Main function to generate Dockerfiles for all projects and microservices
def generate_dockerfiles_for_projects(projects_root_dir, prompt_template_with_topic_path, prompt_template_without_topic_path, knowledge_base_path):
    """
    Generate two Dockerfiles for every prepared microservice folder.

    Every directory under projects_root_dir that contains both
    "extracted_data.json" and "docker_data.json" gets:
      - "Dockerfile_MiDKo_topic": generated from the template with topics,
        using the topic retrieved with BM25 for that microservice.
      - "Dockerfile_MiDKo": generated from the template without topics.
    Directories that already contain either output file are skipped.

    Args:
        projects_root_dir (str): Root directory that is walked recursively.
        prompt_template_with_topic_path (str): Path of the template used with a topic.
        prompt_template_without_topic_path (str): Path of the template used without a topic.
        knowledge_base_path (str): Path of the JSON file with the topics.
    """
    # Load both prompt templates
    template_with_topic = load_prompt_template(prompt_template_with_topic_path)
    template_plain = load_prompt_template(prompt_template_without_topic_path)

    # Load the topics and index their documents once for all folders
    topics_dict = load_topics_from_json(knowledge_base_path)
    bm25_index, bm25_topic_ids = build_bm25_index(topics_dict)

    for root, _dirs, _files in os.walk(projects_root_dir):
        topic_output = os.path.join(root, "Dockerfile_MiDKo_topic")
        plain_output = os.path.join(root, "Dockerfile_MiDKo")

        # Never regenerate a folder that already has one of the outputs
        if os.path.exists(topic_output) or os.path.exists(plain_output):
            print(f"Skipping {root}, Dockerfile already exists.")
            continue

        retrieved_data_path = os.path.join(root, "extracted_data.json")
        docker_data_path = os.path.join(root, "docker_data.json")
        print(f"******** {root}")

        # Only folders holding both input files are processed
        if not (os.path.exists(retrieved_data_path) and os.path.exists(docker_data_path)):
            continue

        retrieved_data = load_extracted_data(retrieved_data_path)
        docker_inputs = load_extracted_data(docker_data_path)

        # Generate query for sparse retrieval
        query = f"Create a Dockerfile for {retrieved_data['source_code_info']['language']} app using {retrieved_data['source_code_info']['framework']} and {retrieved_data['source_code_info']['build_system']}"
        matched_topic = sparse_retrieval(query, bm25_index, bm25_topic_ids, topics_dict)

        # Topic-aware variant first, then the plain one
        variants = (
            (template_with_topic, matched_topic, topic_output),
            (template_plain, None, plain_output),
        )
        for template, topic, output_path in variants:
            prompt = format_prompt(template, retrieved_data, docker_inputs, topic)
            save_dockerfile(generate_dockerfile(prompt), output_path)


In [None]:
# Root directory that is walked for folders holding extracted_data.json and docker_data.json.
projects_root_dir = "./inter"
# Writes Dockerfile_MiDKo_topic and Dockerfile_MiDKo into every such folder;
# each folder costs two chat-completion requests through `client`.
generate_dockerfiles_for_projects(projects_root_dir, prompt_template_with_topic_path, prompt_template_without_topic_path,knowledge_base_path)