# Syntactic Correctness for Dockerfiles using Hadolint 

## Written by devs

In [3]:
import os
import subprocess
import json
from collections import defaultdict, Counter
import pandas as pd

In [5]:
# Location of the hadolint executable invoked by run_hadolint: /opt/homebrew/bin/hadolint

In [7]:
def run_hadolint(dockerfile_path, hadolint_bin="/opt/homebrew/bin/hadolint"):
    """Run hadolint on a Dockerfile and return its parsed JSON report.

    Args:
        dockerfile_path: Path to the Dockerfile. It is resolved to an
            absolute path before being handed to hadolint.
        hadolint_bin: Executable to invoke. Defaults to the Homebrew
            location this notebook was written against; pass another path
            (or just ``"hadolint"`` to rely on PATH) on other machines.

    Returns:
        The decoded JSON document printed on stdout, or ``None`` when the
        file does not exist, the executable cannot be started, nothing is
        printed on stdout, or stdout is not valid JSON. Each failure is
        reported with ``print``.
    """
    dockerfile_path = os.path.abspath(dockerfile_path)

    if not os.path.exists(dockerfile_path):
        print(f"Error: File not found: {dockerfile_path}")
        return None

    try:
        result = subprocess.run(
            [hadolint_bin, dockerfile_path, "--format", "json"],
            capture_output=True,
            text=True,
            encoding="utf-8",
            # A non-zero exit code alone is not treated as a failure: the
            # report on stdout is still parsed below.
            check=False,
        )
    except (OSError, ValueError) as e:
        # OSError: executable missing / not runnable.
        # ValueError: includes UnicodeDecodeError for non-UTF-8 output.
        print(f"Unexpected error while running hadolint on {dockerfile_path}: {e}")
        return None

    report = result.stdout.strip()
    if not report:
        # No report at all means the tool itself failed, whatever the exit
        # code was; show both streams to help diagnose it.
        print(f"Error: Hadolint failed to run on {dockerfile_path}.")
        print(f"STDERR: {result.stderr}")
        print(f"STDOUT: {result.stdout}")
        return None

    try:
        return json.loads(report)
    except json.JSONDecodeError as e:
        print(f"Unexpected error while running hadolint on {dockerfile_path}: {e}")
        return None

In [71]:
import os
import pandas as pd
from collections import defaultdict, Counter

def analyze_dockerfiles(root_dir):
    """Lint every Dockerfile under ``root_dir`` and print per-type statistics.

    Dockerfiles are discovered with ``find_dockerfiles``, classified with
    ``identify_dockerfile_type`` and linted with ``run_hadolint``. Only
    issues whose level is ``"error"`` or ``"warning"`` are counted; any
    other level is ignored.

    Args:
        root_dir: Directory that is searched for Dockerfiles.

    Returns:
        ``None`` when no Dockerfiles are found. Otherwise a tuple
        ``(df_results, file_types)``:

        * ``df_results`` - DataFrame with one row per successfully linted
          file and the columns ``file``, ``errors``, ``warnings``, ``type``.
        * ``file_types`` - dict keyed by Dockerfile type holding the
          aggregated counters (``files``, ``errors``, ``warnings``,
          ``error_counter``, ``warning_counter``, ``files_without_errors``,
          ``dl3023_projects``, ``failed``).
    """
    dockerfiles = find_dockerfiles(root_dir)
    if not dockerfiles:
        print("No Dockerfiles found.")
        return None

    # Per-type accumulators, created lazily the first time a type is seen.
    file_types = defaultdict(lambda: {
        "files": 0,
        "errors": 0,
        "warnings": 0,
        "error_counter": Counter(),
        "warning_counter": Counter(),
        "files_without_errors": 0,  # Files with zero errors (warnings are okay)
        "dl3023_projects": set(),  # Directories containing a DL3023 error
        "failed": 0,  # Files for which hadolint produced no usable report
    })
    total_files = 0

    results_list = []

    for dockerfile in dockerfiles:
        file_type = identify_dockerfile_type(dockerfile)
        type_stats = file_types[file_type]
        type_stats["files"] += 1
        total_files += 1

        results = run_hadolint(dockerfile)
        if results is None:
            # Keep failed runs visible: otherwise they are indistinguishable
            # from files that do have errors in the printed summary.
            type_stats["failed"] += 1
            continue

        project_dir = os.path.dirname(dockerfile)

        # Single pass: per-file totals and per-code frequencies together.
        error_count = 0
        warning_count = 0
        for issue in results:
            level = issue["level"]
            if level == "error":
                error_count += 1
                type_stats["error_counter"][issue["code"]] += 1
                if issue["code"] == "DL3023":
                    type_stats["dl3023_projects"].add(project_dir)
            elif level == "warning":
                warning_count += 1
                type_stats["warning_counter"][issue["code"]] += 1

        type_stats["errors"] += error_count
        type_stats["warnings"] += warning_count

        # Track files without errors (warnings are acceptable)
        if error_count == 0:
            type_stats["files_without_errors"] += 1

        results_list.append({
            "file": dockerfile,
            "errors": error_count,
            "warnings": warning_count,
            "type": file_type,
        })

    # Convert results to DataFrame
    df_results = pd.DataFrame(results_list)

    # Print summary. Note that the total counts every discovered file,
    # including those whose hadolint run failed.
    print(f"Total Dockerfiles analyzed: {total_files}")
    for file_type, stats in file_types.items():
        print(f"\nStatistics for {file_type}:")
        print(f"  Files analyzed: {stats['files']}")
        if stats["failed"]:
            print(f"  Files that could not be analyzed: {stats['failed']}")
        print(f"  Files without any errors: {stats['files_without_errors']}")
        print(f"  Total errors: {stats['errors']}")
        print(f"  Total warnings: {stats['warnings']}")
        print(f"  Projects with dl3023 errors: {len(stats['dl3023_projects'])}")

        print("  Most common errors:")
        for code, count in stats["error_counter"].most_common(5):
            print(f"    {code}: {count} occurrences")
        print("  Most common warnings:")
        for code, count in stats["warning_counter"].most_common(5):
            print(f"    {code}: {count} occurrences")

    return df_results, file_types

def find_dockerfiles(root_dir):
    """Collect the paths of all files under ``root_dir`` named 'Dockerfile*'.

    The tree is walked with ``os.walk``; a file qualifies when its name
    begins with the literal prefix ``Dockerfile``. Paths are returned in
    traversal order, each joined onto the directory it was found in.
    """
    matches = []
    for current_dir, _subdirs, names in os.walk(root_dir):
        matches.extend(
            os.path.join(current_dir, name)
            for name in names
            if name.startswith("Dockerfile")
        )
    return matches

def identify_dockerfile_type(filename):
    """Classify a Dockerfile by its file name.

    Only the final path component is inspected, so a parent directory that
    happens to contain ``MiDKo`` does not change the classification of a
    plain ``Dockerfile`` stored inside it.

    Args:
        filename: File name or full path of the Dockerfile.

    Returns:
        ``"Dockerfile_MiDKo_topic"`` if the name contains ``MiDKo_topic``,
        ``"Dockerfile_MiDKo"`` if it contains ``MiDKo``, otherwise
        ``"Dockerfile"``.
    """
    base_name = os.path.basename(filename)
    # Test the longer marker first: "MiDKo" is a substring of "MiDKo_topic".
    if "MiDKo_topic" in base_name:
        return "Dockerfile_MiDKo_topic"
    if "MiDKo" in base_name:
        return "Dockerfile_MiDKo"
    return "Dockerfile"


In [79]:
# Directory that is searched for Dockerfiles; the relative path is resolved
# against the current working directory of the notebook/kernel.
root_directory = "../projects_with_docker"

# Run the analysis: the per-type summary is printed, and because the call is
# the last expression of the cell its return value is shown as the cell output.
analyze_dockerfiles(root_directory)

Total Dockerfiles analyzed: 4829

Statistics for Dockerfile_MiDKo:
  Files analyzed: 1606
  Files without any errors: 1459
  Total errors: 153
  Projects with dl3023 errors: 2
  Most common errors:
    DL1000: 72 occurrences
    SC1072: 22 occurrences
    DL3044: 17 occurrences
    SC1089: 9 occurrences
    DL3021: 8 occurrences
    DL3045: 328 occurrences
    DL3008: 305 occurrences
    DL3018: 227 occurrences
    DL3022: 219 occurrences
    DL3042: 170 occurrences

Statistics for Dockerfile_MiDKo_topic:
  Files analyzed: 1607
  Files without any errors: 1448
  Total errors: 173
  Projects with dl3023 errors: 0
  Most common errors:
    DL1000: 64 occurrences
    SC1072: 35 occurrences
    DL3000: 27 occurrences
    DL3044: 11 occurrences
    DL3004: 9 occurrences
    DL3008: 331 occurrences
    DL3045: 323 occurrences
    DL3018: 319 occurrences
    DL3042: 179 occurrences
    DL4006: 130 occurrences

Statistics for Dockerfile:
  Files analyzed: 1616
  Files without any errors: 1203


 0     ../projects_with_docker/TodoMVC-DDD-CQRS-Event...       0         0   
 1     ../projects_with_docker/TodoMVC-DDD-CQRS-Event...       0         0   
 2     ../projects_with_docker/TodoMVC-DDD-CQRS-Event...       0         0   
 3     ../projects_with_docker/TodoMVC-DDD-CQRS-Event...       0         0   
 4     ../projects_with_docker/TodoMVC-DDD-CQRS-Event...       0         0   
 ...                                                 ...     ...       ...   
 4824  ../projects_with_docker/coding-earth/web/Docke...       0         0   
 4825  ../projects_with_docker/coding-earth/web/Docke...       0         0   
 4826  ../projects_with_docker/kitsu-tools/router/Doc...       0         1   
 4827  ../projects_with_docker/kitsu-tools/router/Doc...       0         1   
 4828  ../projects_with_docker/kitsu-tools/router/Doc...       0         1   
 
                         type  
 0           Dockerfile_MiDKo  
 1     Dockerfile_MiDKo_topic  
 2                 Dockerfile  
 3          