In [5]:
import polars as pl
from rich.console import  Console
from rich.table import Table
# Shared rich console; the cells below print all styled text and tables through it.
console = Console()

In [None]:
# Parquet inputs for the analysis. Both paths are left blank here and must be
# filled in before this cell can run.
dataset_path = ""
packages_dataset_path = ""

# `df` is the frame every later cell analyses; `df_packages` is only needed by
# the test-passed analysis.
df = pl.read_parquet(dataset_path)
df_packages = pl.read_parquet(packages_dataset_path)

In [9]:
# Schema Inspection
# Lists every column of `df` with its dtype, then prints the frame's dimensions.

console.print("[bold]Schema Inspection:[/bold]", style="bold")

table = Table(title="DataFrame Schema", show_header=True, header_style="bold magenta")
for header, column_options in (
    ("Column Name", {"style": "cyan", "no_wrap": True}),
    ("Data Type", {"style": "green"}),
):
    table.add_column(header, **column_options)

# One table row per (column name, dtype) pair of the schema.
for column_name, column_dtype in df.schema.items():
    table.add_row(column_name, str(column_dtype))

console.print(table)

# Each row of `df` is reported as one source file.
n_rows, n_cols = df.shape
shape_info = (
    "\n"
    f"[bold]DataFrame Shape:[/bold] [cyan]({n_rows}, {n_cols})[/cyan]\n"
    f"[bold]Number of source files:[/bold] [green]{n_rows}[/green]\n"
    f"[bold]Number of columns:[/bold] [green]{n_cols}[/green]\n"
)

console.print(shape_info)

In [12]:
# Average Source Files per Package Analysis
#
# Groups the rows of `df` by `package_name`, prints summary statistics of the
# per-package row counts, and lists the ten packages with the most rows.

console.print("\n[bold][cyan]Source Files per Package Analysis:[/cyan][/bold]", style="bold")

# Count source files per package.
# group_by does not guarantee output order, so package_name is used as a
# secondary sort key: packages with equal counts are then ordered the same way
# on every run and the top-10 listing below is reproducible.
files_per_package = (df
    .group_by("package_name")
    .agg([
        pl.len().alias("file_count")
    ])
    .sort(["file_count", "package_name"], descending=[True, False])
)

# Calculate statistics
total_packages = files_per_package.height
total_files = df.height
# Guarded like the divisions in the test-passed summary so that an empty
# dataset prints a table instead of raising ZeroDivisionError.
avg_files_per_package = (total_files / total_packages) if total_packages > 0 else 0
file_counts = files_per_package.get_column("file_count")
min_files = file_counts.min()
max_files = file_counts.max()
median_files = file_counts.median()

summary_table = Table(title="Source Files per Package Summary", show_header=True, header_style="bold magenta")
summary_table.add_column("Metric", style="cyan", no_wrap=True)
summary_table.add_column("Value", style="green", justify="right")

# min/max/median are None when there are no packages; show "N/A" rather than
# failing on the float format spec.
summary_table.add_row("Total Packages", f"{total_packages}")
summary_table.add_row("Total Source Files", f"{total_files}")
summary_table.add_row("Average Files per Package", f"{avg_files_per_package:.2f}")
summary_table.add_row("Median Files per Package", f"{median_files:.1f}" if median_files is not None else "N/A")
summary_table.add_row("Min Files per Package", f"{min_files}" if min_files is not None else "N/A")
summary_table.add_row("Max Files per Package", f"{max_files}" if max_files is not None else "N/A")

console.print(summary_table)

# Show top 10 packages with most files
console.print("\n[bold][cyan]Top 10 Packages with Most Source Files:[/cyan][/bold]", style="bold")

top_packages = files_per_package.head(10)

top_table = Table(title="Packages with Most Source Files", show_header=True, header_style="bold magenta")
top_table.add_column("Package Name", style="cyan", no_wrap=False, width=30)
top_table.add_column("Number of Files", style="green", justify="right")

for package_name, file_count in top_packages.iter_rows():
    top_table.add_row(package_name, f"{file_count}")

console.print(top_table)

In [15]:
# Skipped Source Files Analysis
#
# Counts the rows of `df` that match the skip heuristics below (CMake compiler
# probe files, files whose parent directory is a test directory, files whose
# lower-cased name contains "test") and, separately, the rows whose
# `output_file` is null or empty.

console.print("\n[bold][cyan]Skipped Source Files Analysis:[/cyan][/bold]", style="bold")

cmake_files = ["CMakeCCompilerId.c", "CMakeCXXCompilerId.cpp", "CMakeCCompilerABI.c", "CMakeCXXCompilerABI.cpp"]
test_directories = ["tests", "test", "t", "testing", "unittest", "ctest", "check", "test-suite", "testsuite", "regression"]

# Split each path once and derive both helper columns from the same expression.
path_parts = pl.col("file_path").str.split("/")

df_with_analysis = df.with_columns([
    path_parts.list.last().alias("file_name"),
    # A path without any "/" has no parent directory. The explicit length check
    # yields null in that case instead of relying on how an out-of-range
    # negative slice offset is clamped.
    pl.when(path_parts.list.len() >= 2)
    .then(path_parts.list.slice(-2, 1).list.first())
    .alias("parent_directory")
])

# Identify skipped files.
# Any name containing "testing" also contains "test", so a single substring
# check covers both; literal=True makes it a plain substring match.
skipped_files = df_with_analysis.filter(
    pl.col("file_name").is_in(cmake_files) |
    pl.col("parent_directory").is_in(test_directories) |
    pl.col("file_name").str.to_lowercase().str.contains("test", literal=True)
)

# Count files without output
files_without_output = df.filter(
    (pl.col("output_file").is_null()) | (pl.col("output_file") == "")
)

total_files = df.height
skipped_count = skipped_files.height
no_output_count = files_without_output.height
# Guarded like the divisions in the test-passed summary so an empty dataset
# reports 0% instead of raising ZeroDivisionError.
skipped_percentage = (skipped_count / total_files * 100) if total_files > 0 else 0
no_output_percentage = (no_output_count / total_files * 100) if total_files > 0 else 0

# Summary table
skip_summary = Table(title="Source Files Skip Analysis Summary", show_header=True, header_style="bold magenta")
skip_summary.add_column("Category", style="cyan", no_wrap=True)
skip_summary.add_column("Count", style="green", justify="right")
skip_summary.add_column("Percentage", style="yellow", justify="right")

skip_summary.add_row("Total Source Files", f"{total_files}", "100.00%")
skip_summary.add_row("Files Skipped", f"{skipped_count}", f"{skipped_percentage:.2f}%")
skip_summary.add_row("Files without Output", f"{no_output_count}", f"{no_output_percentage:.2f}%")

console.print(skip_summary)

In [19]:
# Success Rate Analysis
#
# Derives one boolean "success" flag per pipeline stage for every row of `df`,
# then reports each stage's success rate twice: as the mean of the per-package
# rates and as the mean over all rows. `package_averages` (one row per package,
# `package_name` followed by the eight `<stage>_avg` columns in the order
# below) is reused by the later ranking cells.

console.print("\n[bold][cyan]Success Rate Analysis:[/cyan][/bold]", style="bold")

# Single source of truth for the eight stages:
#   key -> (label shown in the table, boolean expression meaning "stage succeeded").
# The key is the prefix of every derived column (<key>_success, <key>_avg,
# <key>_overall, <key>_pkg_avg); dict order fixes the column order.
stage_checks = {
    "src_functions": (
        "Source Function Extraction",
        pl.col("src_functions").list.len() > 0,
    ),
    "ir_functions": (
        "IR Function Extraction",
        pl.col("ir_functions").list.len() > 0,
    ),
    "random_function": (
        "Random Function Selection",
        pl.col("random_function").is_not_null(),
    ),
    "llvm_ir": (
        "LLVM IR Generation",
        (pl.col("IR_generation_return_code") == 0)
        & pl.col("LLVM_IR").is_not_null()
        & (pl.col("LLVM_IR") != ""),
    ),
    "random_function_ir": (
        "Random Function IR Generation",
        (pl.col("random_function_IR_generation_return_code") == 0)
        & pl.col("random_function_IR").is_not_null()
        & (pl.col("random_function_IR") != ""),
    ),
    "object_file": (
        "Object File Generation",
        pl.col("object_file_generation_return_code") == 0,
    ),
    "relinked_ir": (
        "Relinked LLVM IR Generation",
        pl.col("relinked_llvm_ir").is_not_null() & (pl.col("relinked_llvm_ir") != ""),
    ),
    "modified_object": (
        "Modified Object File Generation",
        pl.col("modified_object_file_generation_return_code") == 0,
    ),
}

# Calculate success flags for each metric.
# A comparison on a null input (missing list, missing return code) evaluates to
# null, and mean() skips nulls, which would silently remove those files from
# the denominator and inflate the rate. fill_null(False) counts a missing
# value as a failure so every rate really is taken over all files.
success_analysis = df.with_columns([
    check.fill_null(False).alias(f"{key}_success")
    for key, (_label, check) in stage_checks.items()
])

# Calculate per-package averages
package_averages = (success_analysis
    .group_by("package_name")
    .agg([
        pl.col(f"{key}_success").mean().alias(f"{key}_avg")
        for key in stage_checks
    ])
)

# Calculate overall averages (across all source files)
overall_averages = success_analysis.select([
    pl.col(f"{key}_success").mean().alias(f"{key}_overall")
    for key in stage_checks
])

# Calculate average per package (mean of package averages)
avg_per_package = package_averages.select([
    pl.col(f"{key}_avg").mean().alias(f"{key}_pkg_avg")
    for key in stage_checks
])

# Create the summary table
success_table = Table(title="Success Rate Analysis", show_header=True, header_style="bold magenta")
success_table.add_column("Metric", style="cyan", no_wrap=False, width=35)
success_table.add_column("Average per Package", style="green", justify="right")
success_table.add_column("Average over All Files", style="yellow", justify="right")

# Both frames hold a single row whose values follow the order of stage_checks.
pkg_avg = avg_per_package.row(0)
overall_avg = overall_averages.row(0)

metrics = [
    (label, pkg_rate, overall_rate)
    for (label, _check), pkg_rate, overall_rate in zip(stage_checks.values(), pkg_avg, overall_avg)
]

# A mean is None when there were no rows to average (empty dataset).
for metric_name, pkg_rate, overall_rate in metrics:
    success_table.add_row(
        metric_name,
        f"{pkg_rate:.2%}" if pkg_rate is not None else "N/A",
        f"{overall_rate:.2%}" if overall_rate is not None else "N/A"
    )

console.print(success_table)

In [22]:
# Top/Bottom Packages by Success Rate Analysis
#
# Ranks the packages in `package_averages` by the unweighted mean of their
# eight per-stage success rates and prints the five lowest and five highest.

console.print("\n[bold][cyan]Packages with Highest and Lowest Success Rates:[/cyan][/bold]", style="bold")


def _build_rate_table(packages: pl.DataFrame, title: str, color: str) -> Table:
    """Build a rich table with one row per package in `packages`.

    `packages` must have the package name in its first column followed by the
    eight per-stage rates and the overall rate, in that order. `color` styles
    the rate columns; the overall column uses the bold variant. Missing rates
    are rendered as "N/A".
    """
    rate_table = Table(title=title, show_header=True, header_style="bold magenta")
    rate_table.add_column("Package Name", style="cyan", no_wrap=False, width=25)
    for header, width in [
        ("Src Func", 8), ("IR Func", 8), ("Rand Func", 9), ("LLVM IR", 8),
        ("Rand IR", 8), ("Obj File", 8), ("Relink", 7), ("Mod Obj", 7),
    ]:
        rate_table.add_column(header, style=color, justify="right", width=width)
    rate_table.add_column("Overall", style=f"bold {color}", justify="right", width=8)

    for package_name, *rates in packages.iter_rows():
        rate_table.add_row(
            package_name,
            *(f"{rate:.1%}" if rate is not None else "N/A" for rate in rates)
        )
    return rate_table


# Calculate overall success rate per package (average across all metrics).
# package_name is a secondary sort key so packages with equal rates are ordered
# the same way on every run (group_by output order is not guaranteed).
package_overall_success = package_averages.with_columns([
    ((pl.col("src_functions_avg") + pl.col("ir_functions_avg") + pl.col("random_function_avg") + 
      pl.col("llvm_ir_avg") + pl.col("random_function_ir_avg") + pl.col("object_file_avg") + 
      pl.col("relinked_ir_avg") + pl.col("modified_object_avg")) / 8).alias("overall_success_rate")
]).sort(["overall_success_rate", "package_name"])

# A null in any stage average makes the overall rate null, and sort() places
# nulls first, so such packages would otherwise be shown as the "lowest".
# Packages with an unknown overall rate are left out of both rankings.
ranked_packages = package_overall_success.filter(pl.col("overall_success_rate").is_not_null())

# Get top 5 lowest and highest
lowest_packages = ranked_packages.head(5)
highest_packages = ranked_packages.sort(
    ["overall_success_rate", "package_name"], descending=[True, False]
).head(5)
# Original names, kept in case another cell refers to them (each holds five rows).
lowest_3, highest_3 = lowest_packages, highest_packages

# Table for lowest success rates
console.print("\n[bold][red]Top 5 Packages with Lowest Success Rates:[/red][/bold]", style="bold")

lowest_table = _build_rate_table(lowest_packages, "Packages with Lowest Success Rates", "red")
console.print(lowest_table)

# Table for highest success rates
console.print("\n[bold][green]Top 5 Packages with Highest Success Rates:[/green][/bold]", style="bold")

highest_table = _build_rate_table(highest_packages, "Packages with Highest Success Rates", "green")
console.print(highest_table)

In [31]:
# Test-Passed Packages Success Rate Analysis
#
# Restricts the per-package success rates in `package_averages` to packages
# whose `test_passed` value in `df_packages` equals 1, prints how much of the
# dataset those packages cover, and lists the ten with the best overall rate.

console.print("\n[bold][cyan]Success Rates for Test-Passed Packages:[/cyan][/bold]", style="bold")

# Filter packages where test_passed = 1
successful_test_packages = df_packages.filter(pl.col("test_passed") == 1)
successful_package_names = successful_test_packages.get_column("name").to_list()

# df_packages identifies a package by `name`; the source-file frames use `package_name`.
test_passed_package_averages = package_averages.filter(
    pl.col("package_name").is_in(successful_package_names)
)

source_files_from_successful_packages = df.filter(pl.col("package_name").is_in(successful_package_names))

# Summary statistics
total_packages = df_packages.height
test_passed_packages = successful_test_packages.height
test_passed_percentage = (test_passed_packages / total_packages * 100) if total_packages > 0 else 0

total_source_files = df.height
source_files_from_test_passed = source_files_from_successful_packages.height
source_files_percentage = (source_files_from_test_passed / total_source_files * 100) if total_source_files > 0 else 0

summary_stats = Table(title="Test Success Summary", show_header=True, header_style="bold magenta")
summary_stats.add_column("Metric", style="cyan", no_wrap=True)
summary_stats.add_column("Value", style="green", justify="right")

summary_stats.add_row("Total Packages", f"{total_packages}")
summary_stats.add_row("Packages with test_passed = 1", f"{test_passed_packages}")
summary_stats.add_row("Test Success Rate", f"{test_passed_percentage:.2f}%")
summary_stats.add_row("Total Source Files", f"{total_source_files}")
summary_stats.add_row("Source Files from Test-Passed Packages", f"{source_files_from_test_passed}")
summary_stats.add_row("Percentage of Source Files", f"{source_files_percentage:.2f}%")

console.print(summary_stats)

# Calculate overall success rate per test-passed package (average across all metrics).
# package_name breaks ties so that packages with equal rates (group_by output
# order is not guaranteed) are listed in the same order on every run.
test_passed_overall_success = test_passed_package_averages.with_columns([
    ((pl.col("src_functions_avg") + pl.col("ir_functions_avg") + pl.col("random_function_avg") + 
    pl.col("llvm_ir_avg") + pl.col("random_function_ir_avg") + pl.col("object_file_avg") + 
    pl.col("relinked_ir_avg") + pl.col("modified_object_avg")) / 8).alias("overall_success_rate")
]).sort(["overall_success_rate", "package_name"], descending=[True, False])

# Get top 10 test-passed packages.
# sort() places nulls first even when descending, so packages whose overall
# rate is unknown are removed before taking the head; otherwise they would
# lead the "Top 10".
top_10_test_passed = test_passed_overall_success.filter(
    pl.col("overall_success_rate").is_not_null()
).head(10)

# Table for top 10 test-passed packages
console.print("\n[bold][green]Top 10 Test-Passed Packages by Success Rate:[/green][/bold]", style="bold")

test_passed_table = Table(title="Top 10 Test-Passed Packages Success Rates", show_header=True, header_style="bold magenta")
test_passed_table.add_column("Package Name", style="cyan", no_wrap=False, width=25)
for header, width in [
    ("Src Func", 8), ("IR Func", 8), ("Rand Func", 9), ("LLVM IR", 8),
    ("Rand IR", 8), ("Obj File", 8), ("Relink", 7), ("Mod Obj", 7),
]:
    test_passed_table.add_column(header, style="green", justify="right", width=width)
test_passed_table.add_column("Overall", style="bold green", justify="right", width=8)

# Each row is: package name, the eight per-stage rates, then the overall rate.
for package_name, *rates in top_10_test_passed.iter_rows():
    test_passed_table.add_row(
        package_name,
        *(f"{rate:.1%}" if rate is not None else "N/A" for rate in rates)
    )

console.print(test_passed_table)