In [1]:
import numpy as np

# Assume these are the commit sizes for two commits where the file is present
commit_sizes_present = np.array([10, 15])

# Number of bootstrap samples
n_bootstrap_samples = 1000

# Bootstrap sampling
bootstrap_means = np.random.choice(commit_sizes_present, (n_bootstrap_samples, len(commit_sizes_present)), replace=True).mean(axis=1)

# Calculate confidence intervals
conf_interval = np.percentile(bootstrap_means, [2.5, 97.5])

print(f"Bootstrap mean: {np.mean(bootstrap_means)}")
print(f"95% confidence interval: {conf_interval}")

Bootstrap mean: 12.45
95% confidence interval: [10. 15.]


In [5]:
from statsmodels.stats.power import TTestIndPower

# Define parameters
effect_size = 0.5  # Cohen's d, a medium effect size
alpha = 0.05
power = 0.8

# Calculate sample size
analysis = TTestIndPower()
sample_size = analysis.solve_power(effect_size=effect_size, alpha=alpha, power=power, alternative='two-sided')

print(f"Required sample size per group: {sample_size}")

Required sample size per group: 63.765611775409695


In [8]:
import numpy as np
import statsmodels.stats.power as smp

# Parameters
effect_size = 0.7  # Example medium effect size
alpha = 0.05
power = 0.8

# Calculate the sample size for t-test (as an approximation)
analysis = smp.TTestIndPower()
sample_size_ttest = analysis.solve_power(effect_size=effect_size, alpha=alpha, power=power, alternative='two-sided')

print(f"Approximate required sample size per group (t-test): {sample_size_ttest}")

# Convert t-test sample size to Mann-Whitney U test sample size using a correction factor
# According to some literature, the sample size for Mann-Whitney U can be about 1.15 times that of a t-test for similar power
sample_size_mannwhitney = sample_size_ttest * 1.15

print(f"Approximate required sample size per group (Mann-Whitney U test): {sample_size_mannwhitney}")


Approximate required sample size per group (t-test): 33.0245695315096
Approximate required sample size per group (Mann-Whitney U test): 37.97825496123603


In [15]:
from scipy.stats import mannwhitneyu
import numpy as np

# Function to perform simulation
def power_simulation(effect_size, n1, n2, alpha, num_simulations=1000):
    power_count = 0
    for _ in range(num_simulations):
        group1 = np.random.normal(0, 1, n1)
        group2 = np.random.normal(effect_size, 1, n2)
        stat, p_value = mannwhitneyu(group1, group2, alternative='two-sided')
        if p_value < alpha:
            power_count += 1
    return power_count / num_simulations

# Initial parameters
effect_size = 0.8
alpha = 0.05
power_target = 0.8
num_simulations = 5000

# Iteratively find the sample size
sample_size = 1
achieved_power = 0
while achieved_power < power_target:
    achieved_power = power_simulation(effect_size, sample_size, sample_size, alpha, num_simulations)
    sample_size += 1

print(f"Required sample size per group: {sample_size - 1}")


Required sample size per group: 28


In [16]:
import numpy as np
from scipy.stats import ttest_1samp

# Given sets
R = {"commit1", "commit2", "commit3", "commit4"}
F = {"file1", "file2", "file3", "file4", "file5"}
C = {("commit1", "file1"), ("commit1", "file2"), ("commit2", "file1"), 
     ("commit2", "file3"), ("commit3", "file2"), ("commit3", "file3"), 
     ("commit3", "file4"), ("commit4", "file5")}

# Step 1: Calculate commit sizes
commit_sizes = {r: 0 for r in R}
for (r, f) in C:
    commit_sizes[r] += 1

# Step 2: Collect commit sizes for each file
file_commit_sizes = {f: [] for f in F}
for (r, f) in C:
    file_commit_sizes[f].append(commit_sizes[r])

# Step 3: Calculate average commit size for each file
file_avg_commit_size = {f: np.mean(sizes) if sizes else 0 for f, sizes in file_commit_sizes.items()}

# Step 4: Rank files based on average commit size
file_ranks = sorted(file_avg_commit_size.items(), key=lambda x: x[1], reverse=True)

# Step 5: Perform hypothesis testing to assign confidence scores
all_commit_sizes = list(commit_sizes.values())
file_confidences = {}
alpha = 0.05  # Significance level

for f, sizes in file_commit_sizes.items():
    if sizes:
        t_stat, p_value = ttest_1samp(sizes, np.mean(all_commit_sizes))
        file_confidences[f] = p_value
    else:
        file_confidences[f] = 1  # High p-value for files with no changes

# Step 6: Determine threshold for significance
threshold_confidence = alpha
threshold_commit_size = np.mean(all_commit_sizes) + np.std(all_commit_sizes)

# Filter files by confidence threshold
significant_files = [f for f, p_value in file_confidences.items() if p_value < threshold_confidence]

# Display the results
print("File Rankings (sorted by rank):")
for rank, (file, avg_size) in enumerate(file_ranks, 1):
    print(f"{rank}. {file} - Average Commit Size: {avg_size}, Confidence: {file_confidences[file]:.4f}")

print(f"\nThreshold (Confidence): {threshold_confidence}")
print(f"Threshold (Commit Size): {threshold_commit_size:.2f}")
print("\nSignificant Files (based on confidence threshold):", significant_files)


File Rankings (sorted by rank):
1. file4 - Average Commit Size: 3.0, Confidence: nan
2. file3 - Average Commit Size: 2.5, Confidence: 0.5000
3. file2 - Average Commit Size: 2.5, Confidence: 0.5000
4. file1 - Average Commit Size: 2.0, Confidence: nan
5. file5 - Average Commit Size: 1.0, Confidence: nan

Threshold (Confidence): 0.05
Threshold (Commit Size): 2.71

Significant Files (based on confidence threshold): []


  res = hypotest_fun_out(*samples, **kwds)
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
