In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Read the annotated reproducibility dataset from the Excel workbook.
file_path = "New Annotated Dataset.xlsx"
sheet_index = 2  # zero-based sheet position, i.e. the third sheet of the workbook

df = pd.read_excel(io=file_path, sheet_name=sheet_index)

In [3]:
# Column names shared by the cells below.

# The seven reproducibility criteria; this order is also the order in which
# scenario weights are matched to criteria later in the notebook.
factors = [
    'Data availability',
    'Inclusion of a ReadMe',
    'Trained model inclusion',
    'Hyperparameter description',
    'Training description',
    'Paper readability',
    'Code availability',
]

# Column holding the annotated (binary) reproducibility factor.
annotated_col = 'The reproducibility factor from the dataset by Olszewski et al.'

Evaluation of the Created Quantitative Reproducibility Measure for Machine Learning Research Papers:

Normalization of reproducibility criteria scores:

In [None]:
# Min-max normalisation of every reproducibility criterion to the range [0, 1].
# The work is done on a copy so that the originally loaded frame `df` stays untouched.
df_norm = df.copy()

for col in factors:
    min_val = df_norm[col].min()  # smallest observed score for this criterion
    max_val = df_norm[col].max()  # largest observed score for this criterion
    value_range = max_val - min_val

    if value_range == 0:
        # Constant column: (x - min) / (max - min) would be 0 / 0 and turn the
        # whole column into NaN. Map it to 0.0 instead; subtracting the minimum
        # (rather than assigning a literal 0) keeps missing values missing.
        df_norm[col] = (df_norm[col] - min_val).astype(float)
    else:
        df_norm[col] = (df_norm[col] - min_val) / value_range

Calculating the reproducibility score with WSM:

In [8]:
# Weighted Sum Model (WSM) score per paper. No criterion weights are selected at
# this stage, so the weighted sum collapses to the plain row-wise mean of the
# normalised criteria.
df_norm['WSM_score'] = df_norm.loc[:, factors].mean(axis='columns')

Applying a threshold to classify each research paper as 0 or 1. Because the reproducibility scores derived with WSM lie in the interval [0, 1], a threshold of 0.5 was selected: a paper is classified as reproducible (value = 1) when its score is strictly greater than 0.5, and as not reproducible (value = 0) otherwise. This binarisation was necessary because the reproducibility factor in the dataset by D. Olszewski et al. is provided as a binary variable containing only the values 0 and 1.

In [None]:
# Binary label derived from the WSM score: 1 when the score is strictly above
# 0.5, otherwise 0.
# Author's note: the 0.5 cut-off comes from a manual sensitivity analysis that
# raised the threshold from 0 to 1 in steps of 0.1; 0.5 gave the best combination
# of accuracy, precision and recall.
df_norm['WSM_reproducible'] = df_norm['WSM_score'].gt(0.5).astype(int)

Comparing the reproducibility score from WSM with the reproducibility factor from the dataset by D. Olszewski et al.:

In [12]:
# Keep the annotated reproducibility factor under a short column name so the
# later cells can refer to it directly.
df_norm['Annotated_reproducible'] = df_norm.loc[:, annotated_col]

# True where the WSM-derived label agrees with the annotated label.
df_norm['Match'] = df_norm['WSM_reproducible'].eq(df_norm['Annotated_reproducible'])

In [13]:
# Accuracy of the WSM-derived labels against the annotated labels.
total_papers = df_norm.shape[0]    # number of papers (rows)
correct = df_norm['Match'].sum()   # papers where both labels agree
accuracy = correct / total_papers

# Report the three figures, one per line.
for label, value in (
    ("Total papers:", total_papers),
    ("Correct predictions:", correct),
    ("Accuracy:", round(accuracy, 4)),
):
    print(label, value)

Total papers: 139
Correct predictions: 89
Accuracy: 0.6403


In [14]:
# Cross-tabulate annotated labels (rows) against WSM-derived labels (columns)
# to see how each class of paper was classified.
confusion_matrix = pd.crosstab(
    index=df_norm['Annotated_reproducible'],
    columns=df_norm['WSM_reproducible'],
    rownames=['Annotated'],
    colnames=['WSM'],
)

# Heading followed by the table.
print("\nConfusion Matrix:", confusion_matrix, sep="\n")


Confusion Matrix:
WSM         0   1
Annotated        
0          63  39
1          11  26


In [15]:
# Precision, recall and F1 of the WSM-derived labels, with the annotated
# reproducibility factor as ground truth.
y_true = df_norm['Annotated_reproducible']  # annotated labels
y_pred = df_norm['WSM_reproducible']        # labels derived from the WSM score

precision, recall, f1 = (
    metric(y_true, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

# Report each metric rounded to four decimals.
for label, value in (("Precision:", precision), ("Recall:", recall), ("F1-score:", f1)):
    print(label, round(value, 4))

Precision: 0.4
Recall: 0.7027
F1-score: 0.5098


Example Analysis of the Quantitative Reproducibility Measure for Machine Learning Research Papers:

Criteria evaluation with Saaty's importance scale for three different scenarios:

In [18]:
# Importance scores per weighting scenario, one score per reproducibility
# criterion (seven in total). Each vector is stored as a float array.
scenario_scores = {
    scenario: np.array(raw_scores, dtype=float)
    for scenario, raw_scores in {
        "Scenario_1_equal": [1, 1, 1, 1, 1, 1, 1],       # equal weights
        "Scenario_2_data_code": [7, 3, 3, 3, 3, 3, 7],   # data and code focused
        "Scenario_3_read_train": [3, 3, 1, 4, 7, 7, 3],  # readability and training focused
    }.items()
}

# Random Index used in the consistency ratio; 1.32 is the value for n = 7
# (one row/column per criterion).
RI = 1.32

Functions to build a pairwise comparison matrix from a score vector and to evaluate the consistency ratio of the resulting weights:

In [20]:
def scores_to_weights(scores):
    """Normalise a vector of raw importance scores into weights that sum to 1.

    Parameters
    ----------
    scores : numpy.ndarray or sequence of numbers
        Raw importance scores, one per criterion. Plain lists and tuples are
        converted to a float array first; any other object is used as passed.

    Returns
    -------
    Each score divided by the sum of all scores (a numpy.ndarray when an
    array, list or tuple is passed).

    Raises
    ------
    ValueError
        If the scores sum to zero, in which case the weights are undefined.
    """
    if isinstance(scores, (list, tuple)):
        # Plain sequences have no .sum() and do not support element-wise division.
        scores = np.asarray(scores, dtype=float)

    total = scores.sum()
    if total == 0:
        # Dividing by zero would silently yield NaN/inf weights.
        raise ValueError("scores must not sum to zero; weights would be undefined")
    return scores / total

def build_pairwise_matrix(scores):
    """Build a pairwise comparison matrix from a vector of importance scores.

    Element ``A[i, j]`` of the result equals ``scores[i] / scores[j]``.

    Parameters
    ----------
    scores : array-like of numbers, one-dimensional
        Importance scores, one per criterion. Any array-like (including a
        plain list) is accepted and converted to a float array.

    Returns
    -------
    numpy.ndarray
        Square matrix of shape ``(n, n)`` where ``n == len(scores)``.

    Raises
    ------
    ValueError
        If ``scores`` is not one-dimensional or contains a zero, which would
        make some ratios infinite or undefined.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.ndim != 1:
        raise ValueError("scores must be a one-dimensional vector")
    if np.any(scores == 0):
        raise ValueError("scores must be non-zero to form pairwise ratios")
    # Broadcasting a column vector against a row vector yields every ratio i/j.
    return scores[:, None] / scores[None, :]

def ahp_weights(A):
    """Derive AHP priority weights from a pairwise comparison matrix.

    Every column of ``A`` is divided by its own column sum, and the weight of
    criterion ``i`` is the mean of row ``i`` of that column-normalised matrix.

    Parameters
    ----------
    A : pairwise comparison matrix (square array).

    Returns
    -------
    One weight per row of ``A``.
    """
    column_totals = A.sum(axis=0)
    column_normalised = A / column_totals
    return column_normalised.mean(axis=1)

def consistency_ratio(A, weights, RI):
    """Compute consistency metrics for an AHP pairwise comparison matrix.

    Parameters
    ----------
    A : square pairwise comparison matrix of size ``n x n``.
    weights : weight vector derived from ``A`` (length ``n``).
    RI : Random Index for matrices of size ``n``.

    Returns
    -------
    tuple
        ``(lambda_max, CI, CR)`` where ``lambda_max`` is the principal
        eigenvalue estimated as the mean of ``(A @ weights) / weights``,
        ``CI = (lambda_max - n) / (n - 1)`` is the consistency index and
        ``CR = CI / RI`` is the consistency ratio.

    Raises
    ------
    ValueError
        If ``n > 2`` and ``RI`` is not positive, since CR would be undefined.
    """
    n = A.shape[0]
    lambda_max = np.mean((A @ weights) / weights)  # estimate of the maximum eigenvalue

    if n <= 2:
        # For n = 1 the CI denominator (n - 1) is zero, and the Random Index of
        # 1x1 and 2x2 matrices is 0, so CI / RI would be a division by zero.
        # Such small matrices are reported as perfectly consistent.
        return lambda_max, 0.0, 0.0

    if RI <= 0:
        raise ValueError("RI must be positive for matrices larger than 2x2")

    CI = (lambda_max - n) / (n - 1)  # consistency index
    CR = CI / RI                     # consistency ratio
    return lambda_max, CI, CR


Consistency index and consistency ratio for each weight scenario:

In [22]:
# Consistency check per scenario: build the pairwise matrix from the score
# vector, derive the AHP weights, then compute lambda_max, CI and CR.
# Each matrix is formed from the ratios of a single score vector, so it is
# consistent by construction; the table below is expected to show CI = CR = 0.
results = []
for name, scores in scenario_scores.items():
    A = build_pairwise_matrix(scores)                 # pairwise comparison matrix
    w = ahp_weights(A)                                # priority weights
    lambda_max, CI, CR = consistency_ratio(A, w, RI)  # consistency metrics
    results.append([name, lambda_max, CI, CR])

# One row per scenario for easier inspection.
cr_df = pd.DataFrame(data=results, columns=["Scenario", "lambda_max", "CI", "CR"])

print(cr_df)

                Scenario  lambda_max   CI   CR
0       Scenario_1_equal         7.0  0.0  0.0
1   Scenario_2_data_code         7.0  0.0  0.0
2  Scenario_3_read_train         7.0  0.0  0.0


Normalizing weights for each weight scenario:

In [24]:
# Normalised weights (summing to 1) for every scenario, keyed by scenario name.
weights = dict(zip(scenario_scores, map(scores_to_weights, scenario_scores.values())))

# Same weights as a table: one row per criterion, one column per scenario.
# Used by the following cells.
weights_df = pd.DataFrame(weights, index=factors)

# Show the weights and confirm that every scenario's weights add up to 1.
print("\nScenario weights:", weights_df, sep="\n")
print("\nColumn sums:", weights_df.sum(axis='index'), sep="\n")


Scenario weights:
                            Scenario_1_equal  Scenario_2_data_code  \
Data availability                   0.142857              0.241379   
Inclusion of a ReadMe               0.142857              0.103448   
Trained model inclusion             0.142857              0.103448   
Hyperparameter description          0.142857              0.103448   
Training description                0.142857              0.103448   
Paper readability                   0.142857              0.103448   
Code availability                   0.142857              0.241379   

                            Scenario_3_read_train  
Data availability                        0.107143  
Inclusion of a ReadMe                    0.107143  
Trained model inclusion                  0.035714  
Hyperparameter description               0.142857  
Training description                     0.250000  
Paper readability                        0.250000  
Code availability                        0.107143  

Col

Computing WSM scores for different scenarios:

In [26]:
# WSM score per paper and scenario:
#   WSM = sum over criteria of (criterion value × corresponding weight).
# The criteria columns do not change inside the loop, so they are extracted
# once; `@` multiplies the (papers x criteria) matrix by the weight vector.
criteria_values = df_norm[factors].to_numpy()

for name, w in weights.items():
    df_norm[f"WSM_{name}"] = criteria_values @ w

Randomly sample 10 reproducible papers (annotated reproducibility factor = 1) from the new annotated dataset:

In [28]:
# Draw a fixed-seed random sample of 10 papers from the new annotated dataset.
# Only papers marked as reproducible (factor = 1) by the reproducibility factor
# of D. Olszewski et al. are eligible, because this subset allows for observing
# a wider range of variability in reproducibility scores under the different
# weight assignment scenarios.
sample_df = (
    df_norm.loc[df_norm[annotated_col].eq(1)]
    .sample(n=10, random_state=42)
    .reset_index(drop=True)
)

# Persist the sample as a CSV file.
sample_df.to_csv("10_annotated_papers.csv", index=False)

# Show paper number and title next to the WSM score of every weight scenario.
cols = ["Paper number", "Paper Title", *(f"WSM_{k}" for k in weights)]
print(sample_df[cols])

   Paper number                                        Paper Title  \
0           271  WTAGRAPH: Web Tracking and Advertising Detecti...   
1           461  DeepSteal: Advanced Model Extractions Leveragi...   
2           269  Model Stealing Attacks Against Inductive Graph...   
3           613  Phishpedia: A Hybrid Deep Learning Based Appro...   
4           552  Cheetah: Lean and Fast Secure Two-Party Deep N...   
5           459  DEEPCASE: Semi-Supervised Contextual Analysis ...   
6           556  Khaleesi: Breaker of Advertising and Tracking ...   
7           257  AI/ML for Network Security: The Emperor has no...   
8           740  Improving Password Guessing via Representation...   
9            80  Privacy Risks of Securing Machine Learning Mod...   

   WSM_Scenario_1_equal  WSM_Scenario_2_data_code  WSM_Scenario_3_read_train  
0              0.285714                  0.206897                   0.446429  
1              0.428571                  0.448276                   0.5