# Code used to generate tables about running R scripts without RaaS

In [1]:
import os
import sqlite3

import pandas as pd

## Data Collection

The collected data from the evaluation where we tried to execute R scripts without RaaS is stored in the database called 'results.db'

In [2]:
con = sqlite3.connect("../data/results.db")

errors_df = pd.read_sql_query("SELECT * FROM results", con) 

## Functions used during analysis 

In [3]:
# This function is used to take the processed filename-friendly version of
# a dataset's DOI and return the original DOI
def get_doi_from_filename(filename):
    filename = filename[2:len(filename)]
    doi = os.path.split(filename)[0]
    doi = doi.replace("-", ":", 1)
    doi = doi.replace("-", "/")
    return(doi)

# This function categorizes error messages by searching for the most unique and common phrases in different types of R error messages
def determine_error_cause(error_msg):
    ret_val = "other"
    
    if("Error in setwd" in error_msg):
        ret_val = "working directory"
    elif("Error in library" in error_msg):
        ret_val = "library"
    elif("Error in file" in error_msg):
        ret_val = "missing file"
    elif("unable to open" in error_msg):
        ret_val = "missing file"
    elif("Error in readChar" in error_msg):
        ret_val = "missing file"
    elif("could not find function" in error_msg):
        ret_val = "function"
    elif("there is no package called" in error_msg):
        ret_val = "library"
    elif("cannot open the connection" in error_msg):
        ret_val = "missing file"
        
    return(ret_val)

# This function is used when viewing the results of this analysis in the notebook
def print_error_breakdown(error_type, num_of_errors):
    print(error_type + " errors: " + str(error_breakdown[error_type]) + ", or " + str((error_breakdown[error_type] / num_of_errors) * 100) + "% of total errors")

## Analysis 

This cell analyzes each error stored in the database and uses the previously defined determine_error_cause function to categorize each as either library, working directory, missing file, missing function, or other. It also calculates how many datasets were "clean," meaning all the scripts executed within it ran with no errors. 

In [4]:
total_error_results ={"Errors" : 0, "No Errors": 0, "Clean Datasets": 0}
error_breakdown = {"library": 0, "working directory": 0, "missing file": 0, "function":0, "other": 0}

last_doi = None
current_doi_clean = True
for index, row in errors_df.iterrows():
    doi = get_doi_from_filename(row["filename"])
    if(doi != last_doi):
        if(current_doi_clean == True):
            total_error_results["Clean Datasets"] += 1
        current_doi_clean = True
    if(row["error"] == "success"):
        total_error_results["No Errors"] += 1
    else:
        total_error_results["Errors"] += 1
        current_doi_clean = False
        error_breakdown[determine_error_cause(row["error"])] += 1
        
    last_doi = doi

# View Results

This cell prints the results of the evaluation performed checking for errors without RaaS.

The publication presents this information in a table generated below.

In [5]:
total_num_of_scripts = total_error_results["Errors"] + total_error_results["No Errors"]
print("Out of " + str(total_num_of_scripts) + " total scripts")
print("Scripts without errors: " + str(total_error_results['No Errors']) + ", or " + str((total_error_results['No Errors'] / total_num_of_scripts) * 100) + "% of total scripts")
print("Scripts with errors: " + str(total_error_results['Errors']) + ", or " + str((total_error_results['Errors'] / total_num_of_scripts) * 100) + "% of total scripts")
print("Number of clean datasets: " + str(total_error_results["Clean Datasets"]) + " out of " + str(len(errors_df.index)))
print("##############################################################\n\n")
print("Error Breakdown: ")
print_error_breakdown("library", total_error_results['Errors'])
print_error_breakdown("working directory", total_error_results['Errors'])
print_error_breakdown("missing file", total_error_results['Errors'])
print_error_breakdown("function", total_error_results['Errors'])
print_error_breakdown("other", total_error_results['Errors'])

Out of 10289 total scripts
Scripts without errors: 1066, or 10.360579259403247% of total scripts
Scripts with errors: 9223, or 89.63942074059675% of total scripts
Number of clean datasets: 62 out of 10289
##############################################################


Error Breakdown: 
library errors: 5570, or 60.39249701832375% of total errors
working directory errors: 1268, or 13.748238100401172% of total errors
missing file errors: 854, or 9.25946004553833% of total errors
function errors: 614, or 6.657269868806245% of total errors
other errors: 917, or 9.9425349669305% of total errors


# Generate latex table

This cell will take the results computed previously and write the latex table used in the publication

In [6]:
no_raas_error_data = []
for error_type in error_breakdown.keys():
    no_raas_error_data.append({
        "Error Type": error_type.capitalize(), 
        "Count": error_breakdown[error_type], 
        "Percentage (Rounded)": round((error_breakdown[error_type] / total_error_results['Errors']) * 100, 1),
    })
    
no_raas_error_data_df = pd.DataFrame(no_raas_error_data, columns=["Error Type", "Count", "Percentage (Rounded)"])

with open("../results/no_raas_error_data.tex", "w") as no_raas_error_file:
    no_raas_error_file.write(no_raas_error_data_df.to_latex())