In [1]:
import arctrl,os, frictionless, re
from junit_xml import TestSuite, TestCase

# Selector format URI for which column selectors ("col=N") are understood.
_RFC7111_SELECTOR_FORMAT = "https://datatracker.ietf.org/doc/html/rfc7111"

# Single-column selector. A plain "[0-9]+" is used instead of a nested
# quantifier so malformed selectors cannot trigger regex backtracking blow-up.
_COLUMN_SELECTOR_PATTERN = re.compile("^col=([0-9]+)$")


def _recordError(testcase, ex):
    """Attach exception `ex` to `testcase` as a JUnit error (message + type name)."""
    testcase.add_error_info(str(ex), None, type(ex).__name__)


def _fieldForOntologyType(typeName, fieldName):
    """Return the Frictionless field for a datatype name given as ontology text.

    Raises:
        NotImplementedError: if `typeName` is not one of the handled names.
    """
    fields = frictionless.fields
    # (accepted type names, field class, constraints or None)
    rules = (
        # Floating point / decimal names are validated as numbers.
        (("double", "decimal", "float"), fields.NumberField, None),
        (("integer", "long", "int", "short", "byte"), fields.IntegerField, None),
        (("positiveInteger",), fields.IntegerField, {"minimum": 1}),
        (("nonNegativeInteger", "unsignedLong", "unsignedInt", "unsignedShort", "unsignedByte"),
         fields.IntegerField, {"minimum": 0}),
        (("nonPositiveInteger",), fields.IntegerField, {"maximum": 0}),
        (("negativeInteger",), fields.IntegerField, {"maximum": -1}),
        (("string", "normalizedString", "token"), fields.StringField, None),
        (("language",), fields.StringField, {"pattern": "[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*"}),
        (("boolean",), fields.BooleanField, None),
        # Frictionless names this class "DatetimeField" (lower-case "t").
        (("dateTime",), fields.DatetimeField, None),
        (("time",), fields.TimeField, None),
        (("date",), fields.DateField, None),
        (("gYear",), fields.YearField, None),
        (("gYearMonth",), fields.YearmonthField, None),
        (("duration", "yearMonthDuration", "dayTimeDuration"), fields.DurationField, None),
    )
    for typeNames, fieldClass, constraints in rules:
        if typeName in typeNames:
            if constraints is None:
                return fieldClass(name=fieldName)
            return fieldClass(name=fieldName, constraints=constraints)
    raise NotImplementedError(typeName)


def _fieldForValue(value, fieldName):
    """Return the Frictionless field matching the value declared for a column.

    Raises:
        NotImplementedError: if the kind of `value` is not handled.
    """
    if value.IsAnOntology:
        return _fieldForOntologyType(value.Text, fieldName)
    if value.IsAnInt:
        return frictionless.fields.IntegerField(name=fieldName)
    if value.IsAFloat:
        return frictionless.fields.NumberField(name=fieldName)
    if value.IsNumerical:
        return frictionless.fields.NumberField(name=fieldName)
    # NOTE(review): confirm `IsText` is the attribute name arctrl exposes for text values.
    if value.IsText:
        return frictionless.fields.StringField(name=fieldName)
    raise NotImplementedError(value)


def _describeCsvEntry(basePath, filePath):
    """Create the bookkeeping entry (location, test case, resource) for one CSV file.

    The file is looked up under "dataset/" relative to `basePath`. When it exists,
    Frictionless describes it and every inferred field is reset to "any", so only
    columns with a declared type are checked later. Any problem is recorded as an
    error on the entry's test case and the entry is left without a "resource" key.
    """
    fileLocation = os.path.join("dataset", filePath)
    entry = {
        "location": fileLocation,
        "testcase": TestCase(fileLocation, "Frictionless", allow_multiple_subelements=True),
    }
    try:
        if not os.path.exists(os.path.join(basePath, fileLocation)):
            raise FileNotFoundError(filePath)
        resource = frictionless.describe(fileLocation, basepath=basePath)
        for index in range(len(resource.schema.fields)):
            inferredName = resource.schema.fields[index].name
            resource.schema.fields[index] = frictionless.fields.AnyField(name=inferredName)
        entry["resource"] = resource
    except Exception as ex:
        _recordError(entry["testcase"], ex)
    return entry


def _applyDeclaredType(resource, datacontext):
    """Replace the schema field addressed by `datacontext` with its declared type.

    Selectors that are not of the form "col=N" are ignored.

    Raises:
        NotImplementedError: for an unsupported selector format or value type.
    """
    if datacontext.SelectorFormat != _RFC7111_SELECTOR_FORMAT:
        raise NotImplementedError(datacontext.SelectorFormat)
    colMatch = _COLUMN_SELECTOR_PATTERN.match(datacontext.Selector)
    if not colMatch:
        return
    # NOTE(review): N is used directly as a 0-based index into schema.fields;
    # confirm this matches how the datamap numbers columns.
    column = int(colMatch.group(1))
    value = datacontext.GetValue()
    fieldName = resource.schema.fields[column].name
    resource.schema.fields[column] = _fieldForValue(value, fieldName)


def validateUsingIsamapData(isamapFilename):
    """Validate the CSV files referenced by an ISA datamap and report as JUnit XML.

    The datamap workbook is parsed with arctrl. For every data context with format
    "text/csv", the referenced file (under "dataset/" next to the datamap) is
    described with Frictionless, the declared column types are applied to its
    schema, and the file is validated against that schema.

    Args:
        isamapFilename: path to the datamap xlsx file.

    Returns:
        A JUnit XML string with one test suite: an "Arctrl" test case for parsing
        the datamap, plus one "Frictionless" test case per referenced CSV file.
        Exceptions are reported as errors, validation problems as failures.
    """
    datamapTestcase = TestCase(isamapFilename, "Arctrl", allow_multiple_subelements=True)
    try:
        basePath = os.path.dirname(os.path.abspath(isamapFilename))
        datamap = arctrl.XlsxController.Datamap().from_xlsx_file(isamapFilename)
    except Exception as ex:
        _recordError(datamapTestcase, ex)

    entries = {}
    # Only continue if the datamap was parsed successfully.
    if not datamapTestcase.is_error():
        for datacontext in datamap.DataContexts:
            if datacontext.Format != "text/csv":
                # Other formats are not validated.
                continue
            filePath = datacontext.FilePath
            # Describe each referenced file only once.
            if filePath not in entries:
                entries[filePath] = _describeCsvEntry(basePath, filePath)
            entry = entries[filePath]
            # After the first error for a file, its remaining contexts are skipped.
            if entry["testcase"].is_error():
                continue
            try:
                _applyDeclaredType(entry["resource"], datacontext)
            except Exception as ex:
                _recordError(entry["testcase"], ex)

        for entry in entries.values():
            resource = entry.get("resource")
            if resource is None:
                # The file could not be described; its error is already recorded.
                continue
            try:
                validationResult = resource.validate()
                if not validationResult.valid:
                    for task in validationResult.tasks:
                        for error in task.errors:
                            entry["testcase"].add_failure_info(error.description, error.message, error.type)
            except Exception as ex:
                # Report on the test case of the file being validated.
                _recordError(entry["testcase"], ex)

    ts = TestSuite(isamapFilename, [datamapTestcase] + [entry["testcase"] for entry in entries.values()])
    return TestSuite.to_xml_string([ts])

In [2]:
# Validate the measurement1 datamap of the ArcPrototype ARC and print the JUnit XML report.
# ArcPrototype source: https://git.nfdi4plants.org/muehlhaus/ArcPrototype/
# The path is relative, so the ARC must be available under ./ArcPrototype.
print(validateUsingIsamapData("ArcPrototype/assays/measurement1/isa.datamap.xlsx"))

<?xml version="1.0" ?>
<testsuites disabled="0" errors="0" failures="0" tests="2" time="0.0">
	<testsuite disabled="0" errors="0" failures="0" name="ArcPrototype/assays/measurement1/isa.datamap.xlsx" skipped="0" tests="2" time="0">
		<testcase name="ArcPrototype/assays/measurement1/isa.datamap.xlsx" classname="Arctrl"/>
		<testcase name="dataset/proteomics_result.csv" classname="Frictionless"/>
	</testsuite>
</testsuites>



In [3]:
# Detect problems with datamap parsing: this path does not exist, so the
# failure is reported as an error on the "Arctrl" test case.
print(validateUsingIsamapData("incorrect/location/isa.datamap.xlsx"))

<?xml version="1.0" ?>
<testsuites disabled="0" errors="1" failures="0" tests="1" time="0.0">
	<testsuite disabled="0" errors="1" failures="0" name="incorrect/location/isa.datamap.xlsx" skipped="0" tests="1" time="0">
		<testcase name="incorrect/location/isa.datamap.xlsx" classname="Arctrl">
			<error type="FileNotFoundError" message="[Errno 2] No such file or directory: 'incorrect/location/isa.datamap.xlsx'"/>
		</testcase>
	</testsuite>
</testsuites>



In [4]:
# Error test 1: per the recorded output, this datamap declares a datatype name
# ("doubles") that is not supported, which is reported as a NotImplementedError.
print(validateUsingIsamapData("ArcPrototype/assays/measurement1/isa.datamap.with.error.test1.xlsx"))

<?xml version="1.0" ?>
<testsuites disabled="0" errors="1" failures="0" tests="2" time="0.0">
	<testsuite disabled="0" errors="1" failures="0" name="ArcPrototype/assays/measurement1/isa.datamap.with.error.test1.xlsx" skipped="0" tests="2" time="0">
		<testcase name="ArcPrototype/assays/measurement1/isa.datamap.with.error.test1.xlsx" classname="Arctrl"/>
		<testcase name="dataset/proteomics_result.csv" classname="Frictionless">
			<error type="NotImplementedError" message="doubles"/>
		</testcase>
	</testsuite>
</testsuites>



In [5]:
# Error test 2: per the recorded output, column "quant_2" is checked as an integer
# but holds decimal values, so Frictionless reports type-error failures.
print(validateUsingIsamapData("ArcPrototype/assays/measurement1/isa.datamap.with.error.test2.xlsx"))

<?xml version="1.0" ?>
<testsuites disabled="0" errors="0" failures="1" tests="2" time="0.0">
	<testsuite disabled="0" errors="0" failures="1" name="ArcPrototype/assays/measurement1/isa.datamap.with.error.test2.xlsx" skipped="0" tests="2" time="0">
		<testcase name="ArcPrototype/assays/measurement1/isa.datamap.with.error.test2.xlsx" classname="Arctrl"/>
		<testcase name="dataset/proteomics_result.csv" classname="Frictionless">
			<failure type="type-error" message="The value does not match the schema type and format for this field.">Type error in the cell &quot;2.51972112&quot; in row &quot;2&quot; and field &quot;quant_2&quot; at position &quot;5&quot;: type is &quot;integer/default&quot;</failure>
			<failure type="type-error" message="The value does not match the schema type and format for this field.">Type error in the cell &quot;3.04179784&quot; in row &quot;3&quot; and field &quot;quant_2&quot; at position &quot;5&quot;: type is &quot;integer/default&quot;</failure>
			<failure t