In [1]:
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.runBlocking
import com.google.common.hash.Hashing
import io.github.cdimascio.dotenv.Dotenv
import java.nio.file.Paths

%useLatestDescriptors
%use lets-plot

class Main

val dotenv = Dotenv.load()
val dataDir = dotenv.get("DATA_DIR").let { Paths.get(it).toFile() }.also { it.mkdirs() }
dataDir

/home/j.zeidler@GDAAG.DE/private/ba3/bachelor-thesis/data

In [2]:
import org.bson.BsonDocument
import org.litote.kmongo.*

val client = KMongo.createClient("mongodb://localhost:42692/")
val db = client.getDatabase("s5_snyk_libio")
val vulnCollection = db.getCollection<BsonDocument>("mergedVuln")

In [3]:
import com.mongodb.client.*

private class MongoCursorIterable<T>(private val cursor: MongoCursor<T>) : MongoCursor<T> by cursor, Iterable<T> {

    override fun iterator(): Iterator<T> = cursor
}

private fun <T> MongoIterable<T>.kCursor(): MongoCursorIterable<T> = MongoCursorIterable(iterator())

fun <T, R> MongoIterable<T>.useCursor(block: (Iterable<T>) -> R): R {
    return kCursor().use(block)
}

In [4]:
import org.bson.BsonNull
import org.bson.BsonString
import org.jetbrains.kotlinx.dataframe.math.mean
import org.jetbrains.kotlinx.dataframe.math.median

val vulnGavToClasses = vulnCollection.find().useCursor { blk ->
    blk.map {
        val gav = it["vuln_gav"]!!.asString().value
        val vulnClasses = it["vuln_classes"]!!.asArray().map { it.asString().value }.toSet()

        if (vulnClasses.count() == 0) throw Exception("no vuln class (record should have been ommitted previously)")

        gav to vulnClasses
    }
}.groupBy { it.first }.map { g -> g.key to g.value.flatMap { it -> it.second }.toSet() }.toMap()

println(vulnGavToClasses.count())
println(vulnGavToClasses.map { it.value.size }.minOrNull())
println(vulnGavToClasses.map { it.value.size }.maxOrNull())
println(vulnGavToClasses.map { it.value.size }.average())
println(vulnGavToClasses.map { it.value.size }.median())

3621
1
51
3.3526650096658384
2


In [5]:
val vulnCveGavToClasses = vulnCollection.find().useCursor { blk ->
    blk.map {
        val gav = it["vuln_gav"]!!.asString().value
        val cve = it["cve_ref"]!!.asString().value.let { c -> if (c.isBlank()) it["snyk_url"]!!.asString().value else c }
        val vulnClasses = it["vuln_classes"]!!.asArray().map { it.asString().value }.toSet()

        if (vulnClasses.count() == 0) throw Exception("no vuln class (record should have been ommitted previously)")

        (cve to gav) to vulnClasses
    }
}.groupBy { it.first }.map { g -> g.key to g.value.flatMap { it -> it.second }.toSet() }.toMap()
    
println(vulnCveGavToClasses.count())
println(vulnCveGavToClasses.map { it.value.size }.minOrNull())
println(vulnCveGavToClasses.map { it.value.size }.maxOrNull())
println(vulnCveGavToClasses.map { it.value.size }.average())
println(vulnCveGavToClasses.map { it.value.size }.median())

6787
1
51
2.038455871519081
1


In [6]:
import scripts.partitionDepGraph.dagP.loadDagpPartitionInfo

val partitionInfo = vulnGavToClasses.keys.map { gav ->
    gav to loadDagpPartitionInfo(gav)
}.toMap()

partitionInfo.count()

3621

In [7]:
val data = mapOf(
    "partition count" to partitionInfo.map { it.second.partitionCount },
)

val p = letsPlot(data) { x = "partition count" } +
        geomHistogram(alpha = .3, binWidth = 1, center = 0.5) +
        xlim(limits = Pair(1, 33)) +
        ggsize(1800, 800) 
p

org.jetbrains.kotlinx.jupyter.exceptions.ReplCompilerException: at Cell In[7], line 1, column 12: Not enough information to infer type variable V
at Cell In[7], line 2, column 23: Not enough information to infer type variable B
at Cell In[7], line 2, column 49: Unresolved reference: second

In [None]:
val depGraphCacheDir = dataDir.resolve("interim/depGraphCache")

val gavToDepGraphCache = vulnGavToClasses
    .map {
        val cachePath = depGraphCacheDir.resolve("${it.key}.vertices.tsv.zip")
        if (!cachePath.isFile()) throw Exception("cache file not found")
        
        it.key to cachePath
    }
    .toMap()
gavToDepGraphCache.count()

In [None]:
import common.DefaultGraph
import io.github.classgraph.ClassInfoList
import java.io.File
import java.net.URLClassLoader
import org.jgrapht.Graph
import org.jgrapht.graph.DefaultEdge
import org.jgrapht.graph.builder.GraphTypeBuilder

fun loadDepGraph(gav: String): DefaultGraph = 
    scripts.exportDepGraphs.loadDepGraphFromCache(gav)

fun loadClassListInfo(gav: String): ClassInfoList =
    scripts.exportDepGraphs.loadClassInfoListFromCache(gav)

fun loadVertexInfo(gav: String) =
    scripts.exportDepGraphs.loadVertexInfo(gav)

In [None]:
import com.google.common.collect.Queues

/**
 * returns the depths of each visited node from the starting node
 */
fun bfsOnDepGraph(graph: DefaultGraph, startNode: String): Map<String, Int> {
    val expanded = mutableSetOf<String>()
    val depthMap = mutableMapOf<String, Int>()
    val queue = Queues.newArrayDeque<String>()
    queue.add(startNode)
    depthMap[startNode] = 0
    while (!queue.isEmpty()) {
        val node = queue.pop()
        expanded.add(node)
        val parentDepth = depthMap[node]!!
        graph
            .outgoingEdgesOf(node)
            .map { e -> graph.getEdgeTarget(e) }
            .filterNot { expanded.contains(it) }
            .forEach { v ->
                if (depthMap.containsKey(v)) {
                    depthMap[v] = min(depthMap[v]!!, parentDepth + 1)
                } else {
                    depthMap[v] = parentDepth + 1;
                }
                queue.add(v)
            }
    }
    return depthMap
}

### how many modules are vuln?

In [None]:
import kotlinx.coroutines.async
import org.jgrapht.graph.EdgeReversedGraph
import org.nield.kotlinstatistics.normalize
import org.nield.kotlinstatistics.standardDeviation
import java.lang.reflect.Modifier

data class VulnAnalysisInfo(
    val classCount: Int,
    val publicClassCount: Int,
    val vulnClassCount: Int,
    val vulnPublicClassCount: Int,
    
    val publicApiInSafeModules: Int,
    val publicApiInVulnModules: Int,
    
    val moduleSizes: List<Int>,
    val safeModuleSizes: List<Int>,
    val vulnModuleSizes: List<Int>,
) {
    val moduleCount = moduleSizes.size
    val safeModuleCount = safeModuleSizes.size
    val vulnModuleCount = vulnModuleSizes.size
    
    val publicApiInSafeModulesRatio = publicApiInSafeModules.toDouble() / publicClassCount.toDouble()
    val publicApiInVulnModulesRatio = publicApiInVulnModules.toDouble() / publicClassCount.toDouble()
    
    val moduleSizesCV = moduleSizes.standardDeviation() / moduleSizes.average()
}

val dispatcher = Dispatchers.IO.limitedParallelism(32)

@kotlinx.coroutines.ExperimentalCoroutinesApi fun _getVulnAnalysisInfo() =
    vulnCveGavToClasses.asSequence().windowed(512, 512, true).flatMapIndexed { batch, w ->
        runBlocking {
            System.gc()
            println("processing batch $batch")
            w.map {
                async(dispatcher) {
                    val gav = it.key.second
                    val vulnClasses = it.value

                    val depGraph = loadDepGraph(gav)
                    val vertexInfo = loadVertexInfo(gav)
                    val depPartitionInfo = partitionInfo[gav]!!
                    
                    if (depPartitionInfo.partToVertices.values.flatten().toSet().size != depGraph.vertexSet().size) {
                        error("failed sanity check")
                    }

                    val publicClasses =
                        depGraph.vertexSet().filter { v -> Modifier.isPublic(vertexInfo[v]!!) }

                    val distToNearestVuln = mutableMapOf<String, Int>()
                    vulnClasses.map {
                        bfsOnDepGraph(EdgeReversedGraph(depGraph), it)
                    }.forEach { m ->
                        m.forEach { v, d ->
                            if (distToNearestVuln.containsKey(v)) {
                                distToNearestVuln[v] = min(distToNearestVuln[v]!!, d)
                            } else {
                                distToNearestVuln[v] = d
                            }
                        }
                    }

                    val allVulnClasses = distToNearestVuln.keys.toSet()
                    val vulnPublicClasses = publicClasses.intersect(allVulnClasses)

                    val vulnParts =
                        depPartitionInfo.partToVertices.filter { (p, vl) ->
                            vl.intersect(allVulnClasses).isNotEmpty()
                        }
                    
                    val classesInVulnParts = vulnParts.values.flatten().toSet()
                    
                    val safeParts = 
                        depPartitionInfo.partToVertices.filterNot { (p, vl) -> 
                            vulnParts.containsKey(p)
                        }
                    
                    val classesInSafeParts = safeParts.values.flatten().toSet()
                    
                    val publicClassesInSafeParts = 
                        publicClasses.intersect(classesInSafeParts)
                    
                    val publicClassesInVulnParts =
                        publicClasses.intersect(classesInVulnParts)
                    
                    if (publicClassesInSafeParts.size + publicClassesInVulnParts.size != publicClasses.size) {
                        error("size mismatch: ${publicClassesInSafeParts.size} + ${publicClassesInVulnParts.size} != ${publicClasses.size}")
                    }
                    
                    VulnAnalysisInfo(
                        classCount = depGraph.vertexSet().size,
                        publicClassCount = publicClasses.size,
                        vulnClassCount = allVulnClasses.size,
                        vulnPublicClassCount = vulnPublicClasses.size,
                        moduleSizes = depPartitionInfo.partToVertices.map { it.value.size }.toList(),
                        safeModuleSizes = safeParts.map { it.value.size }.toList(),
                        vulnModuleSizes = vulnParts.map { it.value.size }.toList(),
                        publicApiInSafeModules = publicClassesInSafeParts.size,
                        publicApiInVulnModules = publicClassesInVulnParts.size,
                    )
                }
            }.toList().awaitAll().toList()  //.filter { it.v != null }
        }.toList()
    }.toList()

val res = _getVulnAnalysisInfo()
res.count()
// println(res.filter { it.msg != null }.map { it.msg!! }.joinToString("\n"))

In [None]:
fun Double.format(digits: Int) =
    "%.${digits}f".format(this)

fun Float.format(digits: Int) =
    "%.${digits}f".format(this)

In [None]:
println("${res.map { it.vulnModuleCount.toDouble() / it.moduleCount.toDouble() }.average()}")
println("${res.map { it.vulnModuleCount.toDouble() / it.moduleCount.toDouble() }.median()}")
println("${res.filter { it.moduleCount > 1 }.map { it.vulnModuleCount.toDouble() / it.moduleCount.toDouble() }.average()}")
println("${res.filter { it.moduleCount > 1 }.map { it.vulnModuleCount.toDouble() / it.moduleCount.toDouble() }.median()}")
println("${res.map { it.safeModuleCount.toDouble() / it.moduleCount.toDouble() }.average()}")
println("${res.map { it.safeModuleCount.toDouble() / it.moduleCount.toDouble() }.median()}")
println("${res.filter { it.moduleCount > 1 }.map { it.safeModuleCount.toDouble() / it.moduleCount.toDouble() }.average()}")
println("${res.filter { it.moduleCount > 1 }.map { it.safeModuleCount.toDouble() / it.moduleCount.toDouble() }.median()}")

In [None]:
println("${res.map { it.vulnModuleSizes.average() / it.classCount }.average()}")

In [None]:
println("${res.map { it.moduleSizesCV }.median()}")
println("${res.map { it.moduleSizesCV }.average()}")

In [None]:
println("${res.map { it.moduleSizes.average() / it.classCount.toDouble() }.average()}")
println("${res.map { it.moduleSizes.median() / it.classCount.toDouble() }.average()}")
println("${res.map { it.moduleSizes.median() / it.classCount.toDouble() }.median()}")
println("${res.map { it.moduleSizes.average() / it.classCount.toDouble() }.median()}")

In [None]:
println("${res.map { it.vulnModuleSizes.average() / it.classCount.toDouble() }.average()}")
println("${res.map { it.vulnModuleSizes.median() / it.classCount.toDouble() }.average()}")
println("${res.map { it.vulnModuleSizes.median() / it.classCount.toDouble() }.median()}")
println("${res.map { it.vulnModuleSizes.average() / it.classCount.toDouble() }.median()}")

In [None]:
println("${res.map { it.publicApiInSafeModulesRatio }.average()}")
println("${res.map { it.publicApiInSafeModulesRatio }.median()}")
println("${res.map { it.publicApiInVulnModulesRatio }.average()}")
println("${res.map { it.publicApiInVulnModulesRatio }.median()}")

In [None]:
val p =
    letsPlot(
        mapOf("module size cv" to res.map { it.moduleSizesCV * 100 })
    ) { x = "module size cv" } + ggsize(1800, 800) +
            geomHistogram(alpha = .3, binWidth = 5, center = 2.5) +
            xlim(limits = Pair(0, 100))

p