In [1]:
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.runBlocking
import com.google.common.hash.Hashing
import io.github.cdimascio.dotenv.Dotenv
import java.nio.file.Paths

%useLatestDescriptors
%use lets-plot

// Empty marker class; nothing in the visible cells references it.
// NOTE(review): presumably kept as an anchor for class-loader/resource lookups — confirm before removing.
class Main

// Configuration is read through dotenv.
val dotenv = Dotenv.load()

// Directory named by the DATA_DIR setting; it is created (including parents) if it does not exist yet.
// Fail with an explicit message when DATA_DIR is not configured, instead of the
// NullPointerException that Paths.get(null) would raise.
val dataDir = requireNotNull(dotenv.get("DATA_DIR")) { "DATA_DIR is not configured (expected in the environment or .env file)" }
    .let { Paths.get(it).toFile() }
    .also { it.mkdirs() }
dataDir

/home/j.zeidler@GDAAG.DE/private/ba3/bachelor-thesis/data

In [2]:
import org.bson.BsonDocument
import org.litote.kmongo.*

// Connect to the MongoDB instance on localhost:42692 and open the "s5_snyk_libio" database.
val client = KMongo.createClient("mongodb://localhost:42692/")
val db = client.getDatabase("s5_snyk_libio")
// Both collections are read as raw BSON documents; fields are accessed by name in the cells below.
val vulnCollection = db.getCollection<BsonDocument>("mergedVuln")
val vulnClientCollection = db.getCollection<BsonDocument>("mergedVulnClients")

In [3]:
import com.mongodb.client.*

/**
 * Adapter that lets a [MongoCursor] be used where an [Iterable] is expected while still
 * exposing the cursor's own members (all delegated to [cursor]).
 *
 * [iterator] always hands out the one wrapped cursor, so the iterable can only be
 * traversed once.
 */
private class MongoCursorIterable<T>(
    private val cursor: MongoCursor<T>,
) : MongoCursor<T> by cursor, Iterable<T> {

    override fun iterator(): Iterator<T> {
        return cursor
    }
}

/** Opens a cursor on this [MongoIterable] and wraps it in a [MongoCursorIterable]. */
private fun <T> MongoIterable<T>.kCursor(): MongoCursorIterable<T> {
    val cursor = iterator()
    return MongoCursorIterable(cursor)
}

/**
 * Runs [block] on the results of this query and closes the underlying cursor afterwards,
 * whether [block] returns normally or throws.
 *
 * The iterable passed to [block] is backed by a single cursor: consume it inside [block]
 * and do not let it escape.
 *
 * @return whatever [block] returns.
 */
fun <T, R> MongoIterable<T>.useCursor(block: (Iterable<T>) -> R): R =
    kCursor().use { results -> block(results) }

In [4]:
import org.nield.kotlinstatistics.median

// Maps every (CVE id or Snyk URL, vulnerable GAV) pair to the union of the vulnerable class
// names recorded for it in the "mergedVuln" collection.
val vulnCveGavToClasses = vulnCollection.find().useCursor { docs ->
    docs.map { doc ->
        val gav = doc["vuln_gav"]!!.asString().value
        // Records with a blank CVE reference are keyed by their Snyk URL instead.
        val cve = doc["cve_ref"]!!.asString().value.ifBlank { doc["snyk_url"]!!.asString().value }
        val vulnClasses = doc["vuln_classes"]!!.asArray().map { cls -> cls.asString().value }.toSet()

        if (vulnClasses.isEmpty()) throw Exception("no vuln class (record should have been ommitted previously)")

        (cve to gav) to vulnClasses
    }
}.groupBy({ it.first }, { it.second })
    .mapValues { (_, classSets) -> classSets.flatten().toSet() }

// Number of keys, then min / max / mean / median of the vulnerable-class count per key.
println(vulnCveGavToClasses.count())
vulnCveGavToClasses.values.map { it.size }.let { sizes ->
    println(sizes.minOrNull())
    println(sizes.maxOrNull())
    println(sizes.average())
    println(sizes.median())
}

7618
1
51
2.0490942504594383
1.0


In [5]:
// Maps every (CVE id or Snyk URL, dependency GAV, client GAV) triple found in the
// "mergedVulnClients" collection to the vulnerable classes of that (CVE, dependency) pair.
val vulnCveGavPairToClasses = vulnClientCollection.find().useCursor { docs ->
    docs.map { doc ->
        val depGav = doc["dep_gav"]!!.asString().value
        val clientGav = doc["client_gav"]!!.asString().value
        // Records with a blank CVE are keyed by their Snyk URL instead.
        val cve = doc["cve"]!!.asString().value.ifBlank { doc["snyk_url"]!!.asString().value }
        // Fails with a NullPointerException when the (CVE, dependency) pair is unknown to vulnCveGavToClasses.
        val vulnClasses = vulnCveGavToClasses[cve to depGav]!!

        if (vulnClasses.isEmpty()) throw Exception("no vuln class (record should have been ommitted previously)")

        Triple(cve, depGav, clientGav) to vulnClasses
    }
}.groupBy({ it.first }, { it.second })
    .mapValues { (_, classSets) -> classSets.flatten().toSet() }

// Number of keys, then min / max / mean / median of the vulnerable-class count per key.
println(vulnCveGavPairToClasses.count())
vulnCveGavPairToClasses.values.map { it.size }.let { sizes ->
    println(sizes.minOrNull())
    println(sizes.maxOrNull())
    println(sizes.average())
    println(sizes.median())
}
// Number of distinct dependency GAVs that have at least one client record.
// NOTE(review): the original remark suggests some libraries in the data set have no clients at all — confirm.
println(vulnCveGavPairToClasses.keys.map { it.second }.toSet().count())

85889
1
42
2.0847605630523116
1.0
2290


In [6]:
import common.DefaultGraph
import io.github.classgraph.ClassInfoList
import java.io.File
import java.net.URLClassLoader
import org.jgrapht.Graph
import org.jgrapht.graph.DefaultEdge
import org.jgrapht.graph.builder.GraphTypeBuilder

/** Returns the dependency graph for [gav] via `scripts.exportDepGraphs.loadDepGraphFromCache`. */
fun loadDepGraph(gav: String): DefaultGraph {
    return scripts.exportDepGraphs.loadDepGraphFromCache(gav)
}

/** Returns the class info list for [gav] via `scripts.exportDepGraphs.loadClassInfoListFromCache`. */
fun loadClassListInfo(gav: String): ClassInfoList {
    return scripts.exportDepGraphs.loadClassInfoListFromCache(gav)
}

/**
 * Delegates to `scripts.exportDepGraphs.loadVertexInfo` for the artifact [gav].
 *
 * The return type is inferred from that helper. The analysis cell uses the result as a map
 * keyed by graph vertex whose values are passed to `Modifier.isPublic`.
 */
fun loadVertexInfo(gav: String) =
    scripts.exportDepGraphs.loadVertexInfo(gav)

/**
 * Returns the dependency graph of the ([depGav], [clientGav]) pair via
 * `scripts.exportPairDepGraphs.loadDepGraphFromCache`.
 */
fun loadDepGraph(depGav: String, clientGav: String): DefaultGraph {
    return scripts.exportPairDepGraphs.loadDepGraphFromCache(depGav, clientGav)
}

/**
 * Returns the class info list of the ([depGav], [clientGav]) pair via
 * `scripts.exportPairDepGraphs.loadClassInfoListFromCache`.
 */
fun loadClassListInfo(depGav: String, clientGav: String): ClassInfoList {
    return scripts.exportPairDepGraphs.loadClassInfoListFromCache(depGav, clientGav)
}

/**
 * Delegates to `scripts.exportPairDepGraphs.loadVertexInfo` for the ([depGav], [clientGav]) pair.
 *
 * The return type is inferred from that helper. The analysis cell uses the result as a map
 * keyed by graph vertex whose values are passed to `Modifier.isPublic`.
 */
fun loadVertexInfo(depGav: String, clientGav: String) =
    scripts.exportPairDepGraphs.loadVertexInfo(depGav, clientGav)

In [7]:
import com.google.common.collect.Queues

/**
 * Breadth-first search over [graph] along outgoing edges, starting at [startNode].
 *
 * @param graph graph to traverse; edges are followed from source to target.
 * @param startNode vertex the traversal starts from; it is reported with depth 0.
 * @return map from every vertex reachable from [startNode] (including [startNode] itself)
 *         to its depth, i.e. the number of edges on a shortest path from [startNode].
 */
fun bfsOnDepGraph(graph: DefaultGraph, startNode: String): Map<String, Int> {
    val depthMap = mutableMapOf(startNode to 0)
    val queue = Queues.newArrayDeque<String>()
    queue.add(startNode)
    while (!queue.isEmpty()) {
        // Vertices are appended at the tail and taken from the head (FIFO), so they are
        // expanded in non-decreasing depth order.
        val node = queue.removeFirst()
        val childDepth = depthMap.getValue(node) + 1
        for (edge in graph.outgoingEdgesOf(node)) {
            val target = graph.getEdgeTarget(edge)
            // Because of the FIFO order, the first discovery of a vertex already yields its
            // minimal depth. Enqueue it only then, so every vertex is expanded exactly once.
            if (!depthMap.containsKey(target)) {
                depthMap[target] = childDepth
                queue.add(target)
            }
        }
    }
    return depthMap
}

In [8]:
import common.DefaultGraph
import io.github.classgraph.ClassGraph
import io.github.classgraph.ClassInfo
import io.github.classgraph.ClassInfoList
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.runBlocking
import org.jgrapht.alg.shortestpath.FloydWarshallShortestPaths
import org.jgrapht.alg.shortestpath.GraphMeasurer
import org.jgrapht.graph.EdgeReversedGraph
import org.jgrapht.traverse.BreadthFirstIterator
import java.lang.reflect.Modifier
import kotlin.Exception

/**
 * Class counts gathered for one (dependency, client) pair, plus ratios derived from them.
 *
 * All ratios are plain floating-point divisions: a zero denominator yields `NaN`
 * (or an infinity for a non-zero numerator) rather than an exception.
 */
data class VulnAnalysisInfo(
    val depClassCount: Int,
    val depVulnClassCount: Int,
    val clientClassCount: Int,
    val depPublicClassCount: Int,
    val clientPublicClassCount: Int,
    val clientVulnClassCount: Int,
    val clientVulnPublicClassCount: Int,
    val depVulnPublicClassCount: Int,
    val referenedPublicClassCount: Int,
    val referenedVulnPublicClassCount: Int,
) {
    // Dependency side.
    val depVulnRatio: Double = share(depVulnClassCount, depClassCount)
    val depApiSurfaceRatio: Double = share(depPublicClassCount, depClassCount)
    val depVulnApiSurfaceRatio: Double = share(depVulnPublicClassCount, depPublicClassCount)

    // Client side.
    val clientVulnRatio: Double = share(clientVulnClassCount, clientClassCount)
    val clientApiSurfaceRatio: Double = share(clientPublicClassCount, clientClassCount)
    val clientVulnApiSurfaceRatio: Double = share(clientVulnPublicClassCount, clientPublicClassCount)

    /** `part / whole` as a floating-point division. */
    private fun share(part: Int, whole: Int): Double = part.toDouble() / whole.toDouble()
}

// Ask the JVM to run a garbage collection before the analysis below; System.gc() is only a hint.
System.gc()

/**
 * Computes the class-count statistics for one (dependency, client) pair.
 *
 * A class counts as "vulnerable" here when at least one class of [vulnClasses] is reachable
 * from it in the pair's dependency graph (the vulnerable classes themselves included).
 *
 * @param depGav GAV of the vulnerable dependency.
 * @param clientGav GAV of the client that uses the dependency.
 * @param vulnClasses names of the classes reported as vulnerable for the dependency.
 */
private fun analyzeDepClientPair(depGav: String, clientGav: String, vulnClasses: Set<String>): VulnAnalysisInfo {
    val depGraph = loadDepGraph(depGav, clientGav)

    // Vertex info of the combined graph, split into dependency vertices and the rest (client).
    val vertexInfo = loadVertexInfo(depGav, clientGav)
    val depVertexInfo = loadVertexInfo(depGav)
    val clientVertexInfo = vertexInfo.filterNot { depVertexInfo.containsKey(it.key) }

    val depPublicClasses =
        depGraph
            .vertexSet()
            .filter { v -> depVertexInfo.containsKey(v) && Modifier.isPublic(vertexInfo[v]!!) }

    // Edges that lead from a client class into a dependency class.
    val clientToDepUsages = depGraph.edgeSet().mapNotNull { e ->
        val s = depGraph.getEdgeSource(e)
        val t = depGraph.getEdgeTarget(e)
        if (clientVertexInfo.containsKey(s) && depVertexInfo.containsKey(t)) s to t else null
    }

    // NOTE(review): despite the name, the targets are not filtered by visibility here — confirm
    // whether non-public dependency classes can appear as edge targets.
    val depPublicClassesUsedByClient = clientToDepUsages.map { it.second }.toSet()
    val vulnDepPublicClassesUsedByClient = depPublicClassesUsedByClient.intersect(vulnClasses)

    // Every class from which some vulnerable class is reachable: BFS from each vulnerable class
    // along reversed edges. The reversed view is loop-invariant, so it is built once.
    val reversedGraph = EdgeReversedGraph(depGraph)
    val allVulnClasses = mutableSetOf<String>()
    for (vulnClass in vulnClasses) {
        allVulnClasses.addAll(bfsOnDepGraph(reversedGraph, vulnClass).keys)
    }

    val allClientVulnClasses = allVulnClasses.intersect(clientVertexInfo.keys)
    val clientPublicClasses = clientVertexInfo.keys.filter { v ->
        Modifier.isPublic(clientVertexInfo[v]!!)
    }
    val publicClientVulnClasses = allClientVulnClasses.intersect(clientPublicClasses)

    val depVulnClasses = allVulnClasses.filter { v -> depVertexInfo.containsKey(v) }.toSet()
    val depVulnPublicClasses = depVulnClasses.filter { v -> Modifier.isPublic(depVertexInfo[v]!!) }.toSet()

    return VulnAnalysisInfo(
        depClassCount = depVertexInfo.keys.size,
        depPublicClassCount = depPublicClasses.size,
        depVulnPublicClassCount = depVulnPublicClasses.size,
        depVulnClassCount = depVulnClasses.size,
        clientClassCount = clientVertexInfo.keys.size,
        clientPublicClassCount = clientPublicClasses.size,
        clientVulnClassCount = allClientVulnClasses.size,
        clientVulnPublicClassCount = publicClientVulnClasses.size,
        referenedPublicClassCount = depPublicClassesUsedByClient.size,
        referenedVulnPublicClassCount = vulnDepPublicClassesUsedByClient.size,
    )
}

/**
 * Analyses every entry of `vulnCveGavPairToClasses` and returns one [VulnAnalysisInfo] per entry,
 * in the iteration order of that map.
 *
 * Entries are processed in batches of 512; within a batch the pairs are analysed concurrently
 * on a view of `Dispatchers.IO` limited to 32 parallel tasks. An exception thrown while
 * analysing any pair aborts the whole run.
 */
@kotlinx.coroutines.ExperimentalCoroutinesApi fun _vulnInfo(): List<VulnAnalysisInfo> {
    val dispatcher = Dispatchers.IO.limitedParallelism(32)
    return vulnCveGavPairToClasses.asSequence().chunked(512).flatMapIndexed { batch, entries ->
        runBlocking {
            // Hint the JVM to collect the previous batch's graphs before loading new ones.
            System.gc()
            println("processing batch $batch")
            entries.map { (key, vulnClasses) ->
                async(dispatcher) { analyzeDepClientPair(key.second, key.third, vulnClasses) }
            }.awaitAll()
        }
    }.toList()
}

// Run the analysis over all (CVE, dependency, client) entries and keep the per-pair results;
// the trailing expression displays how many results were produced.
val vulnInfo = _vulnInfo()
vulnInfo.count()

processing batch 0
processing batch 1
processing batch 2
processing batch 3
processing batch 4
processing batch 5
processing batch 6
processing batch 7
processing batch 8
processing batch 9
processing batch 10
processing batch 11
processing batch 12
processing batch 13
processing batch 14
processing batch 15
processing batch 16
processing batch 17
processing batch 18
processing batch 19
processing batch 20
processing batch 21
processing batch 22
processing batch 23
processing batch 24
processing batch 25
processing batch 26
processing batch 27
processing batch 28
processing batch 29
processing batch 30
processing batch 31
processing batch 32
processing batch 33
processing batch 34
processing batch 35
processing batch 36
processing batch 37
processing batch 38
processing batch 39
processing batch 40
processing batch 41
processing batch 42
processing batch 43
processing batch 44
processing batch 45
processing batch 46
processing batch 47
processing batch 48
processing batch 49
processing

85889

In [9]:
/**
 * Formats this value in fixed-point notation with [digits] decimal places.
 *
 * The locale is pinned to [java.util.Locale.ROOT] so the output always uses `.` as the decimal
 * separator, independent of the JVM's default locale.
 */
fun Double.format(digits: Int) =
    "%.${digits}f".format(java.util.Locale.ROOT, this)

/**
 * Formats this value in fixed-point notation with [digits] decimal places.
 *
 * The locale is pinned to [java.util.Locale.ROOT] so the output always uses `.` as the decimal
 * separator, independent of the JVM's default locale.
 */
fun Float.format(digits: Int) =
    "%.${digits}f".format(java.util.Locale.ROOT, this)

In [10]:
// Summary statistics for the dependency side of all analysed pairs.
vulnInfo.map { it.depClassCount }.let { counts ->
    println("depClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
    println("depClassCount min: ${counts.min()} max: ${counts.max()}")
}
vulnInfo.map { it.depPublicClassCount }.let { counts ->
    println("depPublicClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
vulnInfo.map { it.depVulnClassCount }.let { counts ->
    println("depVulnClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
vulnInfo.map { it.depVulnPublicClassCount }.let { counts ->
    println("depVulnPublicClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
// Ratios are reported as percentages.
vulnInfo.map { it.depApiSurfaceRatio * 100 }.let { percentages ->
    println("depApiSurfaceRatio median: ${percentages.median().format(2)} avg: ${percentages.average().format(2)}")
}
vulnInfo.map { it.depVulnApiSurfaceRatio * 100 }.let { percentages ->
    println("depVulnApiSurfaceRatio median: ${percentages.median().format(2)} avg: ${percentages.average().format(2)}")
}

depClassCount median: 381.0 avg: 479.49
depClassCount min: 1 max: 21910
depPublicClassCount median: 317.0 avg: 434.29
depVulnClassCount median: 12.0 avg: 109.16
depVulnPublicClassCount median: 12.0 avg: 103.88
depApiSurfaceRatio median: 95.65 avg: 90.62
depVulnApiSurfaceRatio median: 3.75 avg: 22.70


In [11]:
import kotlin.Double.Companion

// Summary statistics for the client side of all analysed pairs.
vulnInfo.map { it.clientClassCount }.let { counts ->
    println("clientClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
vulnInfo.map { it.clientPublicClassCount }.let { counts ->
    println("clientPublicClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
vulnInfo.map { it.clientVulnClassCount }.let { counts ->
    println("clientVulnClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
vulnInfo.map { it.clientVulnPublicClassCount }.let { counts ->
    println("clientVulnPublicClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
// Ratios are reported as percentages; NaN ratios (zero denominator) are left out.
vulnInfo.map { it.clientApiSurfaceRatio }.filterNot { it.isNaN() }.map { it * 100 }.let { percentages ->
    println("clientApiSurfaceRatio median: ${percentages.median().format(2)} avg: ${percentages.average().format(2)}")
}
vulnInfo.map { it.clientVulnApiSurfaceRatio }.filterNot { it.isNaN() }.map { it * 100 }.let { percentages ->
    println("clientVulnApiSurfaceRatio median: ${percentages.median().format(2)} avg: ${percentages.average().format(2)}")
}

clientClassCount median: 19.0 avg: 143.16
clientPublicClassCount median: 18.0 avg: 133.58
clientVulnClassCount median: 0.0 avg: 10.24
clientVulnPublicClassCount median: 0.0 avg: 9.81
clientApiSurfaceRatio median: 100.00 avg: 95.74
clientVulnApiSurfaceRatio median: 0.00 avg: 11.11


In [12]:
// A client is "safe" when none of its classes counts as vulnerable, and "vuln" otherwise.
val safeClientCount = vulnInfo.count { it.clientVulnClassCount == 0 }
val vulnClientCount = vulnInfo.count { it.clientVulnClassCount > 0 }
println("safe clients: $safeClientCount ratio: ${(safeClientCount.toDouble() / vulnInfo.size).format(2)}")
println("vuln clients: $vulnClientCount ratio: ${(vulnClientCount.toDouble() / vulnInfo.size).format(2)}")

safe clients: 56809 ratio: 0.66
vuln clients: 29080 ratio: 0.34


In [13]:
// Results for clients with at least one vulnerable class. Requiring at least one public client
// class as well keeps clientVulnApiSurfaceRatio (divided by clientPublicClassCount) from
// dividing by zero for this subset.
val nonSafeClientsInfo = vulnInfo.filter { it.clientPublicClassCount > 0 && it.clientVulnClassCount > 0 }
nonSafeClientsInfo.count()

29067

In [14]:
// Same client-side statistics as above, restricted to the non-safe clients.
nonSafeClientsInfo.map { it.clientClassCount }.let { counts ->
    println("clientClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
nonSafeClientsInfo.map { it.clientPublicClassCount }.let { counts ->
    println("clientPublicClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
nonSafeClientsInfo.map { it.clientVulnClassCount }.let { counts ->
    println("clientVulnClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
nonSafeClientsInfo.map { it.clientVulnPublicClassCount }.let { counts ->
    println("clientVulnPublicClassCount median: ${counts.median()} avg: ${counts.average().format(2)}")
}
// Ratios are reported as percentages.
nonSafeClientsInfo.map { it.clientApiSurfaceRatio * 100 }.let { percentages ->
    println("clientApiSurfaceRatio median: ${percentages.median().format(2)} avg: ${percentages.average().format(2)}")
}
nonSafeClientsInfo.map { it.clientVulnApiSurfaceRatio * 100 }.let { percentages ->
    println("clientVulnApiSurfaceRatio median: ${percentages.median().format(2)} avg: ${percentages.average().format(2)}")
}

clientClassCount median: 32.0 avg: 186.67
clientPublicClassCount median: 31.0 avg: 174.75
clientVulnClassCount median: 4.0 avg: 30.27
clientVulnPublicClassCount median: 3.0 avg: 28.98
clientApiSurfaceRatio median: 100.00 avg: 95.52
clientVulnApiSurfaceRatio median: 21.05 avg: 32.07


In [15]:
import org.jetbrains.kotlinx.dataframe.math.median

// Number of dependency classes referenced directly by client classes: first over all pairs,
// then over the non-safe clients only.
println("(vulnInfo) referenedPublicClassCount median: ${vulnInfo.map { it.referenedPublicClassCount }.median()} average: ${vulnInfo.map { it.referenedPublicClassCount }.average()}")
println("referenedPublicClassCount median: ${nonSafeClientsInfo.map { it.referenedPublicClassCount }.median()} average: ${nonSafeClientsInfo.map { it.referenedPublicClassCount }.average()}")
// This is a count, not a ratio: median and average are both taken on the raw value
// (the median used to be computed on count * 100, inconsistent with the average next to it).
println("referenedVulnPublicClassCount median: ${nonSafeClientsInfo.map { it.referenedVulnPublicClassCount }.median()} average: ${nonSafeClientsInfo.map { it.referenedVulnPublicClassCount }.average()}")

(vulnInfo) referenedPublicClassCount median: 2 average: 6.618600752133568
referenedPublicClassCount median: 5 average: 12.061753878969277
referenedVulnPublicClassCount median: 0 average: 0.3023703856607149


In [16]:
// Histogram (bin width 10) of the client class count, restricted to clients with more than 5,000 classes.
val p = vulnInfo
    .map { it.clientClassCount }
    .filter { it > 5_000 }
    .let { counts ->
        letsPlot(mapOf("clientClassCount" to counts)) { x = "clientClassCount" } +
            ggsize(1800, 800) +
            geomHistogram(alpha = .3, binWidth = 10, center = 5)
        // xlim(limits = Pair(0, 500))
    }

p

In [17]:
// Histogram (bin width 10) of the dependency class count, restricted to pairs whose dependency has more than 5,000 classes.
val p = vulnInfo
    .map { it.depClassCount }
    .filter { it > 5_000 }
    .let { counts ->
        letsPlot(mapOf("depClassCount" to counts)) { x = "depClassCount" } +
            ggsize(1800, 800) +
            geomHistogram(alpha = .3, binWidth = 10, center = 5)
        // xlim(limits = Pair(0, 500))
    }

p

In [18]:
// Export the plot currently bound to `p` (whichever plot cell ran last) as SVG and PNG under /tmp.
ggsave(p, "/tmp/hist.svg")
ggsave(p, "/tmp/hist.png")

/tmp/hist.png

In [19]:
// Histogram (bin width 10) of the client class count over the range 0..500.
// Counts above 490 are clamped to 490, so they all land in the last bin.
val p = vulnInfo
    .map { minOf(it.clientClassCount, 490) }
    .let { counts ->
        letsPlot(mapOf("clientClassCount" to counts)) { x = "clientClassCount" } +
            ggsize(1800, 800) +
            geomHistogram(alpha = .3, binWidth = 10, center = 5) +
            xlim(limits = Pair(0, 500))
    }

p

In [20]:
// Histogram (bin width 2) of the client API surface ratio in percent, over the range 0..100.
// NOTE(review): unlike the other clamped histograms, values are not clamped below the upper limit
// and NaN ratios are not removed here — confirm how the plot treats values of exactly 100 and NaN.
val p = vulnInfo
    .map { it.clientApiSurfaceRatio * 100 }
    .let { percentages ->
        letsPlot(mapOf("clientApiSurfaceRatio" to percentages)) { x = "clientApiSurfaceRatio" } +
            ggsize(1800, 800) +
            geomHistogram(alpha = .3, binWidth = 2, center = 1) +
            xlim(limits = Pair(0, 100))
    }

p

In [21]:
// Histogram (bin width 1) of the number of vulnerable client classes over the range 0..20.
// Counts above 19 are clamped to 19, so they all land in the last bin.
val p = vulnInfo
    .map { minOf(it.clientVulnClassCount, 19) }
    .let { counts ->
        letsPlot(mapOf("clientVulnClassCount" to counts)) { x = "clientVulnClassCount" } +
            ggsize(1800, 800) +
            geomHistogram(alpha = .3, binWidth = 1, center = .5) +
            xlim(limits = Pair(0, 20))
    }

p

In [22]:
// Histogram (bin width 1) of the number of vulnerable client classes, for clients that have at
// least one, over the range 0..501. Counts above 500 are clamped to 500.
val p = vulnInfo
    .map { it.clientVulnClassCount }
    .filter { it > 0 }
    .map { minOf(it, 500) }
    .let { counts ->
        letsPlot(mapOf("clientVulnClassCount" to counts)) { x = "clientVulnClassCount" } +
            ggsize(1800, 800) +
            geomHistogram(alpha = .3, binWidth = 1, center = .5) +
            xlim(limits = Pair(0, 501))
    }

p