In [1]:
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.runBlocking
import com.google.common.hash.Hashing
import io.github.cdimascio.dotenv.Dotenv
import java.nio.file.Paths

%useLatestDescriptors
%use lets-plot

class Main

// Read the `.env` configuration and resolve the directory named by DATA_DIR,
// creating it (and any missing parents) if it does not exist yet.
val dotenv = Dotenv.load()
val dataDir = Paths.get(dotenv.get("DATA_DIR")).toFile().also { dir -> dir.mkdirs() }
dataDir

/home/j.zeidler@GDAAG.DE/private/ba3/bachelor-thesis/data

In [2]:
import org.bson.BsonDocument
import org.litote.kmongo.*

// Connect to the MongoDB instance listening on localhost:42692.
val client = KMongo.createClient("mongodb://localhost:42692/")
// Database and collection that the cells below read the vulnerability records from.
val db = client.getDatabase("s5_snyk_libio")
// Records are read as raw BSON documents; fields are accessed by name in later cells.
val vulnCollection = db.getCollection<BsonDocument>("mergedVuln")

In [3]:
import com.mongodb.client.*

/**
 * Wraps a [MongoCursor] so that it can also be consumed as an [Iterable].
 *
 * All [MongoCursor] members (including `close()`) are delegated to [cursor].
 * [iterator] hands out the cursor itself, so the iterable can be traversed only once.
 */
private class MongoCursorIterable<T>(private val cursor: MongoCursor<T>) :
    Iterable<T>,
    MongoCursor<T> by cursor {

    override fun iterator(): Iterator<T> {
        return cursor
    }
}

/** Opens a cursor over this [MongoIterable] and wraps it in a [MongoCursorIterable]. */
private fun <T> MongoIterable<T>.kCursor(): MongoCursorIterable<T> {
    val cursor = iterator()
    return MongoCursorIterable(cursor)
}

/**
 * Runs [block] on the elements of this [MongoIterable] and closes the underlying cursor
 * afterwards, whether [block] returns normally or throws.
 *
 * The [Iterable] passed to [block] is backed by the open cursor, so it must be fully
 * consumed inside [block].
 *
 * @return whatever [block] returns
 */
fun <T, R> MongoIterable<T>.useCursor(block: (Iterable<T>) -> R): R =
    kCursor().use(block)

In [4]:
import org.bson.BsonNull
import org.bson.BsonString
import org.jetbrains.kotlinx.dataframe.math.mean
import org.jetbrains.kotlinx.dataframe.math.median

// Maps every `vuln_gav` found in the collection to the union of the `vuln_classes`
// of all records sharing that GAV. A record without any vulnerable class aborts the cell.
val vulnGavToClasses = vulnCollection.find()
    .useCursor { docs ->
        docs.map { doc ->
            val gav = doc["vuln_gav"]!!.asString().value
            val classNames = doc["vuln_classes"]!!.asArray().map { cls -> cls.asString().value }.toSet()

            if (classNames.isEmpty()) {
                throw Exception("no vuln class (record should have been ommitted previously)")
            }

            Pair(gav, classNames)
        }
    }
    .groupBy({ record -> record.first }, { record -> record.second })
    .mapValues { (_, classSets) -> classSets.flatten().toSet() }

// Number of GAVs, followed by min / max / mean / median of the vulnerable-class count per GAV.
vulnGavToClasses.values.map { classes -> classes.size }.let { sizes ->
    println(vulnGavToClasses.size)
    println(sizes.minOrNull())
    println(sizes.maxOrNull())
    println(sizes.average())
    println(sizes.median())
}

3621
1
51
3.3526650096658384
2


In [5]:
// Same aggregation as above, but keyed by (CVE, GAV). When `cve_ref` is blank the record's
// `snyk_url` is used as the identifier instead.
val vulnCveGavToClasses = vulnCollection.find()
    .useCursor { docs ->
        docs.map { doc ->
            val gav = doc["vuln_gav"]!!.asString().value
            val cve = doc["cve_ref"]!!.asString().value.ifBlank { doc["snyk_url"]!!.asString().value }
            val classNames = doc["vuln_classes"]!!.asArray().map { cls -> cls.asString().value }.toSet()

            if (classNames.isEmpty()) {
                throw Exception("no vuln class (record should have been ommitted previously)")
            }

            Pair(Pair(cve, gav), classNames)
        }
    }
    .groupBy({ record -> record.first }, { record -> record.second })
    .mapValues { (_, classSets) -> classSets.flatten().toSet() }
    
// Number of (CVE, GAV) pairs, followed by min / max / mean / median of the
// vulnerable-class count per pair.
vulnCveGavToClasses.values.map { classes -> classes.size }.let { sizes ->
    println(vulnCveGavToClasses.size)
    println(sizes.minOrNull())
    println(sizes.maxOrNull())
    println(sizes.average())
    println(sizes.median())
}

6787
1
51
2.038455871519081
1


In [6]:
// Directory below the data dir that holds one `<gav>.vertices.tsv.zip` file per artifact.
val depGraphCacheDir = dataDir.resolve("interim/depGraphCache")

// Associates every vulnerable GAV with its cache file; a missing file aborts the cell.
val gavToDepGraphCache = vulnGavToClasses.keys.associateWith { gav ->
    depGraphCacheDir.resolve("${gav}.vertices.tsv.zip").also { cacheFile ->
        if (!cacheFile.isFile()) throw Exception("cache file not found")
    }
}
gavToDepGraphCache.size

3621

In [7]:
import common.DefaultGraph
import io.github.classgraph.ClassInfoList
import java.io.File
import java.net.URLClassLoader
import org.jgrapht.Graph
import org.jgrapht.graph.DefaultEdge
import org.jgrapht.graph.builder.GraphTypeBuilder

/** Loads the dependency graph of [gav] by delegating to `scripts.exportDepGraphs.loadDepGraphFromCache`. */
fun loadDepGraph(gav: String): DefaultGraph {
    return scripts.exportDepGraphs.loadDepGraphFromCache(gav)
}

/** Loads the [ClassInfoList] of [gav] by delegating to `scripts.exportDepGraphs.loadClassInfoListFromCache`. */
fun loadClassListInfo(gav: String): ClassInfoList {
    return scripts.exportDepGraphs.loadClassInfoListFromCache(gav)
}

/**
 * Loads the per-vertex information of [gav] by delegating to `scripts.exportDepGraphs.loadVertexInfo`.
 * The return type is whatever that function returns.
 */
fun loadVertexInfo(gav: String) = scripts.exportDepGraphs.loadVertexInfo(gav)

### How much of the public API reaches vulnerable classes?

#### First, we count whether a class reaches *any* vulnerable class:

In [8]:
import com.google.common.collect.Queues

/**
 * Breadth-first search over [graph], following outgoing edges from [startNode].
 *
 * Every node is marked as discovered the moment it is enqueued, so each node is enqueued and
 * expanded at most once. (Marking nodes only when they are dequeued lets a node be queued once
 * per incoming edge and re-expanded every time, which multiplies the work on every level.)
 *
 * @param graph the graph to traverse
 * @param startNode the vertex the traversal starts from
 * @return a map from every node reachable from [startNode] to its depth, i.e. the number of
 *   edges on a shortest path from [startNode]; [startNode] itself is mapped to 0
 */
fun bfsOnDepGraph(graph: DefaultGraph, startNode: String): Map<String, Int> {
    // Doubles as the "discovered" set: a node has an entry as soon as it has been enqueued.
    val depthMap = mutableMapOf<String, Int>()
    val queue = Queues.newArrayDeque<String>()
    depthMap[startNode] = 0
    queue.addLast(startNode)
    while (!queue.isEmpty()) {
        val node = queue.removeFirst()
        val childDepth = depthMap.getValue(node) + 1
        for (edge in graph.outgoingEdgesOf(node)) {
            val target = graph.getEdgeTarget(edge)
            // In FIFO order the first discovery of a node is always via a shortest path,
            // so an existing entry never needs to be lowered.
            if (target !in depthMap) {
                depthMap[target] = childDepth
                queue.addLast(target)
            }
        }
    }
    return depthMap
}

In [9]:
import common.DefaultGraph
import io.github.classgraph.ClassGraph
import io.github.classgraph.ClassInfo
import io.github.classgraph.ClassInfoList
import kotlinx.coroutines.async
import org.jgrapht.alg.shortestpath.FloydWarshallShortestPaths
import org.jgrapht.alg.shortestpath.GraphMeasurer
import org.jgrapht.graph.EdgeReversedGraph
import org.jgrapht.traverse.BreadthFirstIterator
import java.lang.reflect.Modifier
import kotlin.Exception

/**
 * Per-entry result of the vulnerability reachability analysis.
 *
 * The three ratio properties are plain floating-point divisions of the counts, so a zero
 * denominator yields `NaN` or infinity rather than an exception.
 *
 * @property classCount number of classes of the analysed artifact
 * @property publicClassCount number of those classes that are public
 * @property vulnClassCount number of classes counted as vulnerable
 * @property vulnPublicClassCount number of public classes counted as vulnerable
 * @property averageDistToVuln mean distance to a vulnerable class
 * @property minDistToVuln smallest distance to a vulnerable class
 * @property maxDistToVuln largest distance to a vulnerable class
 * @property medianDistToVuln median distance to a vulnerable class
 */
data class VulnAnalysisInfo(
    val classCount: Int,
    val publicClassCount: Int,
    val vulnClassCount: Int,
    val vulnPublicClassCount: Int,
    val averageDistToVuln: Double,
    val minDistToVuln: Int,
    val maxDistToVuln: Int,
    val medianDistToVuln: Int,
    // val publicClassesWithPublicApiCount: Int,
    // val depGraph: DefaultGraph,
) {
    /** Fraction of all classes that are public. */
    val apiSurfaceRatio: Double = publicClassCount.toDouble() / classCount

    /** Fraction of the public classes that are counted as vulnerable. */
    val vulnApiSurfaceRatio: Double = vulnPublicClassCount.toDouble() / publicClassCount

    /** Fraction of all classes that are counted as vulnerable. */
    val vulnRatio: Double = vulnClassCount.toDouble() / classCount
}

// Ask the JVM to run a garbage collection before the analysis below starts
// (this is only a request; the JVM decides whether to honour it).
System.gc()

/**
 * Analyses one artifact ([gav]) against one set of vulnerable classes.
 *
 * Steps:
 *  1. load the dependency graph and the vertex info of [gav];
 *  2. collect the public classes (a vertex is public when `Modifier.isPublic` accepts its vertex info);
 *  3. run [bfsOnDepGraph] on the edge-reversed graph from every vulnerable class and keep, for
 *     every class reached, the smallest distance to any vulnerable class.
 *
 * The distance statistics cover every reached class, public or not. Each vulnerable class is
 * its own BFS start node with distance 0, so `minDistToVuln` is always 0.
 *
 * @param gav artifact coordinates handed to [loadDepGraph] / [loadVertexInfo]
 * @param vulnClasses names of the vulnerable classes; all of them must be vertices of the graph
 * @return success with the computed [VulnAnalysisInfo], or a failure when the artifact has no
 *   public class at all
 * @throws IllegalStateException when a sanity check fails (vulnerable class missing from the
 *   graph, more public classes than classes, more vulnerable public classes than public classes)
 */
private fun analyzeVulnEntry(gav: String, vulnClasses: Set<String>): Result<VulnAnalysisInfo> {
    val depGraph = loadDepGraph(gav)
    val vertexInfo = loadVertexInfo(gav)
    val allClasses = depGraph.vertexSet()

    // NOTE(review): vertexInfo presumably maps class name -> JVM modifier bits; confirm against loadVertexInfo.
    val publicClasses = allClasses.filter { v -> Modifier.isPublic(vertexInfo[v]!!) }

    check(publicClasses.size <= allClasses.size) { "#public classes should be <= #classes" }

    if (publicClasses.isEmpty()) {
        return Result.failure(Exception("no public class in $gav"))
    }

    val vulnClassesInJar = vulnClasses.intersect(allClasses)
    check(vulnClassesInJar.size == vulnClasses.size) {
        "vuln class number mismatch ${gav}\n***\n$vulnClassesInJar\n***\n$vulnClasses"
    }

    // Reversing the edges turns "which classes can reach X" into a forward search from X.
    // The reversed view does not depend on the start node, so it is built once per artifact.
    val reversedGraph: DefaultGraph = EdgeReversedGraph(depGraph)

    val distToNearestVuln = mutableMapOf<String, Int>()
    for (vulnClass in vulnClassesInJar) {
        // BreadthFirstIterator(EdgeReversedGraph(depGraph), it).asSequence().toSet()
        for ((reached, dist) in bfsOnDepGraph(reversedGraph, vulnClass)) {
            distToNearestVuln[reached] = minOf(distToNearestVuln[reached] ?: dist, dist)
        }
    }

    // Every class that reaches a vulnerable class (the vulnerable classes included).
    val allVulnClasses = distToNearestVuln.keys.toSet()
    val vulnPublicClasses = publicClasses.intersect(allVulnClasses)

    check(vulnPublicClasses.size <= publicClasses.size) { "ratio should be <= 1" }

    val distances = distToNearestVuln.values

    // val classes = loadClassListInfo(gav)
    // val classToClassInfo = classes.filter { it.name in depGraph.vertexSet() }.map { it.name to it }.toMap()

    // val publicClassesWithPublicApi =
    //     publicClasses
    //         .filter { c -> classToClassInfo[c]!!.let { it.methodInfo.any { it.isPublic } || it.fieldInfo.any { it.isPublic } } }
    //         .toSet()

    return Result.success(
        VulnAnalysisInfo(
            classCount = allClasses.size,
            publicClassCount = publicClasses.size,
            vulnClassCount = allVulnClasses.size,
            vulnPublicClassCount = vulnPublicClasses.size,
            averageDistToVuln = distances.average(),
            minDistToVuln = distances.min(),
            maxDistToVuln = distances.max(),
            medianDistToVuln = distances.median(),
            // publicClassesWithPublicApiCount = publicClassesWithPublicApi.count()
            // depGraph = depGraph,
        )
    )
}

/**
 * Runs the reachability analysis for every (CVE, GAV) entry of `vulnCveGavToClasses`.
 *
 * Entries are processed in batches of 512. Inside a batch the entries are analysed concurrently
 * on a view of `Dispatchers.IO` limited to 32 parallel tasks, and the function blocks until the
 * batch is finished. A garbage collection is requested before each batch.
 *
 * Entries whose analysis returns a failure (artifact without public classes) are left out of
 * the result; the reason is printed instead of being dropped silently. Failed sanity checks
 * inside the analysis abort the whole run with an exception.
 *
 * @return the successful analysis results, in the iteration order of `vulnCveGavToClasses`
 */
@kotlinx.coroutines.ExperimentalCoroutinesApi fun _vulnInfo(): List<VulnAnalysisInfo> {
    val dispatcher = Dispatchers.IO.limitedParallelism(32)

    return vulnCveGavToClasses.entries
        .chunked(512)
        .flatMapIndexed { batch, entries ->
            System.gc()
            println("processing batch $batch")

            val results = runBlocking {
                entries
                    .map { (cveAndGav, vulnClasses) ->
                        async(dispatcher) { analyzeVulnEntry(cveAndGav.second, vulnClasses) }
                    }
                    .awaitAll()
            }

            results.mapNotNull { result ->
                result.getOrElse { failure ->
                    println("skipping entry: ${failure.message}")
                    null
                }
            }
        }
}

// Run the analysis once and keep the results for the summary and plot cells below;
// the cell displays how many entries were analysed successfully.
val vulnInfo = _vulnInfo()
vulnInfo.size

processing batch 0
processing batch 1
processing batch 2
processing batch 3
processing batch 4
processing batch 5
processing batch 6
processing batch 7
processing batch 8
processing batch 9
processing batch 10
processing batch 11
processing batch 12
processing batch 13


6787

In [10]:
// Ask the JVM to run a garbage collection now that the analysis cell has finished
// (this is only a request; the JVM decides whether to honour it).
System.gc()

In [11]:
/**
 * Renders this value with exactly [digits] decimal places using the `%.<digits>f`
 * format pattern (formatting follows the default locale).
 */
fun Double.format(digits: Int): String {
    val pattern = "%.${digits}f"
    return String.format(pattern, this)
}

/**
 * Renders this value with exactly [digits] decimal places using the `%.<digits>f`
 * format pattern (formatting follows the default locale).
 */
fun Float.format(digits: Int): String {
    val pattern = "%.${digits}f"
    return String.format(pattern, this)
}

In [12]:
// Median and mean of the class counts and of the derived ratios (ratios shown in percent).
println(
    "classCount median: ${vulnInfo.map { info -> info.classCount }.median()} " +
        "avg: ${vulnInfo.map { info -> info.classCount }.average().format(2)}"
)
println(
    "publicClassCount median: ${vulnInfo.map { info -> info.publicClassCount }.median()} " +
        "avg: ${vulnInfo.map { info -> info.publicClassCount }.average().format(2)}"
)
// DISPLAY("publicClassesWithPublicApiCount: ${vulnInfo.map { it.publicClassesWithPublicApiCount }.median()}")
println(
    "vulnClassCount median: ${vulnInfo.map { info -> info.vulnClassCount }.median()} " +
        "avg: ${vulnInfo.map { info -> info.vulnClassCount }.average().format(2)}"
)
println(
    "vulnPublicClassCount median: ${vulnInfo.map { info -> info.vulnPublicClassCount }.median()} " +
        "avg: ${vulnInfo.map { info -> info.vulnPublicClassCount }.average().format(2)}"
)
println(
    "apiSurfaceRatio median: ${vulnInfo.map { info -> info.apiSurfaceRatio * 100 }.median().format(2)} " +
        "avg: ${vulnInfo.map { info -> info.apiSurfaceRatio * 100 }.average().format(2)}"
)
// DISPLAY("publicClassesWithPublicApiRatio: ${vulnInfo.map { it.publicClassesWithPublicApiCount.toDouble() / it.classCount.toDouble() }.map { it * 100 }.median()}")
println(
    "vulnApiSurfaceRatio median: ${vulnInfo.map { info -> info.vulnApiSurfaceRatio * 100 }.median().format(2)} " +
        "avg: ${vulnInfo.map { info -> info.vulnApiSurfaceRatio * 100 }.average().format(2)}"
)

classCount median: 317 avg: 587.63
publicClassCount median: 286 avg: 539.19
vulnClassCount median: 13 avg: 187.10
vulnPublicClassCount median: 13 avg: 176.65
apiSurfaceRatio median: 93.79 avg: 91.64
vulnApiSurfaceRatio median: 9.07 avg: 26.22


In [13]:
// Histogram of the class count per entry (bin width 200); counts above 2800 are
// clamped to 2800 so that they land in the last visible bin.
val p = letsPlot(
    mapOf("number of classes" to vulnInfo.map { info -> info.classCount.coerceAtMost(2800) })
) +
    geomHistogram(alpha = .3, binWidth = 200, center = 100) { x = "number of classes" } +
    xlim(limits = 0 to 3000) +
    ggsize(1800, 800)
p

In [14]:
// Histogram of the public class count per entry (bin width 200); counts above 2800
// are clamped to 2800 so that they land in the last visible bin.
val p = letsPlot(
    mapOf("number of public classes" to vulnInfo.map { info -> info.publicClassCount.coerceAtMost(2800) })
) +
    geomHistogram(alpha = .3, binWidth = 200, center = 100) { x = "number of public classes" } +
    xlim(limits = 0 to 3000) +
    ggsize(1800, 800)
p

In [15]:
// Histogram of the vulnerable class count per entry (bin width 200); counts above 2800
// are clamped to 2800 so that they land in the last visible bin.
val p = letsPlot(
    mapOf("number of vulnerable classes" to vulnInfo.map { info -> info.vulnClassCount.coerceAtMost(2800) })
) +
    geomHistogram(alpha = .3, binWidth = 200, center = 100) { x = "number of vulnerable classes" } +
    xlim(limits = 0 to 3000) +
    ggsize(1800, 800)
p

In [16]:
// Histogram of the vulnerable public class count per entry (bin width 200); counts above
// 2800 are clamped to 2800 so that they land in the last visible bin.
val p = letsPlot(
    mapOf("number of vuln public classes" to vulnInfo.map { info -> info.vulnPublicClassCount.coerceAtMost(2800) })
) +
    geomHistogram(alpha = .3, binWidth = 200, center = 100) { x = "number of vuln public classes" } +
    xlim(limits = 0 to 3000) +
    ggsize(1800, 800)
p

In [17]:
// Histogram of the API surface ratio in percent, in 5-point bins over 0..100.
val p = letsPlot(
    mapOf("apiSurfaceRatio" to vulnInfo.map { info -> info.apiSurfaceRatio * 100 })
) +
    geomHistogram(alpha = .3, binWidth = 5, center = 2.5) { x = "apiSurfaceRatio" } +
    xlim(limits = 0 to 100) +
    ggsize(1800, 800)
p

In [18]:
// Histogram of the vulnerable API surface ratio in percent, in 5-point bins over 0..100.
val p = letsPlot(
    mapOf("vulnApiSurfaceRatio" to vulnInfo.map { info -> info.vulnApiSurfaceRatio * 100 })
) +
    geomHistogram(alpha = .3, binWidth = 5, center = 2.5) { x = "vulnApiSurfaceRatio" } +
    xlim(limits = 0 to 100) +
    ggsize(1800, 800)
p

What is the depth of the vulnerability as seen from the public classes?

In [19]:
// Median and mean, across all entries, of the per-entry distance statistics.
println(
    "medianDistanceToVuln median: ${vulnInfo.map { info -> info.medianDistToVuln }.median()} " +
        "avg: ${vulnInfo.map { info -> info.medianDistToVuln }.average().format(2)}"
)
println(
    "averageDistanceToVuln median: ${vulnInfo.map { info -> info.averageDistToVuln }.median()} " +
        "avg: ${vulnInfo.map { info -> info.averageDistToVuln }.average().format(2)}"
)
println(
    "minDistanceToVuln median: ${vulnInfo.map { info -> info.minDistToVuln }.median()} " +
        "avg: ${vulnInfo.map { info -> info.minDistToVuln }.average().format(2)}"
)
println(
    "maxDistanceToVuln median: ${vulnInfo.map { info -> info.maxDistToVuln }.median()} " +
        "avg: ${vulnInfo.map { info -> info.maxDistToVuln }.average().format(2)}"
)

medianDistanceToVuln median: 2 avg: 2.02
averageDistanceToVuln median: 1.5 avg: 2.01
minDistanceToVuln median: 0 avg: 0.00
maxDistanceToVuln median: 3 avg: 3.92


In [20]:
// Histogram of the per-entry median distance to a vulnerable class, unit-wide bins over 0..10.
val p = letsPlot(
    mapOf("medianDistanceToVuln" to vulnInfo.map { info -> info.medianDistToVuln })
) +
    geomHistogram(alpha = .3, binWidth = 1, center = .5) { x = "medianDistanceToVuln" } +
    xlim(limits = 0 to 10) +
    ggsize(1800, 800)
p

In [21]:
// Histogram of the per-entry mean distance to a vulnerable class, unit-wide bins over 0..10.
val p = letsPlot(
    mapOf("averageDistanceToVuln" to vulnInfo.map { info -> info.averageDistToVuln })
) +
    geomHistogram(alpha = .3, binWidth = 1, center = .5) { x = "averageDistanceToVuln" } +
    xlim(limits = 0 to 10) +
    ggsize(1800, 800)
p

In [22]:
// Histogram, with a count-scaled density curve on top, of the per-entry MAXIMUM distance to a
// vulnerable class. The column and axis label used to read "minDistanceToVuln" although the
// plotted values are `maxDistToVuln`; the label now matches the data.
val p =
    letsPlot(
        mapOf("maxDistanceToVuln" to vulnInfo.map { it.maxDistToVuln })
    ) { x = "maxDistanceToVuln" } + ggsize(1800, 800) +
            geomHistogram(alpha = .3, binWidth = 1, center = .5) +
            geomDensity(alpha = 0, size = 1, adjust = 2) { y = "..count.." } +
            xlim(limits = Pair(0, 30))

p