In [1]:
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.runBlocking
import com.google.common.hash.Hashing
import io.github.cdimascio.dotenv.Dotenv
import java.nio.file.Paths

%useLatestDescriptors
%use lets-plot

class Main

// Resolve the data directory from the DATA_DIR setting and create it if it does not exist yet.
val dotenv = Dotenv.load()
// Fail with a descriptive message instead of a bare NullPointerException when DATA_DIR is not configured.
val dataDir = checkNotNull(dotenv.get("DATA_DIR")) { "DATA_DIR is not configured (expected in .env or the environment)" }
    .let { Paths.get(it).toFile() }
    .also { it.mkdirs() }
dataDir

/home/j.zeidler@GDAAG.DE/private/ba3/bachelor-thesis/data

In [2]:
import org.bson.BsonDocument
import org.litote.kmongo.*

// Connect to the local MongoDB instance and open the "mergedVuln" collection as raw BSON documents.
val mongoUri = "mongodb://localhost:42692/"
val client = KMongo.createClient(mongoUri)
val db = client.getDatabase("s5_snyk_libio")
val vulnCollection = db.getCollection<BsonDocument>("mergedVuln")

In [3]:
import com.mongodb.client.*

/**
 * Adapter that lets an open [MongoCursor] be consumed through the [Iterable] API.
 *
 * Every [MongoCursor] member is delegated to [cursor]. [iterator] hands out that same cursor on each
 * call, so an instance can only be traversed once.
 */
private class MongoCursorIterable<T>(private val cursor: MongoCursor<T>) : Iterable<T>, MongoCursor<T> by cursor {

    override fun iterator(): Iterator<T> {
        return cursor
    }
}

/** Opens a cursor over this [MongoIterable] and wraps it in a [MongoCursorIterable]. */
private fun <T> MongoIterable<T>.kCursor(): MongoCursorIterable<T> {
    val openedCursor = iterator()
    return MongoCursorIterable(openedCursor)
}

/**
 * Runs [block] against the elements of this [MongoIterable] and closes the cursor afterwards,
 * whether [block] returns normally or throws.
 *
 * The [Iterable] handed to [block] is backed by the open cursor, so anything that should outlive
 * this call has to be materialised inside [block].
 *
 * @return whatever [block] returns.
 */
fun <T, R> MongoIterable<T>.useCursor(block: (Iterable<T>) -> R): R =
    kCursor().use { openCursor -> block(openCursor) }

In [4]:
import org.bson.BsonNull
import org.bson.BsonString
import org.jetbrains.kotlinx.dataframe.math.mean
import org.jetbrains.kotlinx.dataframe.math.median

// Map every vulnerable GAV to the union of the vulnerable classes recorded for it.
val vulnGavToClasses = vulnCollection.find()
    .useCursor { docs ->
        // `map` is eager, so all documents are read before the cursor is closed.
        docs.map { doc ->
            val gav = doc["vuln_gav"]!!.asString().value
            val vulnClasses = doc["vuln_classes"]!!.asArray().map { cls -> cls.asString().value }.toSet()

            if (vulnClasses.isEmpty()) throw Exception("no vuln class (record should have been ommitted previously)")

            gav to vulnClasses
        }
    }
    .groupBy({ (gav, _) -> gav }, { (_, classes) -> classes })
    .mapValues { (_, classSets) -> classSets.flatten().toSet() }

// Number of GAVs, then min / max / mean / median of the vulnerable-class count per GAV.
vulnGavToClasses.values.map { it.size }.let { classCounts ->
    println(vulnGavToClasses.size)
    println(classCounts.minOrNull())
    println(classCounts.maxOrNull())
    println(classCounts.average())
    println(classCounts.median())
}

3809
1
51
3.549750590706222
2


In [5]:
// Map every (identifier, GAV) pair to the union of its vulnerable classes. The identifier is the
// record's `cve_ref`, or its `snyk_url` when `cve_ref` is blank.
val vulnCveGavToClasses = vulnCollection.find()
    .useCursor { docs ->
        // `map` is eager, so all documents are read before the cursor is closed.
        docs.map { doc ->
            val gav = doc["vuln_gav"]!!.asString().value
            val cve = doc["cve_ref"]!!.asString().value.ifBlank { doc["snyk_url"]!!.asString().value }
            val vulnClasses = doc["vuln_classes"]!!.asArray().map { cls -> cls.asString().value }.toSet()

            if (vulnClasses.isEmpty()) throw Exception("no vuln class (record should have been ommitted previously)")

            (cve to gav) to vulnClasses
        }
    }
    .groupBy({ (cveGav, _) -> cveGav }, { (_, classes) -> classes })
    .mapValues { (_, classSets) -> classSets.flatten().toSet() }

// Number of pairs, then min / max / mean / median of the vulnerable-class count per pair.
vulnCveGavToClasses.values.map { it.size }.let { classCounts ->
    println(vulnCveGavToClasses.size)
    println(classCounts.minOrNull())
    println(classCounts.maxOrNull())
    println(classCounts.average())
    println(classCounts.median())
}

7618
1
51
2.0490942504594383
1


In [6]:
// Overlay two distributions of vulnerable-class counts in one histogram:
// per (identifier, GAV) pair and per GAV. Column "t" tags each value with its series.
val data = mapOf(
    "x" to (vulnCveGavToClasses.values.map { it.size } + vulnGavToClasses.values.map { it.size }),
    "t" to (List(vulnCveGavToClasses.size) { "vulnGavClassesPair" } + List(vulnGavToClasses.size) { "vulnGavToClasses" }),
)

val p = letsPlot(data) +
    geomHistogram(alpha = .3, binWidth = 1, center = 1.5) {
        x = "x"
        color = "t"
        fill = "t"
    } +
    xlim(limits = 1 to 10) +
    ggsize(1800, 800)
p

In [7]:
// Show one randomly chosen entry as a sanity check (picked directly, without shuffling the whole map).
vulnCveGavToClasses.entries.random()

(CVE-2023-33201, org.bouncycastle:bcprov-jdk15on:1.58)=[org.bouncycastle.jce.provider.X509LDAPCertStoreSpi]

In [8]:
// Directory holding the cached dependency-graph vertex files, named `<gav>.vertices.tsv.zip`.
val depGraphCacheDir = dataDir.resolve("interim/depGraphCache")

// Map every vulnerable GAV to its cache file; abort on the first missing file and name it in the error.
val gavToDepGraphCache = vulnGavToClasses.keys.associateWith { gav ->
    val cachePath = depGraphCacheDir.resolve("$gav.vertices.tsv.zip")
    if (!cachePath.isFile) throw Exception("cache file not found: $cachePath")

    cachePath
}
gavToDepGraphCache.size

3809

There are 3812 GAVs; note that the cache lookup above covers 3809 of them.

In [9]:
import common.DefaultGraph
import io.github.classgraph.ClassInfoList
import java.io.File
import java.net.URLClassLoader
import org.jgrapht.Graph
import org.jgrapht.graph.DefaultEdge
import org.jgrapht.graph.builder.GraphTypeBuilder

/** Returns the dependency graph for [gav] as provided by `scripts.exportDepGraphs.loadDepGraphFromCache`. */
fun loadDepGraph(gav: String): DefaultGraph {
    return scripts.exportDepGraphs.loadDepGraphFromCache(gav)
}

/** Returns the class info list for [gav] as provided by `scripts.exportDepGraphs.loadClassInfoListFromCache`. */
fun loadClassListInfo(gav: String): ClassInfoList {
    return scripts.exportDepGraphs.loadClassInfoListFromCache(gav)
}

/**
 * Delegates to `scripts.exportDepGraphs.loadVertexInfo` for the given [gav].
 *
 * The return type is inferred from that call.
 */
fun loadVertexInfo(gav: String) =
    scripts.exportDepGraphs.loadVertexInfo(gav)

In [10]:
// import common.toJGraph
// import io.github.classgraph.ClassGraph
// import io.github.classgraph.ClassInfoList
// import kotlinx.coroutines.async
// import org.jgrapht.alg.shortestpath.GraphMeasurer
// import org.jgrapht.graph.EdgeReversedGraph
// import org.jgrapht.traverse.BreadthFirstIterator
//
// data class GeneralStat(
//     // val radius: Double,
//     val diameter: Double,
// )
//
// val dispatcher = Dispatchers.IO.limitedParallelism(32)
//
// @kotlinx.coroutines.ExperimentalCoroutinesApi fun _getGeneralStats() =
//     vulnGavToClasses.keys.asSequence().windowed(512, 512, true).flatMapIndexed { batch, w ->
//         runBlocking {
//             System.gc()
//             println("processing batch $batch")
//             w.map {
//                 async(dispatcher) {
//                     val gav = it
//                     val depGraph = loadDepGraph(gav)
//                     val graphMeasurer = GraphMeasurer(depGraph)
//
//                     GeneralStat(
//                         // radius = graphMeasurer.radius,
//                         diameter = graphMeasurer.diameter,
//                     )
//                 }
//             }.toList().awaitAll().toList()  //.filter { it.v != null }
//         }.toList()
//     }.toList()
//
// val stats = _getGeneralStats()
// stats.count()

### How much of the library depends on the vulnerable classes?

In [11]:
import common.toJGraph
import io.github.classgraph.ClassGraph
import io.github.classgraph.ClassInfoList
import kotlinx.coroutines.async
import org.jgrapht.graph.EdgeReversedGraph
import org.jgrapht.traverse.BreadthFirstIterator

/**
 * Result of the vulnerability-ratio computation for one (identifier, GAV) entry.
 *
 * @property msg optional message; the code visible in this notebook never sets it.
 * @property v triple of (ratio per vulnerable class, median of those ratios, average of those ratios).
 * @property v2 share of the dependency graph's vertices reached by a breadth-first traversal of the
 *   edge-reversed graph from any of the entry's vulnerable classes.
 */
data class Res(
    val msg: String? = null,
    val v: Triple<List<Double>, Double, Double>? = null,
    val v2: Double? = null,
)

// View of the IO dispatcher that runs at most 32 of the tasks below in parallel.
val dispatcher = Dispatchers.IO.limitedParallelism(32)

/**
 * Computes, for every entry of [vulnCveGavToClasses], which share of the library's dependency graph
 * is reached from its vulnerable classes when the graph is traversed against its edge direction.
 *
 * Entries are processed in batches of 512. Each batch requests a GC run, is fanned out on [dispatcher]
 * and is awaited with `runBlocking`, so the calling thread is blocked until all batches are done.
 *
 * For each entry the resulting [Res] holds
 *  - `v`: the ratio per vulnerable class found in the graph, plus the median and average of those ratios;
 *  - `v2`: the ratio for the union of all vertices reached from any of those classes.
 *
 * Vulnerable classes that are not vertices of the graph are reported on stdout and skipped.
 *
 * @return one [Res] per entry, in the iteration order of [vulnCveGavToClasses].
 */
@kotlinx.coroutines.ExperimentalCoroutinesApi fun _gavToVulnRatio() =
    vulnCveGavToClasses.asSequence().chunked(512).flatMapIndexed { batch, entries ->
        runBlocking {
            System.gc()
            println("processing batch $batch")
            entries.map { entry ->
                async(dispatcher) {
                    val gav = entry.key.second
                    val vulnClasses = entry.value

                    val depGraph = loadDepGraph(gav)
                    val vertices = depGraph.vertexSet()
                    val vertexCount = vertices.size.toDouble()

                    val vulnClassesInJar = vulnClasses.intersect(vertices)
                    if (vulnClassesInJar.size != vulnClasses.size) {
                        // Single println call so that reports from parallel workers cannot interleave.
                        println("vuln class number mismatch $gav\n$vulnClassesInJar\n$vulnClasses")
                    }

                    // NOTE(review): presumably edges point from a class to the classes it depends on, so the
                    // reversed traversal collects the dependents of a vulnerable class — confirm in loadDepGraph.
                    val reversedGraph = EdgeReversedGraph(depGraph)
                    val reachedPerClass = vulnClassesInJar.map { vulnClass ->
                        BreadthFirstIterator(reversedGraph, vulnClass).asSequence().toSet()
                    }

                    val vulnClassRatioList = reachedPerClass.map { reached -> reached.size.toDouble() / vertexCount }
                    val vulnRatioMedian = vulnClassRatioList.median()
                    val vulnRatioAvg = vulnClassRatioList.average()

                    val allVulnClasses = reachedPerClass.flatten().toSet()

                    Res(
                        v = Triple(vulnClassRatioList, vulnRatioMedian, vulnRatioAvg),
                        v2 = allVulnClasses.size.toDouble() / vertexCount,
                    )
                }
            }.awaitAll()
        }
    }.toList()

val res = _gavToVulnRatio()

// println(res.filter { it.msg != null }.map { it.msg!! }.joinToString("\n"))

processing batch 0
processing batch 1
processing batch 2
processing batch 3
processing batch 4
processing batch 5
processing batch 6
processing batch 7
processing batch 8
processing batch 9
processing batch 10
processing batch 11
processing batch 12
processing batch 13
processing batch 14


In [12]:
// Split the results into the per-class triples (`v`) and the combined ratios (`v2`).
val gavToVulnRatioPerClass = res.map { r -> r.v!! }
val gavToVulnRatioTotal = res.map { r -> r.v2!! }
gavToVulnRatioPerClass.size

7618

In [13]:
// Each entry is ([vuln ratio], vuln ratio median, vuln ratio avg);
// entries whose average and median differ the most are listed first.
gavToVulnRatioPerClass
    .sortedByDescending { (_, ratioMedian, ratioAvg) -> abs(ratioAvg - ratioMedian) }
    .joinToString(separator = "\n")

([0.01990049751243781, 0.022388059701492536, 0.02736318407960199, 0.024875621890547265, 0.8855721393034826, 0.8855721393034826, 0.8855721393034826, 0.022388059701492536], 0.026119402985074626, 0.3467039800995025)
([0.8855721393034826, 0.01990049751243781, 0.022388059701492536, 0.02736318407960199, 0.8855721393034826, 0.8855721393034826, 0.024875621890547265, 0.022388059701492536], 0.026119402985074626, 0.34670398009950243)
([0.718421052631579, 0.718421052631579, 0.718421052631579, 0.002631578947368421, 0.718421052631579, 0.718421052631579, 0.002631578947368421, 0.7210526315789474, 0.002631578947368421, 0.010526315789473684, 0.718421052631579, 0.718421052631579, 0.718421052631579, 0.718421052631579, 0.002631578947368421, 0.718421052631579, 0.005263157894736842, 0.718421052631579, 0.002631578947368421, 0.09473684210526316, 0.718421052631579, 0.718421052631579, 0.718421052631579, 0.002631578947368421, 0.002631578947368421, 0.718421052631579, 0.718421052631579, 0.002631578947368421, 0.0026

In [14]:
// Average of all per-class ratios, pooled over every entry.
gavToVulnRatioPerClass.flatMap { (ratios, _, _) -> ratios }.average()

0.2645608677570156

In [15]:
// Median of all per-class ratios, pooled over every entry.
gavToVulnRatioPerClass.flatMap { (ratios, _, _) -> ratios }.median()

0.09113300492610837

In [16]:
// All combined ratios in ascending order, for eyeballing the distribution.
gavToVulnRatioTotal.sorted().joinToString(separator = ", ")

4.5641259698767684E-5, 1.180080245456691E-4, 1.2004801920768308E-4, 1.3243279035889287E-4, 1.3308490817141337E-4, 1.422677479015507E-4, 3.192338387869114E-4, 4.5475216007276033E-4, 4.5475216007276033E-4, 4.5599635202918376E-4, 4.5599635202918376E-4, 4.675081813931744E-4, 4.830917874396135E-4, 4.830917874396135E-4, 4.8638132295719845E-4, 4.8638132295719845E-4, 4.965243296921549E-4, 4.965243296921549E-4, 5.154639175257732E-4, 5.154639175257732E-4, 5.402485143165856E-4, 5.402485143165856E-4, 5.458515283842794E-4, 5.458515283842794E-4, 5.458515283842794E-4, 5.458515283842794E-4, 5.599104143337066E-4, 5.599104143337066E-4, 5.599104143337066E-4, 5.599104143337066E-4, 5.633802816901409E-4, 5.633802816901409E-4, 5.633802816901409E-4, 5.633802816901409E-4, 6.042296072507553E-4, 6.042296072507553E-4, 6.042296072507553E-4, 6.042296072507553E-4, 6.146281499692685E-4, 6.146281499692685E-4, 6.146281499692685E-4, 6.146281499692685E-4, 6.397952655150352E-4, 6.397952655150352E-4, 6.397952655150352E-4, 

In [17]:
// this is what we care about the most: median and average of the combined ratio per entry
with(gavToVulnRatioTotal) {
    println(median())
    println(average())
}

0.1
0.2629158947779851


In [18]:
// Histogram (in percent) of two series: the per-entry median of the per-class ratios and the
// per-entry combined ratio. The second series is labelled "vulnRatioTotal" because it holds the
// combined ratio, not a median.
val data = mapOf(
    "x" to gavToVulnRatioPerClass.map { it.second * 100 } + gavToVulnRatioTotal.map { it * 100 },
    "color" to gavToVulnRatioPerClass.map { "vulnRatioMedianPerClass" } + gavToVulnRatioTotal.map { "vulnRatioTotal" },
)

val p = letsPlot(data) +
        geomHistogram(alpha = .6, binWidth = 5, center = 0, position = positionDodge()) { x = "x"; color = "color"; fill = "color"} +
        xlim(Pair(0, 100)) +
        ggsize(1400, 700)
p

There is little dependency on the vulnerable classes. This means we can hope that, if we modularize the libraries, most artifacts (modules) will be non-vulnerable and still usable.