In [1]:
%use kotlin-statistics, krangl, kravis, lets-plot

In [2]:
import kotlin.math.*

In [3]:
@file:Repository("https://dl.bintray.com/kyonifer/maven")
@file:DependsOn("com.kyonifer:koma-core-ejml:0.12")
@file:DependsOn("com.kyonifer:koma-plotting:0.12")

In [4]:
import koma.extensions.*
import koma.*

In [5]:
/**
 * Reads a TSV file, shuffles its rows and splits them into a train part and a test part.
 *
 * @param path location of the TSV file to load
 * @param fraction share of rows assigned to the train part; the row count is rounded up
 * @return the pair (train rows, test rows)
 */
fun trainTestSplit(path: String, fraction: Double = 0.8): Pair<DataFrame, DataFrame> {
    val loaded = DataFrame.readTSV(path)
    val trainCount = kotlin.math.ceil(fraction * loaded.nrow).toInt()
    val shuffled = loaded.shuffle()

    // NOTE(review): these ranges only partition the rows without overlap if
    // krangl's slice() uses 1-based row positions — confirm against the krangl docs.
    val trainRows = shuffled.slice(0..trainCount)
    val testRows = shuffled.slice(trainCount + 1..shuffled.nrow)
    return trainRows to testRows
}

In [6]:
// Load the TSV dataset and split it into train/test parts using the default fraction.
val (train, test) = trainTestSplit("../data/test/test_data.tsv")

In [7]:
/**
 * Binary ("one category vs. everything else") naive Bayes classifier over
 * event n-gram counts.
 *
 * A row is treated as positive when its lower-cased "Category" value contains [cat].
 * Per-feature counts are bucketed into 11 bins: 0..9 map to themselves, anything
 * of 10 or more shares bin 10.
 *
 * Typical usage: [initialize] (build the vocabulary), [fit] (estimate the
 * probabilities), then [transform] + [prob] / [getPosNeg] for inference.
 *
 * @param alpha additive smoothing mass placed in every (feature, bin) cell
 * @param cat substring identifying the positive category
 */
class NaiveBayesOneVAll(private val alpha: Double = 1e-5, val cat: String = "coding") {

    /** Share of training rows that belong to [cat]; set by [fit]. */
    var p: Double = 0.0

    /** After [fit]: index 0 holds the bin probabilities for [cat], index 1 for the rest. */
    var probs: ArrayList<koma.matrix.Matrix<Double>> = ArrayList()

    /** Maps an n-gram to its feature column. */
    var voc: Map<String, Int> = HashMap()

    /** Inverse of [voc]: feature column to n-gram. */
    lateinit var id2word: List<String>
    private var n: Int = 0

    /** Highest count bin; counts at or above this value all land here. */
    private val maxBin = 10

    /** Splits a raw " , "-separated event string into the n-grams of the configured size. */
    private fun ngrams(raw: String): List<String> {
        val tokens = raw.split(" , ")
        return (0 until tokens.size - n + 1).map { start ->
            tokens.slice(start until start + n).joinToString()
        }
    }

    /** Maps a raw feature count to its bin index. */
    private fun countBin(count: Double): Int = if (count < 10.0) count.toInt() else maxBin

    /**
     * Builds the vocabulary from the "events" column of [df].
     *
     * @param limit only n-grams occurring strictly more than this many times are kept
     * @param top number of most frequent kept n-grams to drop from the vocabulary
     * @param n n-gram size
     */
    fun initialize(df: DataFrame, limit: Int = 5, top: Int = 0, n: Int = 1) {
        this.n = n
        val events = df["events"]
        val eventsMap: MutableMap<String, Int> = HashMap()

        for (i in 0 until events.length) {
            for (ev in ngrams(events[i].toString())) {
                eventsMap[ev] = (eventsMap[ev] ?: 0) + 1
            }
        }

        // Most frequent first; ties are broken alphabetically.
        val frequent = eventsMap.filter { it.value > limit }
            .toSortedMap(compareBy({ eventsMap[it]?.times(-1) }, { it })).keys.toList()

        id2word = frequent.drop(top)
        voc = id2word.withIndex().associate { it.value to it.index }
    }

    /**
     * Converts [df] into a count matrix (one row per data-frame row) plus the
     * list of "Category" labels.
     *
     * The matrix has `voc.size + 2` columns; the two trailing columns never
     * receive counts because vocabulary indices stop at `voc.size - 1`.
     */
    fun transform(df: DataFrame): Pair<koma.matrix.Matrix<Double>, List<String>> {
        val mat = zeros(df.nrow, voc.size + 2)
        val label = ArrayList<String>()
        for (i in 0 until df.nrow) {
            label.add(df["Category"][i].toString())

            for (event in ngrams(df["events"][i].toString())) {
                val col = voc[event]
                if (col != null) {
                    mat[i, col] += 1
                }
            }
        }
        return Pair(mat, label)
    }

    /**
     * Estimates the class prior [p] and the per-feature bin probabilities [probs]
     * from [df]. Any previously fitted state is discarded, so calling this
     * repeatedly is safe.
     */
    fun fit(df: DataFrame) {
        val (X, y) = transform(df)

        // Start from a clean slate so a second fit() does not pile onto the first.
        probs.clear()
        p = 0.0
        probs.add(ones(X.shape()[1], maxBin + 1) * alpha)
        probs.add(ones(X.shape()[1], maxBin + 1) * alpha)

        for (i in y.indices) {
            val cls = if (y[i].toLowerCase().contains(cat)) 0 else 1
            if (cls == 0) {
                p++
            }

            for (j in 0 until X.shape()[1]) {
                probs[cls][j, countBin(X[i, j])] += 1
            }
        }

        p /= y.size
        for (cls in 0 until 2) {
            val counts = probs[cls]
            // Every training row adds exactly one count to each feature row, so all
            // rows share the same total; dividing by it turns each row into a proper
            // distribution and stays strictly positive even when the class is empty.
            val den = counts.getRow(0).elementSum()
            probs[cls] = counts / den
        }
    }

    /**
     * Returns, for every row of [X], the posterior probability of the positive
     * class ([cat]), obtained by normalising the two per-class log scores.
     */
    fun prob(X: koma.matrix.Matrix<Double>): List<Double> {
        val ans = zeros(X.shape()[0], 2)
        for (i in 0 until X.shape()[0]) {
            for (cls in 0 until 2) {
                // Log prior: p for the positive class, 1 - p for the rest.
                ans[i, cls] = kotlin.math.ln(p * (1 - cls) + (1 - p) * cls)
                for (k in 0 until X.shape()[1]) {
                    ans[i, cls] += kotlin.math.ln(probs[cls][k, countBin(X[i, k])])
                }
            }
        }
        // 1 / sum_k exp(ans[i, k] - ans[i, j]), i.e. a softmax over the class scores.
        val q = fill(
            ans.shape()[0],
            ans.shape()[1]
        ) { i, j -> 1 / (exp(ans[i, 0 until ans.shape()[1]] - ans[i, j]).mean() * ans.shape()[1]) }
        return q.getCol(0).toList()
    }

    /**
     * For every row of [X] returns the per-feature log-likelihood ratio
     * `ln P(bin | cat) - ln P(bin | rest)`; positive values favour [cat].
     */
    fun expl(X: koma.matrix.Matrix<Double>): ArrayList<List<Double>> {
        val ans = ArrayList<List<Double>>()

        for (i in 0 until X.shape()[0]) {
            val semi = zeros(X.shape()[1], 1)

            for (k in 0 until X.shape()[1]) {
                val ind = countBin(X[i, k])
                semi[k] += kotlin.math.ln(probs[0][k, ind]) - kotlin.math.ln(probs[1][k, ind])
            }
            ans.add(semi.toList())
        }

        return ans
    }

    /**
     * For each explanation row picks the [n] vocabulary entries with the highest
     * and the [n] with the lowest log-likelihood ratio. Fewer than [n] entries
     * are returned when the vocabulary is smaller than [n].
     */
    private fun impact(
        expl: ArrayList<List<Double>>,
        n: Int = 5
    ): Pair<ArrayList<List<String>>, ArrayList<List<String>>> {
        val pos = ArrayList<List<String>>()
        val negs = ArrayList<List<String>>()

        for (row in expl) {
            val cv = ArrayList<Pair<String, Double>>()

            // Only vocabulary columns are considered; the trailing padding columns are skipped.
            for (j in id2word.indices) {
                cv.add(Pair(id2word[j], row[j]))
            }

            cv.sortBy { -it.second }
            pos.add(cv.take(n).map { it.first })
            negs.add(cv.takeLast(n).map { it.first })
        }
        return Pair(pos, negs)
    }

    /** Returns the most supporting and most opposing vocabulary entries for every row of [X]. */
    fun getPosNeg(X: koma.matrix.Matrix<Double>): Pair<ArrayList<List<String>>, ArrayList<List<String>>> {
        val expl = expl(X)
        return impact(expl)
    }

}

In [8]:
/**
 * Trapezoidal integration of [y] over [x] with the sign flipped, so the result
 * is positive when [x] is decreasing.
 *
 * @param x abscissa values; must have at least as many entries as [y]
 * @param y ordinate values; the number of trapezoids is `y.size - 1`
 * @return the negated trapezoid sum, or 0.0 when [y] has fewer than two points
 */
fun integrate(x: ArrayList<Double>, y: ArrayList<Double>): Double =
    (0 until y.size - 1).fold(0.0) { area, k ->
        val width = x[k + 1] - x[k]
        val heights = y[k + 1] + y[k]
        area - width * heights / 2
    }

In [9]:
/**
 * Per-sample prediction record, keyed by category name.
 *
 * @property probs probability stored for each category
 * @property pos first list of strings stored for each category
 * @property negs second list of strings stored for each category
 */
class Predicition(
    val probs: HashMap<String, Double> = HashMap(),
    val pos: HashMap<String, List<String>> = HashMap(),
    val negs: HashMap<String, List<String>> = HashMap()
) {
    /** Stores (or overwrites) the probability and both string lists for [cat]. */
    fun add(cat: String, prob: Double, ps: List<String>, ns: List<String>) {
        probs.put(cat, prob)
        pos.put(cat, ps)
        negs.put(cat, ns)
    }
}

In [55]:
/**
 * One-vs-rest ensemble: one [NaiveBayesOneVAll] per entry of [classes].
 *
 * @param alpha additive smoothing value forwarded to every per-class classifier
 */
class OneVSAll(private val alpha: Double = 1e-5) {

    /** Category names; each one gets its own binary classifier. */
    val classes = listOf(
        "reading", "coding", "ide_start",
        "notifications", "rdb", "database",
        "ide_close", "vcs", "terminal", "settings"
    )

    /** Per-class `limit` argument passed to [NaiveBayesOneVAll.initialize]. */
    val limit = mapOf(
        "reading" to 5, "coding" to 0, "ide_start" to 0,
        "notifications" to 0, "rdb" to 0, "database" to 5,
        "ide_close" to 0, "vcs" to 0, "terminal" to 7, "settings" to 0
    )

    /** Per-class `top` argument passed to [NaiveBayesOneVAll.initialize]. */
    val top = mapOf(
        "reading" to 0, "coding" to 0, "ide_start" to 0,
        "notifications" to 0, "rdb" to 0, "database" to 10,
        "ide_close" to 5, "vcs" to 0, "terminal" to 4, "settings" to 0
    )

    /** Classifiers in the same order as [classes]; populated by [initialize]. */
    var classifiers = ArrayList<NaiveBayesOneVAll>()

    /**
     * Creates one classifier per class and builds its vocabulary from [df].
     * Any classifiers from a previous call are discarded so that indices in
     * [classifiers] keep matching [classes].
     *
     * @param n n-gram size forwarded to each classifier
     */
    fun initialize(df: DataFrame, n: Int = 1) {
        classifiers.clear()
        for (cls in classes) {
            // Forward the configured smoothing instead of silently using the default.
            val newClass = NaiveBayesOneVAll(alpha = alpha, cat = cls)
            newClass.initialize(
                df,
                limit[cls] ?: error("no limit configured for class '$cls'"),
                top[cls] ?: error("no top configured for class '$cls'"),
                n
            )
            classifiers.add(newClass)
        }
    }

    /** Fits every per-class classifier on [df]. */
    fun fit(df: DataFrame) {
        for (cl in classifiers) {
            cl.fit(df)
        }
    }

    /**
     * Scores every row of [df] with every classifier.
     *
     * @return a triple of: one [Predicition] per row (filled for every class),
     *   one area value per class, and one recall/precision curve per class —
     *   the latter two in [classes] order
     */
    fun predict(df: DataFrame): Triple<ArrayList<Predicition>, ArrayList<Double>, ArrayList<Map<String, ArrayList<Double>>>> {
        val ROCs = ArrayList<Map<String, ArrayList<Double>>>()
        val AUCs = ArrayList<Double>()
        val preds = ArrayList<Predicition>()

        for (i in 0 until df.nrow)
            preds.add(Predicition())

        for (i in classes.indices) {
            val (X_bow, y_bow) = classifiers[i].transform(df)
            val prob = classifiers[i].prob(X_bow)
            val (roc, auc) = ROCAUC(prob, y_bow, classes[i])
            ROCs.add(roc)
            AUCs.add(auc)
            val (pos, negs) = classifiers[i].getPosNeg(X_bow)
            for (j in prob.indices) {
                preds[j].add(classes[i], prob[j], pos[j], negs[j])
            }
        }

        return Triple(preds, AUCs, ROCs)
    }

    /**
     * Builds a curve from the predicted probabilities and integrates it.
     *
     * Despite the name, the curve that is returned and integrated is recall vs.
     * precision (see the map keys), not a ROC curve.
     *
     * @param prob predicted probability of [cat] for each sample
     * @param y raw labels; a sample is positive when its lower-cased label contains [cat]
     * @param cat category being evaluated
     * @return the curve as a map with "recall" and "precision" lists, and its area
     */
    private fun ROCAUC(
        prob: List<Double>,
        y: List<String>,
        cat: String
    ): Pair<Map<String, ArrayList<Double>>, Double> {
        val nexs = ArrayList<Double>()
        val pexs = ArrayList<Double>()
        val a = ArrayList<Pair<Double, Int>>()
        for (i in y.indices) {
            if (y[i].toLowerCase().contains(cat))
                a.add(Pair(prob[i], 1))
            else
                a.add(Pair(prob[i], 0))
        }

        // Walk the samples from lowest to highest score; z[0]/z[1] count the
        // negatives/positives seen so far.
        a.sortBy { it.first }
        var c = 0.01
        val z = DoubleArray(2)
        nexs.add(0.0)
        pexs.add(0.0)

        for (v in a) {
            // The threshold advances by a single 0.01 step per sample that exceeds it,
            // recording the cumulative counts seen before that sample.
            if (v.first > c) {
                c += 0.01
                nexs.add(z[0])
                pexs.add(z[1])
            }
            z[v.second]++
        }

        nexs.add(z[0])
        pexs.add(z[1])

        val pe = pexs[pexs.size - 1]
        val ne = nexs[nexs.size - 1]

        // Turn "seen so far" counts into "still remaining" counts.
        for (i in pexs.indices) {
            pexs[i] = pe - pexs[i]
            nexs[i] = ne - nexs[i]

        }
        val rec = ArrayList<Double>()
        val prec = ArrayList<Double>()

        // pexs[0] is the total number of positives.
        // NOTE(review): when there are no positives this is 0.0 / 0.0 = NaN — confirm
        // whether callers should get a defined value in that case.
        for (i in pexs.indices) {
            rec.add(pexs[i] / pexs[0])
        }

        for (i in pexs.indices) {
            if (pexs[i] + nexs[i] == 0.0)
                prec.add(1.0)
            else
                prec.add(pexs[i] / (nexs[i] + pexs[i]))
        }

        val df = mapOf("recall" to rec, "precision" to prec)
        val auc = integrate(rec, prec)
        return Pair(df, auc)
    }



}

// Ensemble instance used by the cells below (default smoothing).
var cl = OneVSAll()

In [56]:
// Build the per-class vocabularies from the training split (default n-gram size).
cl.initialize(train)

In [57]:
// Fit every per-class classifier on the training split.
cl.fit(train)

In [58]:
// Score the training split itself: per-row predictions, per-class areas and per-class curves.
val (preds, AUCs, ROCs) = cl.predict(train)

In [59]:
AUCs

[0.9566777051200045, 0.9910118004216806, 0.9544417010660593, 0.9108883731816593, 0.9972831857139055, 1.0000000000000002, 0.9187626262626262, 0.9993162017964459, 0.9472697903947906, 0.9642045454545455]

In [29]:
AUCs

[0.9566777051200045, 0.9910118004216806, 0.9544417010660593, 0.9108883731816593, 0.9972831857139055, 0.7985909152575821, 0.9187626262626262, 0.9993162017964459, 0.966267116597999, 0.9642045454545455]

In [60]:
// Per-class accumulators for the event names collected from the predictions in the next cell.
var posEvents = HashMap<String, ArrayList<String>>()
var negEvents = HashMap<String, ArrayList<String>>()

In [61]:
// For every class, gather the "pos" events of samples scored above 0.5 and the
// "negs" events of all remaining samples.
for (cls in cl.classes) {
    val supporting = ArrayList<String>()
    val opposing = ArrayList<String>()
    posEvents[cls] = supporting
    negEvents[cls] = opposing

    for (pred in preds) {
        val score = pred.probs[cls]!!
        if (score > 0.5) {
            supporting.addAll(pred.pos[cls]!!)
        } else {
            opposing.addAll(pred.negs[cls]!!)
        }
    }
}

In [62]:
// Per-class event -> occurrence-count maps, filled in the next cell.
var posEventsCount = HashMap<String, Map<String, Int>>()
var negEventsCount = HashMap<String, Map<String, Int>>()

In [63]:
// Count how often each event was collected per class and store the counts
// sorted by descending frequency (ties broken alphabetically).
for (cls in cl.classes) {
    val posCounts = (posEvents[cls] ?: error("no positive events collected for class '$cls'"))
        .groupBy { it }.mapValues { it.value.size }
    val negCounts = (negEvents[cls] ?: error("no negative events collected for class '$cls'"))
        .groupBy { it }.mapValues { it.value.size }

    // The comparator must read the plain count maps captured here. Looking the
    // counts up through posEventsCount/negEventsCount would resolve to the sorted
    // map itself after the assignment, so any later key lookup would recurse
    // through its own comparator without end.
    posEventsCount[cls] = posCounts.toSortedMap(compareBy({ posCounts[it]?.times(-1) }, { it }))
    negEventsCount[cls] = negCounts.toSortedMap(compareBy({ negCounts[it]?.times(-1) }, { it }))
}


In [64]:
posEventsCount["database"]

{ui.event_BreadcrumbShowTooltip_BreadcrumbShowTooltip=7, file.types.usage_edit_JAVA=5, toolwindow_activated_Database=5, actions_action.invoked_EditorCopy=4, ui.dialogs_close_com.intellij.openapi.progress.util.ProgressDialog$MyDialogWrapper=3, ui.dialogs_show_com.intellij.openapi.progress.util.ProgressDialog$MyDialogWrapper=3, actions_action.invoked_EditorLeft=2, actions_action.invoked_com.intellij.xdebugger.impl.breakpoints.RemoveBreakpointGutterIconAction=2, build_started_started=2, gutter.icon.click_clicked_db_verified_breakpoint=2, productivity_feature.used_db.table.editor=2, toolwindow_activated_Services=2, actions_action.invoked_CheckinFiles=1, actions_action.invoked_CloseAllEditors=1, actions_action.invoked_EditorDown=1, actions_action.invoked_EditorNextWordWithSelection=1, actions_action.invoked_EditorPreviousWordWithSelection=1, actions_action.invoked_EditorRight=1, actions_action.invoked_EditorUp=1, actions_action.invoked_Resume=1, toolbar_clicked_Resume=1, ui.dialogs_close_co

In [40]:
negEventsCount["database"]



In [65]:
// Score the held-out test split with the classifiers fitted above.
val (preds_t, AUCs_t, ROCs_t) = cl.predict(test)

In [66]:
AUCs_t

[0.7010986820139206, 0.9212760229775477, 0.8367346938775511, 0.22758194186765615, 0.8554723704892446, 0.006802721088435374, 0.14569160997732428, 0.9521591245193728, 0.5212585034013606, 0.5068027210884354]

In [117]:
AUCs_t

[0.7010986820139206, 0.9288313686460966, 0.8367346938775511, 0.11054313272358385, 0.8561363435068248, 0.006802721088435374, 0.1290249433106576, 0.9217910848549944, 0.006802721088435374, 0.006802721088435374]

In [17]:
cl.classes

[reading, coding, ide_start, notifications, rdb, database, ide_close, vcs, terminal, settings]