In [1]:
%use kotlin-statistics, krangl, kravis, lets-plot

In [2]:
import kotlin.math.*

In [3]:
@file:Repository("https://dl.bintray.com/kyonifer/maven")
@file:DependsOn("com.kyonifer:koma-core-ejml:0.12")
@file:DependsOn("com.kyonifer:koma-plotting:0.12")

In [7]:
import plotly.kt.*

Unresolved reference: plotly

In [14]:
import koma.extensions.*
import koma.*

In [9]:
%use lets-plot

In [10]:
fun trainTestSplit(path: String, fraction: Double = 0.8): Pair<DataFrame, DataFrame> {
    val df = DataFrame.readTSV(path)
    val trainCount: Int = kotlin.math.ceil(fraction * df.nrow).toInt()
    val shuffledDf = df.shuffle()
    return Pair(shuffledDf.slice(0..trainCount), shuffledDf.slice(trainCount + 1..shuffledDf.nrow))
}

In [15]:
class BoW(private val limit: Int = 100) {

    public var voc: Map<String, Int> = HashMap()

    fun initialize(df: DataFrame) {
        val events = df["events"]
        val eventsMap: MutableMap<String, Int> = HashMap()
        for (i in 0 until events.length) {
            for (ev in events[i].toString().split(" , ")) {
                if (eventsMap.containsKey(ev)) {
                    eventsMap[ev] = eventsMap.getValue(ev) + 1
                } else {
                    eventsMap[ev] = 1
                }
            }
        }
        val realLimit = if (eventsMap.size > limit) limit else eventsMap.size
        voc = eventsMap.toSortedMap(compareBy({ eventsMap[it]?.times(-1) }, { it })).keys.toList()
            .slice(0 until realLimit).withIndex().toList().associate {it.value to it.index}
    }

    fun transform(df: DataFrame): Pair<koma.matrix.Matrix<Double>, List<String>> {
        val mat = zeros(df.nrow, voc.size + 2)
        val label = ArrayList<String>()
        for (i in 0 until df.nrow) {
            mat[i, voc.size] = df["ms"][i] as Int
            mat[i, voc.size + 1] = df["events"][i].toString().split(" , ").size
            label.add(df["Category"][i].toString())
            for (event in df["events"][i].toString().split(" , ")) {
                if (voc.containsKey(event)) {
                    mat[i, voc[event] as Int] += 1
                }
            }
        }
        return Pair(mat, label)
    }
    
    fun transform2(df: DataFrame): koma.matrix.Matrix<Double> {
        val mat = zeros(df.nrow, voc.size + 2)
        for (i in 0 until df.nrow) {
            mat[i, voc.size] = df["ms"][i] as Int
            mat[i, voc.size + 1] = df["events"][i].toString().split(" , ").size
            for (event in df["events"][i].toString().split(" , ")) {
                if (voc.containsKey(event)) {
                    mat[i, voc[event] as Int] += 1
                }
            }
        }
        return mat
    }

}

In [16]:
class NaiveBayesCounter(private val alpha: Double = 1e-5, val p: Double = 0.05) {

    private var classProb: MutableMap<String, Double> = HashMap()
    public var classes: MutableList<String> = ArrayList()
    private var probs: MutableMap<String, koma.matrix.Matrix<Double>> = HashMap()


    fun fit(X: koma.matrix.Matrix<Double>, y: List<String>) {
        for (i in y.indices) {
            for (cls in y[i].toLowerCase().split(", ")) {
                if (!classProb.containsKey(cls)) {
                    classProb[cls] = 1.0 / y.size
                    classes.add(cls)
                    probs[cls] = ones(X.shape()[1] - 2, 11) * alpha
                } else {
                    classProb[cls] = classProb.getValue(cls) + 1.0 / y.size
                }
                for (j in 0 until X.shape()[1] - 2) {
                    val ind:Int = if (X[i, j] < 10.0) X[i, j].toInt() else 10
                    probs.getOrDefault(cls, zeros(0, 0))[j, ind] += 1
                }
            }

        }
        for (cls in classes) {
            var probsMatrix = probs.getOrDefault(cls, zeros(0, 0))
            val den = probsMatrix.getRow(0).elementSum().toInt()
            probsMatrix /= den
            probs[cls] = probsMatrix
        }
    }




    fun logProb(X: koma.matrix.Matrix<Double>): koma.matrix.Matrix<Double> {
        var ans = zeros(X.shape()[0], classes.size)
        for (i in 0 until X.shape()[0]) {
            for (j in classes.indices) {
                val cls = classes[j]
                ans[i, j] = kotlin.math.ln(classProb.getOrDefault(cls, alpha))
                for (k in 0 until X.shape()[1] - 2) {
                    val ind = if (X[i, k] < 10.0) X[i, k].toInt() else 10
                    ans[i, j] += kotlin.math.ln(probs.getOrDefault(cls, zeros(0, 0))[k, ind])
                }
            }
        }
        val p = fill(ans.shape()[0], ans.shape()[1]) {i, j -> 1 / (exp(ans[i, 0..ans.shape()[1]-1] - ans[i, j]).mean() * ans.shape()[1]) }
        return p
    }

    fun predict(X: koma.matrix.Matrix<Double>): ArrayList<ArrayList<String>> {
        val probs = logProb(X)
        val ans = ArrayList<ArrayList<String>>()
        
        for (i in 0 until probs.shape()[0]) {
            ans.add(ArrayList<String>())
        }
        probs.forEachIndexed { row, col, ele -> if (ele > p) {ans[row].add(classes[col])}}
        return ans
    }
}

In [17]:
val (train, test) = trainTestSplit("../data/test/test_data.tsv")

In [18]:
// val (train, test) = trainTestSplit("../data/sh_sessions_multi.tsv")

In [19]:
fun testPrecisionRecall(cat: String="coding", p: Double=0.05): Triple<ArrayList<Double>, ArrayList<Double>, ArrayList<Int>> {
    val rec = ArrayList<Double>()
    val prec = ArrayList<Double>()
    val x = ArrayList<Int>()
    for (i in 10 until 450 step 10) {
        val bow = BoW(i)
        bow.initialize(train)

        val (X_bow, y_bow) = bow.transform(train)
        val (X_bow_test, y_bow_test) = bow.transform(test)
        val classificator = NaiveBayesCounter(1e-4, p)
        classificator.fit(X_bow, y_bow)

        val ans = classificator.predict(X_bow_test)
        var TP = 0.0
        var FN = 0.0
        var FP = 0.0

        for(i in y_bow_test.indices) {

            if (y_bow_test[i].toLowerCase().contains(cat) && ans[i].joinToString(", ").contains(cat)) {
                TP += 1
            }

            if (y_bow_test[i].toLowerCase().contains(cat) && !ans[i].joinToString(", ").contains(cat)) {
                FN += 1
            }

            if (!y_bow_test[i].toLowerCase().contains(cat) && ans[i].joinToString(", ").contains(cat)) {
                FP += 1
            }
        }
        rec.add(TP / (TP + FN))
        prec.add(TP / (TP + FP))


        x.add(i)
    }
    return Triple(rec, prec, x)
}

In [20]:
val res = ArrayList<Double>()
val x = ArrayList<Int>()

In [21]:
val bow = BoW(400)
bow.initialize(train)

val (X_bow, y_bow) = bow.transform(train)
val (X_bow_test, y_bow_test) = bow.transform(test)
val classificator = NaiveBayesCounter(1e-4, p=0.1)
classificator.fit(X_bow, y_bow)

In [22]:
classificator.classes

[reading, vcs, coding, notifications, rdb, ide_close, ide_start, terminal, database, settings]

In [23]:
val ans = classificator.predict(X_bow_test)

In [24]:
fun accInd(classes: MutableList<String>, cor:List<String>, pred: ArrayList<ArrayList<String>>): Pair<HashMap<String, ArrayList<Int>>, HashMap<String, ArrayList<Int>>> {
    var rec = HashMap<String, ArrayList<Int>>()
    var prec = HashMap<String, ArrayList<Int>>()
    for (cls in classes) {
        rec[cls] = ArrayList<Int>()
        prec[cls] = ArrayList<Int>()
        for (i in cor.indices) {
            if (cor[i].toLowerCase().contains(cls) && (cls !in pred[i]))
                rec.getOrDefault(cls, ArrayList()).add(i)
            if (!cor[i].toLowerCase().contains(cls) && (cls in pred[i]))
                prec.getOrDefault(cls, ArrayList()).add(i)
        }
    }
    return Pair(rec, prec)
}

In [187]:
classificator.classes

[run_build_debug, coding, reading, ide_start, vcs, notifications, settings, terminal, ide_close, database]

In [159]:
val (rec, prec) = accInd(classificator.classes, y_bow_test, ans)

In [160]:
var rows = test.row(prec["coding"]!!.get(0)).values

In [161]:
for (i in prec["coding"]!!.indices)
    if (i!=0)
        rows = rows + test.row(prec["coding"]!!.get(i)).values

for (i in rec["coding"]!!.indices)
    rows = rows + test.row(rec["coding"]!!.get(i)).values

In [162]:
val recall_coding = dataFrameOf("session", "ms", "Category", "events") (rows)

In [163]:
val pred = ArrayList<String>()

In [164]:
for (i in prec["coding"]!!.indices)
    pred.add(ans[prec["coding"]!!.get(i)].joinToString())

for (i in rec["coding"]!!.indices)
    pred.add(ans[rec["coding"]!!.get(i)].joinToString())

In [165]:
val res = recall_coding.addColumn("pred") {pred}

In [166]:
res

session,ms,Category,events,pred
210419257eb7736-6b78-47dc-b51e-068eafe5d64f_0,34074,reading,"file.types.usage_open_JAVA , file.types.usage_o...","coding, reading, notifications"
2711192b5318805-2ea5-4bcf-9e4d-e051dfae72e7_47,22050,Reading,"actions_action.invoked_FindInPath , find_search...",coding
3012191508d29a0-5595-492b-bd26-c9782c642489_23,23264,run_build_debug,"actions_action.invoked_Resume , ui.event_Breadc...","run_build_debug, coding"
301219181803164-a2d5-4101-9b92-eee082e839a4_94,27444,reading,"productivity_feature.used_editing.select.word ,...",coding
17022025dd24206-c7a5-4c74-84f2-bca97f1ee54b_19,51550,"reading, run_build_debug","actions_action.invoked_ReplaceInPath , find_sea...",coding
18021914fb66f2e-7401-47d9-ba3d-82355f4fb4ff_5,9638,coding,"actions_action.invoked_Replace , find_search.se...",reading
0303201c3f6c427-233f-4f51-b0d2-7705e5ce1c0c_20,39939,"coding, run_build_debug","completion_finished_finished , completion_finis...",run_build_debug
07022029dbfaf58-a978-4537-a29a-d1fd085c301a_11,19857,"Coding, run_build_debug","file.types.usage_edit_JAVA , actions_action.inv...",run_build_debug
1607191e9e88921-2f6b-4f73-8d24-721dc1efd7e4_41,84248,"Coding, run_build_debug","file.types.usage_edit_JAVA , productivity_featu...",run_build_debug
1106192faddd4b1-126e-4113-bf5d-ea7c76c50819_83,44639,"coding, run_build_debug","file.types.usage_open_JAVA , actions_action.inv...",run_build_debug


In [12]:
for (i in 10 until 400 step 10) {
    val bow = BoW(i)
    bow.initialize(train)

    val (X_bow, y_bow) = bow.transform(train)
    val (X_bow_test, y_bow_test) = bow.transform(test)
    val classificator = NaiveBayesCounter(1e-4)
    classificator.fit(X_bow, y_bow)

    val ans = classificator.predict(X_bow_test)
    var tot = 0.0
    var TP = 0.0
    var FN = 0.0
    var FP = 0.0

    for(i in y_bow_test.indices) {
        if (y_bow_test[i].toLowerCase().contains(ans[i])) {
            tot += 1
        }
    }
    res.add(tot / y_bow_test.size)
    x.add(i)
}

Variable 'TP' is never used
Variable 'FN' is never used
Variable 'FP' is never used
Name shadowed: i
None of the following functions can be called with the arguments supplied: 
public operator fun CharSequence.contains(char: Char, ignoreCase: Boolean = ...): Boolean defined in kotlin.text
public operator fun CharSequence.contains(other: CharSequence, ignoreCase: Boolean = ...): Boolean defined in kotlin.text
public inline operator fun CharSequence.contains(regex: Regex): Boolean defined in kotlin.text

In [34]:
var (rec, prec, x) = testPrecisionRecall("coding")

In [72]:
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [78]:
var (rec, prec, x) = testPrecisionRecall("ide_start", 0.1)

In [79]:
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [80]:
var (rec, prec, x) = testPrecisionRecall("ide_close", 0.1)

In [81]:
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [82]:
var (rec, prec, x) = testPrecisionRecall("vcs")

In [83]:
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [84]:
var (rec, prec, x) = testPrecisionRecall("reading")

In [85]:
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [86]:
var (rec, prec, x) = testPrecisionRecall("rdb", 0.1)

In [87]:
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [82]:
val test_ans = test.addColumn("pred") {ans}

In [87]:
test_ans.row(1)

{session_id=24022011a9051c2-3d2d-4a1a-8fea-4455e9cc25d9_15, ms=47125, Category=reading, coding, events=actions_action.invoked_SearchEverywhere , productivity_feature.used_SearchEverywhere , searchEverywhere_dialogOpen_dialogOpen , actions_custom.action.invoked_DoubleShortcut , actions_action.invoked_com.intellij.ui.ScrollingUtil$6 , actions_action.invoked_com.intellij.ui.ScrollingUtil$6 , actions_action.invoked_com.intellij.ui.ScrollingUtil$6 , actions_action.invoked_com.intellij.ui.ScrollingUtil$7 , actions_action.invoked_com.intellij.ui.ScrollingUtil$6 , actions_action.invoked_com.intellij.ui.ScrollingUtil$7 , actions_action.invoked_com.intellij.ui.ScrollingUtil$6 , actions_action.invoked_com.intellij.ui.ScrollingUtil$6 , actions_action.invoked_com.intellij.ui.ScrollingUtil$6 , actions_action.invoked_com.intellij.ui.ScrollingUtil$7 , actions_action.invoked_com.intellij.ui.ScrollingUtil$7 , actions_action.invoked_com.intellij.ui.ScrollingUtil$7 , actions_action.invoked_com.intellij.ui

In [84]:
test_ans.filter {it["category"].isMatching{} }

Unresolved reference. None of the following candidates is applicable because of receiver type mismatch: 
public inline operator fun <T : Any, R : Iterable<String>> ???.contains(element: String?): Boolean where R : ClosedRange<String> defined in kotlin.ranges
public operator fun <T> Array<out String>.contains(element: String): Boolean defined in kotlin.collections
public operator fun BooleanArray.contains(element: Boolean): Boolean defined in kotlin.collections
public operator fun ByteArray.contains(element: Byte): Boolean defined in kotlin.collections
public operator fun CharArray.contains(element: Char): Boolean defined in kotlin.collections
public operator fun CharSequence.contains(char: Char, ignoreCase: Boolean = ...): Boolean defined in kotlin.text
public operator fun CharSequence.contains(other: CharSequence, ignoreCase: Boolean = ...): Boolean defined in kotlin.text
public inline operator fun CharSequence.contains(regex: Regex): Boolean defined in kotlin.text
public operator fun