In [1]:
%use kotlin-statistics, krangl, kravis

In [2]:
import kotlin.math.*

In [3]:
@file:Repository("https://dl.bintray.com/kyonifer/maven")
@file:DependsOn("com.kyonifer:koma-core-ejml:0.12")
@file:DependsOn("com.kyonifer:koma-plotting:0.12")

In [4]:
import koma.extensions.*
import koma.*

In [5]:
%use lets-plot

In [6]:
/**
 * Loads a TSV file, shuffles its rows and cuts the result into a training and a test frame.
 *
 * @param path location of the TSV file to read.
 * @param fraction share of the rows that goes to the training frame; the row count is rounded up.
 * @return the training frame first, the test frame second.
 */
fun trainTestSplit(path: String, fraction: Double = 0.8): Pair<DataFrame, DataFrame> {
    val source = DataFrame.readTSV(path)
    val shuffled = source.shuffle()
    val trainCount = kotlin.math.ceil(fraction * source.nrow).toInt()
    // NOTE(review): these two ranges only partition the rows cleanly if krangl's slice() is 1-based — confirm.
    val trainPart = shuffled.slice(0..trainCount)
    val testPart = shuffled.slice((trainCount + 1)..shuffled.nrow)
    return trainPart to testPart
}

In [7]:
/**
 * Bag-of-words encoder for the `events` column of a session [DataFrame].
 *
 * [initialize] builds a vocabulary of the most frequent event names; [transform] and [transform2]
 * turn a frame into a matrix of per-event counts followed by two extra columns.
 *
 * @param limit maximum number of distinct event names kept in the vocabulary.
 */
class BoW(private val limit: Int = 100) {

    /** Event name -> matrix column index. Empty until [initialize] has been called. */
    var voc: Map<String, Int> = HashMap()

    /**
     * Rebuilds [voc] from the `events` column of [df].
     *
     * Event names are ranked by descending number of occurrences, ties broken by name; the first
     * [limit] names are kept and numbered in ranking order.
     */
    fun initialize(df: DataFrame) {
        val events = df["events"]
        val counts = HashMap<String, Int>()
        for (i in 0 until events.length) {
            for (ev in splitEvents(events[i])) {
                counts[ev] = (counts[ev] ?: 0) + 1
            }
        }
        val ranking = compareByDescending<String> { counts.getValue(it) }.thenBy { it }
        voc = counts.keys
            .sortedWith(ranking)
            // A negative limit yields an empty vocabulary, as the previous range-based slicing did.
            .take(limit.coerceAtLeast(0))
            .withIndex()
            .associate { it.value to it.index }
    }

    /**
     * Encodes [df] and also extracts its labels.
     *
     * @return the feature matrix described on [transform2], paired with the string form of the
     *         `category` cell of every row.
     */
    fun transform(df: DataFrame): Pair<koma.matrix.Matrix<Double>, List<String>> {
        val mat = encode(df)
        val categories = df["category"]
        val label = (0 until df.nrow).map { categories[it].toString() }
        return Pair(mat, label)
    }

    /**
     * Encodes [df] without reading a `category` column.
     *
     * @return a matrix with one row per frame row and `voc.size + 2` columns: the count of each
     *         vocabulary event, then the `min` cell, then the number of events in the row.
     */
    fun transform2(df: DataFrame): koma.matrix.Matrix<Double> = encode(df)

    /** Shared feature builder behind [transform] and [transform2]. */
    private fun encode(df: DataFrame): koma.matrix.Matrix<Double> {
        val width = voc.size
        val mat = zeros(df.nrow, width + 2)
        val minutes = df["min"]
        val events = df["events"]
        for (i in 0 until df.nrow) {
            // Split once per row; the token list feeds both the size feature and the counts.
            val tokens = splitEvents(events[i])
            mat[i, width] = minutes[i] as Int
            mat[i, width + 1] = tokens.size
            for (event in tokens) {
                // Events outside the vocabulary are ignored.
                val col = voc[event] ?: continue
                mat[i, col] += 1
            }
        }
        return mat
    }

    /** Splits the string form of an `events` cell on the " , " separator. */
    private fun splitEvents(cell: Any?): List<String> = cell.toString().split(" , ")

}

In [8]:
/**
 * Multi-label naive Bayes classifier over bucketed feature counts.
 *
 * Every feature value is mapped to a bucket: its integer part when below 10, bucket 10 otherwise.
 * For each class the model keeps a (features x 11) table of bucket probabilities. The last two
 * columns of every input matrix are not used as features.
 *
 * @param alpha pseudo-count each bucket starts with; also the fallback prior for a class without one.
 * @param p probability threshold a class must exceed to be reported by [predict].
 */
class NaiveBayesCounter(private val alpha: Double = 1e-5, val p: Double = 0.05) {

    private val bucketCount = 11
    private val lastBucket = 10

    private val classProb: MutableMap<String, Double> = HashMap()
    private val classes: MutableList<String> = ArrayList()
    private val probs: MutableMap<String, koma.matrix.Matrix<Double>> = HashMap()

    /**
     * Fits the model from scratch.
     *
     * @param X feature matrix, one row per sample; all but the last two columns are used.
     * @param y one label string per row; it is lower-cased and split on ", " so a row may belong
     *          to several classes.
     */
    fun fit(X: koma.matrix.Matrix<Double>, y: List<String>) {
        // Drop anything learned earlier: adding raw counts to already-normalised tables would corrupt them.
        classProb.clear()
        classes.clear()
        probs.clear()

        val featureCount = X.shape()[1] - 2
        val priorStep = 1.0 / y.size
        for (i in y.indices) {
            for (cls in y[i].toLowerCase().split(", ")) {
                if (cls !in classProb) {
                    classes.add(cls)
                }
                classProb[cls] = (classProb[cls] ?: 0.0) + priorStep
                val table = probs.getOrPut(cls) { ones(featureCount, bucketCount) * alpha }
                for (j in 0 until featureCount) {
                    table[j, bucketOf(X[i, j])] += 1
                }
            }
        }

        for (cls in classes) {
            val table = probs.getValue(cls)
            // Each sample adds exactly one count to every feature row, so all rows share the same
            // total and row 0 is representative. The total is kept as a Double so the pseudo-counts
            // are part of the normaliser and each row sums to 1.
            val rowTotal = table.getRow(0).elementSum()
            probs[cls] = table / rowTotal
        }
    }

    /**
     * Scores every row of [X] against every fitted class.
     *
     * Despite the name, the returned values are not logarithms: the log prior and the log bucket
     * probabilities are summed per class and then normalised across classes, so every row of the
     * result sums to 1.
     *
     * @return a (rows x classes) matrix; column order follows the order classes were first seen in [fit].
     */
    fun logProb(X: koma.matrix.Matrix<Double>): koma.matrix.Matrix<Double> {
        val rows = X.shape()[0]
        val featureCount = X.shape()[1] - 2
        val logJoint = zeros(rows, classes.size)
        for (j in classes.indices) {
            val cls = classes[j]
            val logPrior = kotlin.math.ln(classProb.getOrDefault(cls, alpha))
            val table = probs.getValue(cls)
            for (i in 0 until rows) {
                var total = logPrior
                for (k in 0 until featureCount) {
                    total += kotlin.math.ln(table[k, bucketOf(X[i, k])])
                }
                logJoint[i, j] = total
            }
        }
        // 1 / sum_k exp(a_k - a_j) equals exp(a_j) / sum_k exp(a_k): a softmax over the classes.
        return fill(rows, classes.size) { i, j ->
            var denominator = 0.0
            for (k in classes.indices) {
                denominator += kotlin.math.exp(logJoint[i, k] - logJoint[i, j])
            }
            1.0 / denominator
        }
    }

    /**
     * Predicts the classes of every row of [X].
     *
     * @return for each row, the names of all classes whose value from [logProb] is greater than [p];
     *         the list is empty when no class passes the threshold.
     */
    fun predict(X: koma.matrix.Matrix<Double>): ArrayList<ArrayList<String>> {
        val posterior = logProb(X)
        val ans = ArrayList<ArrayList<String>>()
        repeat(posterior.shape()[0]) { ans.add(ArrayList()) }
        posterior.forEachIndexed { row, col, ele -> if (ele > p) { ans[row].add(classes[col]) } }
        return ans
    }

    /** Maps a feature value to its bucket index: the integer part below 10, otherwise the last bucket. */
    private fun bucketOf(value: Double): Int = if (value < 10.0) value.toInt() else lastBucket
}

In [47]:
// Shuffle the labelled sessions and split them with the default training fraction (0.8).
val (train, test) = trainTestSplit("../data/sh_sessions_multi.tsv")

In [10]:
/**
 * Measures recall and precision for one category over a sweep of vocabulary sizes.
 *
 * For every size a fresh [BoW] and [NaiveBayesCounter] (alpha = 1e-4) are fitted on the notebook's
 * `train` frame and evaluated on its `test` frame.
 *
 * A row counts as actually belonging to [cat] when its lower-cased label string contains [cat],
 * and as predicted when the ", "-joined predicted labels contain [cat] (substring match in both cases).
 *
 * @param cat lower-case category name to evaluate.
 * @param p probability threshold handed to the classifier.
 * @param limits vocabulary sizes to try, in order.
 * @return recall values, precision values and the vocabulary sizes they belong to, index-aligned.
 *         A ratio is NaN when its denominator is zero (0.0 / 0.0).
 */
fun testPrecisionRecall(
    cat: String = "coding",
    p: Double = 0.05,
    limits: IntProgression = 10 until 450 step 10
): Triple<ArrayList<Double>, ArrayList<Double>, ArrayList<Int>> {
    val rec = ArrayList<Double>()
    val prec = ArrayList<Double>()
    val x = ArrayList<Int>()
    for (limit in limits) {
        val bow = BoW(limit)
        bow.initialize(train)

        val (trainX, trainY) = bow.transform(train)
        val (testX, testY) = bow.transform(test)
        val classificator = NaiveBayesCounter(1e-4, p)
        classificator.fit(trainX, trainY)

        val ans = classificator.predict(testX)
        var truePositives = 0.0
        var falseNegatives = 0.0
        var falsePositives = 0.0

        for (row in testY.indices) {
            // Evaluate each side once instead of once per outcome check.
            val actual = testY[row].toLowerCase().contains(cat)
            val predicted = ans[row].joinToString(", ").contains(cat)
            when {
                actual && predicted -> truePositives += 1
                actual -> falseNegatives += 1
                predicted -> falsePositives += 1
            }
        }
        rec.add(truePositives / (truePositives + falseNegatives))
        prec.add(truePositives / (truePositives + falsePositives))
        x.add(limit)
    }
    return Triple(rec, prec, x)
}

In [12]:
// Accumulators filled by the vocabulary-size sweep: one score per run (res) and the limit it used (x).
val res = ArrayList<Double>()
val x = ArrayList<Int>()

In [144]:
// One-off model: 50-event vocabulary built from the training frame.
val bow = BoW(50)
bow.initialize(train)

// Feature matrices and label lists for both splits.
val (X_bow, y_bow) = bow.transform(train)
val (X_bow_test, y_bow_test) = bow.transform(test)
val classificator = NaiveBayesCounter(1e-4)
classificator.fit(X_bow, y_bow)

In [13]:
// Sweep the vocabulary size; for each size record the share of test rows for which at least one
// predicted label occurs in the row's true label string.
for (i in 10 until 400 step 10) {
    val bow = BoW(i)
    bow.initialize(train)

    val (X_bow, y_bow) = bow.transform(train)
    val (X_bow_test, y_bow_test) = bow.transform(test)
    val classificator = NaiveBayesCounter(1e-4)
    classificator.fit(X_bow, y_bow)

    val ans = classificator.predict(X_bow_test)
    var tot = 0.0

    for (row in y_bow_test.indices) {
        val truth = y_bow_test[row].toLowerCase()
        // predict() returns a list of labels per row, so each label is checked on its own;
        // passing the list itself to String.contains does not type-check.
        // NOTE(review): "any label matches" is one reading of the intended score — confirm.
        if (ans[row].any { truth.contains(it) }) {
            tot += 1
        }
    }
    res.add(tot / y_bow_test.size)
    x.add(i)
}

In [116]:
// Load the sessions to classify and the labelled sessions used for fitting.
val test = DataFrame.readTSV("../data/test_data2.tsv")
val train = DataFrame.readTSV("../data/sh_sessions_multi.tsv")

In [117]:
// Build a 300-event vocabulary from the training frame.
val bow = BoW(300)
bow.initialize(train)

In [118]:
// Feature matrix and label strings for training.
val (X_bow, y_bow) = bow.transform(train)

In [119]:
// transform2 builds the features without reading a `category` column.
val X_bow_test = bow.transform2(test)

In [120]:
val classificator = NaiveBayesCounter(1e-4)
classificator.fit(X_bow, y_bow)

// One list of predicted class names per test row.
val ans = classificator.predict(X_bow_test)

In [121]:
val cats = ArrayList<String>()

In [122]:
// Flatten each prediction list into a single ", "-separated string.
for (a in ans) {
    cats.add(a.joinToString(", "))
}

In [123]:
val catsCol = StringCol("category", cats)

In [124]:
// NOTE(review): the features came from test_data2.tsv but the predictions are attached to
// test_data.tsv — presumably the same sessions in the same order; confirm.
val real = DataFrame.readTSV("../data/test_data.tsv")

In [125]:
val res = real.addColumn("category") {catsCol}

In [102]:
import java.io.File


In [126]:
// NOTE(review): this writes to the same path `real` was read from, replacing that file.
res.writeTSV(File("../data/test_data.tsv"))

In [None]:
// NOTE(review): `cl` is not defined in any visible cell and this cell has no execution count —
// looks like an unfinished draft.
val cat = cl

In [51]:
// Reload a test frame and the labelled training frame.
val test = DataFrame.readTSV("../data/test.tsv")
val train = DataFrame.readTSV("../data/sh_sessions_multi.tsv")

In [73]:
// Display the training frame; the table that follows is the captured output.
train

session_id,start_time,sec,min,category,events
1305192621b4643-a1d4-4288-9cb2-9ece426a01f1_1,26.08.2019 10:28,164,2,"Coding, VCS","actions_Find , actions_com.intellij.find.Search..."
0104192bd9b36a1-98c7-4a5e-817d-313d69be0ba3_1,30.08.2019 20:16,66,1,Coding,"actions_FindInPath , toolwindow_Find , ui.dialo..."
0304192f4244c10-88fc-4a38-992b-28bf8ffcfb58_0,27.08.2019 6:18,11,0,"Ide start, Ide close","event.log_whitelist.loaded , lifecycle_ide.star..."
0905192c46ec682-3f47-4af1-b313-755dbfcfe1fa_4,27.08.2019 11:24,748,12,Coding,"toolwindow_Project , file.types.usage_JAVA , ac..."
1404192054d7ef6-857b-4fd3-9084-333d5eb67b73_27,30.08.2019 10:04,489,8,Coding,"productivity_SearchEverywhere , searchEverywher..."
0204192d92e01a5-cec3-4b1c-9b0c-d75793d8ce90_2,27.08.2019 16:45,512,8,Coding,"searchEverywhere_dialogOpen , actions_GotoClass..."
0404192fc756d0d-7c0c-493e-918d-ef51209587de_13,28.08.2019 22:14,636,10,RD,"actions_Run , toolbar_Run , run.configuration.e..."
03041910fd6df81-f137-4d92-911a-b6a839e13319_24,28.08.2019 8:01,11,0,Coding,"file.types.usage_YAML , actions_EditorCopy , ac..."
05051912837b479-c105-4077-8d4a-cb2c7811a592_13,30.08.2019 0:37,619,10,"Ide start, rd",ui.dialogs_com.intellij.openapi.progress.util.P...
020419340f7e4b9-195f-4d29-9e99-9619bf5efbf4_10,28.08.2019 11:54,221,3,"Coding, VCS","lifecycle_project.closed , toolwindow_sbt-shell..."


In [14]:
// Line plot of the sweep score (res) against the vocabulary limit (x).
val df = mapOf("x" to x, "y" to res )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [48]:
// Category "coding", default threshold: recall, precision and vocabulary limits.
var (rec, prec, x) = testPrecisionRecall("coding")

In [49]:
// Precision vs. vocabulary limit for "coding".
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [50]:
// Recall vs. vocabulary limit for "coding".
val df = mapOf<String, Any>("x" to x, "y" to rec )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [51]:
// Category "ide start", threshold 0.05.
var (rec, prec, x) = testPrecisionRecall("ide start", 0.05)

In [52]:
// Precision vs. vocabulary limit for "ide start".
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [53]:
// Recall vs. vocabulary limit for "ide start".
val df = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [54]:
// Category "ide close", threshold 0.1.
var (rec, prec, x) = testPrecisionRecall("ide close", 0.1)

In [55]:
// Recall vs. vocabulary limit for "ide close".
val df = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [56]:
// Precision vs. vocabulary limit for "ide close".
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [57]:
// Category "vcs", default threshold.
var (rec, prec, x) = testPrecisionRecall("vcs")

In [58]:
// Precision vs. vocabulary limit for "vcs".
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [59]:
// Recall vs. vocabulary limit for "vcs".
val df = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [60]:
// Category "terminal", default threshold.
var (rec, prec, x) = testPrecisionRecall("terminal")

In [61]:
// Recall vs. vocabulary limit for "terminal".
val df = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [62]:
// Precision vs. vocabulary limit for "terminal".
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [63]:
// Category "db", threshold 0.1.
var (rec, prec, x) = testPrecisionRecall("db", 0.1)

In [64]:
// Recall vs. vocabulary limit for "db".
val df = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [65]:
// Precision vs. vocabulary limit for "db".
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [67]:
// Category "rd", threshold 0.1.
var (rec, prec, x) = testPrecisionRecall("rd", 0.1)

In [68]:
// Recall vs. vocabulary limit for "rd".
val df = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [69]:
// Precision vs. vocabulary limit for "rd".
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()