In [1]:
%use kotlin-statistics, krangl, kravis, lets-plot

In [2]:
import kotlin.math.*

In [3]:
@file:Repository("https://dl.bintray.com/kyonifer/maven")
@file:DependsOn("com.kyonifer:koma-core-ejml:0.12")
@file:DependsOn("com.kyonifer:koma-plotting:0.12")

In [4]:
import koma.extensions.*
import koma.*

In [5]:
/**
 * Reads a TSV file, shuffles its rows and cuts the shuffled frame into a training and a test part.
 *
 * @param path location of the TSV file handed to `DataFrame.readTSV`.
 * @param fraction share of the rows that goes to the training part; the resulting row count is rounded up.
 * @return the training frame first, the test frame second.
 */
fun trainTestSplit(path: String, fraction: Double = 0.8): Pair<DataFrame, DataFrame> {
    val source = DataFrame.readTSV(path)
    val trainRows = kotlin.math.ceil(fraction * source.nrow).toInt()
    val shuffled = source.shuffle()

    // NOTE(review): these ranges only partition the frame if krangl's `slice` uses one-based row
    // positions (position 0 then matches nothing) — confirm against the krangl version in use.
    val trainPart = shuffled.slice(0..trainRows)
    val testPart = shuffled.slice(trainRows + 1..shuffled.nrow)
    return trainPart to testPart
}

In [142]:
/**
 * Bag-of-words feature builder over the `events` column of a data frame.
 *
 * Each `events` cell is split on `" , "`; runs of [n] consecutive events are joined into one n-gram.
 *
 * @param limit an n-gram enters the vocabulary only when it was counted strictly more than this many times.
 * @param n number of consecutive events that make up one n-gram.
 */
class BoW(private val limit: Int = 5, private val n: Int = 1) {

    /** N-gram -> feature column index; replaced on every call to [initialize]. */
    var voc: Map<String, Int> = HashMap()

    /** N-gram -> occurrence count; [initialize] adds to whatever is already stored here. */
    var eventsMap: MutableMap<String, Int> = HashMap()

    /**
     * Counts the n-grams of every row of [df] into [eventsMap] and rebuilds [voc] from the n-grams
     * whose count exceeds `limit`, numbered by descending count and then alphabetically.
     */
    fun initialize(df: DataFrame) {
        val eventsColumn = df["events"]
        for (row in 0 until eventsColumn.length) {
            for (gram in ngrams(eventsColumn[row].toString())) {
                eventsMap[gram] = (eventsMap[gram] ?: 0) + 1
            }
        }

        // Earlier variant, kept for reference: keep the `limit` most frequent n-grams instead of thresholding.
//         val realLimit = if (eventsMap.size > limit) limit else eventsMap.size
//         voc = eventsMap.toSortedMap(compareBy({ eventsMap[it]?.times(-1) }, { it })).keys.toList()
//             .slice(0 until realLimit).withIndex().toList().associate {it.value to it.index}
        val frequentGrams = eventsMap.filter { it.value > limit }.keys
        // Most frequent first; the n-gram text itself breaks ties so the numbering is deterministic.
        val byCountThenText = compareBy<String>({ eventsMap[it]?.times(-1) }, { it })
        voc = frequentGrams.sortedWith(byCountThenText)
            .withIndex()
            .associate { it.value to it.index }
    }

    /**
     * Builds the feature matrix and the label list for [df].
     *
     * Columns `0 until voc.size` hold n-gram counts, column `voc.size` holds the `ms` value (cast to Int)
     * and column `voc.size + 1` holds the number of events in the row.
     *
     * @return the feature matrix paired with the `Category` cell of each row as a string.
     */
    fun transform(df: DataFrame): Pair<koma.matrix.Matrix<Double>, List<String>> {
        val msColumn = voc.size
        val lengthColumn = voc.size + 1
        val features = zeros(df.nrow, voc.size + 2)
        val labels = ArrayList<String>()

        for (row in 0 until df.nrow) {
            val rawEvents = df["events"][row].toString()
            features[row, msColumn] = df["ms"][row] as Int
            features[row, lengthColumn] = rawEvents.split(" , ").size
            labels.add(df["Category"][row].toString())

            for (gram in ngrams(rawEvents)) {
                // N-grams outside the vocabulary contribute nothing.
                val column = voc[gram] ?: continue
                features[row, column] += 1
            }
        }
        return Pair(features, labels)
    }

    /**
     * Same layout as [transform] but counts single events only (no n-gram support) and returns no labels.
     */
    fun transform2(df: DataFrame): koma.matrix.Matrix<Double> {
        val msColumn = voc.size
        val lengthColumn = voc.size + 1
        val features = zeros(df.nrow, voc.size + 2)

        for (row in 0 until df.nrow) {
            val tokens = df["events"][row].toString().split(" , ")
            features[row, msColumn] = df["ms"][row] as Int
            features[row, lengthColumn] = tokens.size
            for (token in tokens) {
                val column = voc[token] ?: continue
                features[row, column] += 1
            }
        }
        return features
    }

    /** Splits one raw `events` cell on `" , "` and joins every window of `n` consecutive events. */
    private fun ngrams(rawEvents: String): List<String> {
        val tokens = rawEvents.split(" , ")
        return (0 until tokens.size - n + 1).map { start ->
            tokens.slice(start..start + n - 1).joinToString()
        }
    }
}

In [144]:
/**
 * Multi-label naive-Bayes-style classifier over binned feature counts.
 *
 * Every feature value is placed into one of 11 bins: the values 0..9 each get their own bin and
 * everything from 10 upwards shares the last one. The final two columns of every feature matrix
 * are ignored by both [fit] and [logProb].
 *
 * @param alpha additive smoothing placed in every bin before counting; also the fallback prior
 *   used by [logProb] for a class without a stored prior.
 * @param p probability threshold above which [predict] emits a class for a row.
 */
class NaiveBayesCounter(private val alpha: Double = 1e-5, val p: Double = 0.05) {

    /** Class name -> share of training rows carrying that class. */
    private val classProb: MutableMap<String, Double> = HashMap()

    /** Class names in order of first appearance during [fit]; column order of [logProb]. */
    public var classes: MutableList<String> = ArrayList()

    /** Class name -> (feature x bin) table of smoothed, normalised bin frequencies. */
    private val probs: MutableMap<String, koma.matrix.Matrix<Double>> = HashMap()

    /**
     * Learns class priors and per-feature bin frequencies.
     *
     * @param X feature matrix, one row per sample; the last two columns are skipped.
     * @param y one label string per row of [X]; it is lower-cased and split on `", "` so a row
     *   may belong to several classes.
     */
    fun fit(X: koma.matrix.Matrix<Double>, y: List<String>) {
        // Start from scratch so that fitting twice does not add new counts onto already normalised tables.
        classProb.clear()
        classes.clear()
        probs.clear()

        val featureCount = X.shape()[1] - 2
        val priorStep = 1.0 / y.size

        for (i in y.indices) {
            for (cls in y[i].toLowerCase().split(", ")) {
                if (cls !in probs) {
                    classes.add(cls)
                    probs[cls] = ones(featureCount, BIN_COUNT) * alpha
                }
                classProb[cls] = (classProb[cls] ?: 0.0) + priorStep

                val counts = probs.getValue(cls)
                for (j in 0 until featureCount) {
                    counts[j, binOf(X[i, j])] += 1
                }
            }
        }

        // Without feature rows there is no row 0 to read the normaliser from and nothing to normalise.
        if (featureCount > 0) {
            for (cls in classes) {
                val counts = probs.getValue(cls)
                // Every feature row receives exactly one increment per (sample, class) pair, so row 0's
                // total is the total of every row. Use it untruncated so the smoothing mass is included.
                val rowTotal = counts.getRow(0).elementSum()
                probs[cls] = counts * (1.0 / rowTotal)
            }
        }
    }

    /**
     * Scores every row of [X] against every fitted class.
     *
     * Despite the name, the returned values are not logarithms: the per-class log scores are
     * normalised across the classes of each row, so every row of the result sums to one.
     *
     * @return matrix with one row per row of [X] and one column per entry of [classes].
     */
    fun logProb(X: koma.matrix.Matrix<Double>): koma.matrix.Matrix<Double> {
        val rowCount = X.shape()[0]
        val featureCount = X.shape()[1] - 2
        val scores = zeros(rowCount, classes.size)

        for (i in 0 until rowCount) {
            for ((j, cls) in classes.withIndex()) {
                // Look the table up once per class instead of once per feature.
                val binProbs = probs.getValue(cls)
                var score = kotlin.math.ln(classProb.getOrDefault(cls, alpha))
                for (k in 0 until featureCount) {
                    score += kotlin.math.ln(binProbs[k, binOf(X[i, k])])
                }
                scores[i, j] = score
            }
        }

        val classCount = scores.shape()[1]
        // 1 / sum_k exp(s_k - s_j) equals exp(s_j) / sum_k exp(s_k); mean * classCount is that sum.
        return fill(rowCount, classCount) { i, j ->
            1 / (exp(scores[i, 0..classCount - 1] - scores[i, j]).mean() * classCount)
        }
    }

    /**
     * Returns, for every row of [X], the classes whose normalised probability from [logProb] exceeds [p].
     */
    fun predict(X: koma.matrix.Matrix<Double>): ArrayList<ArrayList<String>> {
        val posterior = logProb(X)
        val labels = ArrayList<ArrayList<String>>()
        for (i in 0 until posterior.shape()[0]) {
            labels.add(ArrayList<String>())
        }
        posterior.forEachIndexed { row, col, value ->
            if (value > p) {
                labels[row].add(classes[col])
            }
        }
        return labels
    }

    /** Maps a feature value to its bin: the truncated value below 10, otherwise the shared overflow bin. */
    private fun binOf(value: Double): Int = if (value < 10.0) value.toInt() else OVERFLOW_BIN

    private companion object {
        /** Index of the bin shared by all values of 10 and above. */
        const val OVERFLOW_BIN = 10

        /** Bins 0..9 plus the overflow bin. */
        const val BIN_COUNT = OVERFLOW_BIN + 1
    }
}

In [154]:
// Shuffle the TSV and split it with the default fraction; `train` and `test` are read by later cells.
val (train, test) = trainTestSplit("../data/test/test_data.tsv")

In [156]:
// Vocabulary of single events (n = 1) counted more than 400 times in the training split.
val bow = BoW(400, 1)
bow.initialize(train)


In [157]:
// Feature matrix and label strings for the training split.
val (X_bow, y_bow) = bow.transform(train)

In [158]:
// Smoothing alpha = 1e-4, prediction threshold p = 0.1; fitted on the training features.
val classificator = NaiveBayesCounter(1e-4, 0.1)
classificator.fit(X_bow, y_bow)

In [162]:
// Show the first class name the classifier encountered during fit.
classificator.classes[0]

reading

In [118]:
/**
 * Sweeps the BoW count threshold and records recall and precision for one category on the
 * held-out `test` split.
 *
 * For every threshold in `10 until top step 10` a fresh [BoW] vocabulary and a fresh
 * [NaiveBayesCounter] are built from the notebook-level `train` frame and then scored on the
 * notebook-level `test` frame. A row counts as belonging to [cat] when its lower-cased label
 * contains [cat] as a substring; a prediction counts when the joined predicted classes do.
 *
 * @param cat category name to evaluate (substring match).
 * @param p probability threshold passed to [NaiveBayesCounter].
 * @param top exclusive upper bound of the threshold sweep.
 * @param n n-gram length passed to [BoW].
 * @return recall values, precision values and the thresholds they belong to, in that order.
 *   A metric is NaN when its denominator is zero (no positives / no predicted positives).
 */
fun testPrecisionRecall(cat: String="coding", p: Double=0.1, top: Int=1000, n: Int=1): Triple<ArrayList<Double>, ArrayList<Double>, ArrayList<Int>> {
    val rec = ArrayList<Double>()
    val prec = ArrayList<Double>()
    val x = ArrayList<Int>()

    for (threshold in 10 until top step 10) {
        val bow = BoW(threshold, n)
        bow.initialize(train)

        val (trainFeatures, trainLabels) = bow.transform(train)
        val (testFeatures, testLabels) = bow.transform(test)
        val classificator = NaiveBayesCounter(1e-4, p)
        classificator.fit(trainFeatures, trainLabels)

        // Score on the held-out rows: the test split was previously transformed but never used,
        // so the reported metrics described the training data instead.
        val predicted = classificator.predict(testFeatures)

        var truePositives = 0.0
        var falseNegatives = 0.0
        var falsePositives = 0.0
        for (row in testLabels.indices) {
            val isActual = testLabels[row].toLowerCase().contains(cat)
            val isPredicted = predicted[row].joinToString(", ").contains(cat)
            when {
                isActual && isPredicted -> truePositives += 1
                isActual -> falseNegatives += 1
                isPredicted -> falsePositives += 1
            }
        }

        rec.add(truePositives / (truePositives + falseNegatives))
        prec.add(truePositives / (truePositives + falsePositives))
        x.add(threshold)
    }
    return Triple(rec, prec, x)
}

In [108]:
// Shared settings for the testPrecisionRecall calls in the cells below:
// exclusive upper bound of the threshold sweep and the n-gram length.
var top = 1500
var n = 2

In [119]:
// Recall, precision and swept thresholds for the "coding" category.
var (rec, prec, x) = testPrecisionRecall("coding", top=top, n=n)

In [120]:
// Plot the current `prec` (red) and `rec` (blue) against the thresholds in `x`.
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
// The red line uses the plot's own data (precision); the blue line overrides it with df2 (recall).
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [148]:
// NOTE(review): leftover scratch cell — no name `classi` is defined in the visible notebook; safe to delete?
classi

Unresolved reference: classes

In [121]:
// Recall, precision and swept thresholds for the "reading" category.
var (rec, prec, x) = testPrecisionRecall("reading", top=top, n=n)

In [122]:
// Plot the current `prec` (red) and `rec` (blue) against the thresholds in `x`.
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
// The red line uses the plot's own data (precision); the blue line overrides it with df2 (recall).
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [83]:
// Recall, precision and swept thresholds for the "rdb" category.
var (rec, prec, x) = testPrecisionRecall("rdb", top=top, n=n)

In [84]:
// Plot the current `prec` (red) and `rec` (blue) against the thresholds in `x`.
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
// The red line uses the plot's own data (precision); the blue line overrides it with df2 (recall).
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [86]:
// Recall, precision and swept thresholds for the "vcs" category.
var (rec, prec, x) = testPrecisionRecall("vcs", top=top, n=n)

In [87]:
// Plot the current `prec` (red) and `rec` (blue) against the thresholds in `x`.
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
// The red line uses the plot's own data (precision); the blue line overrides it with df2 (recall).
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [89]:
// Recall, precision and swept thresholds for the "ide_start" category.
var (rec, prec, x) = testPrecisionRecall("ide_start", top=top, n=n)

In [90]:
// Plot the current `prec` (red) and `rec` (blue) against the thresholds in `x`.
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
// The red line uses the plot's own data (precision); the blue line overrides it with df2 (recall).
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [92]:
// Recall, precision and swept thresholds for the "ide_close" category.
var (rec, prec, x) = testPrecisionRecall("ide_close", top=top, n=n)

In [93]:
// Plot the current `prec` (red) and `rec` (blue) against the thresholds in `x`.
val df  = mapOf("x" to x, "y" to prec)
val df2 = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
// The red line uses the plot's own data (precision); the blue line overrides it with df2 (recall).
p + geom_line(color="red") + geom_line(data=df2, color="blue")

In [82]:
// Recall-only line plot: whatever `rec` and `x` currently hold in the notebook session.
val df = mapOf<String, Any>("x" to x, "y" to rec )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [85]:
// Recall-only line plot: whatever `rec` and `x` currently hold in the notebook session.
val df = mapOf<String, Any>("x" to x, "y" to rec )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [94]:
// Recall-only line plot: whatever `rec` and `x` currently hold in the notebook session.
val df = mapOf<String, Any>("x" to x, "y" to rec )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [91]:
// Recall-only line plot: whatever `rec` and `x` currently hold in the notebook session.
val df = mapOf<String, Any>("x" to x, "y" to rec )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [88]:
// Recall-only line plot: whatever `rec` and `x` currently hold in the notebook session.
val df = mapOf<String, Any>("x" to x, "y" to rec )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()