In [1]:
%use kotlin-statistics, krangl, kravis, lets-plot

In [2]:
import kotlin.math.*

In [3]:
@file:Repository("https://dl.bintray.com/kyonifer/maven")
@file:DependsOn("com.kyonifer:koma-core-ejml:0.12")
@file:DependsOn("com.kyonifer:koma-plotting:0.12")

In [4]:
import koma.extensions.*
import koma.*

In [5]:
%use lets-plot

In [5]:
/**
 * Reads the TSV file at [path], shuffles its rows and cuts the result into a
 * training frame and a test frame.
 *
 * @param path location of the TSV file to load.
 * @param fraction share of the rows that goes to the training frame; the
 *   resulting row count is rounded up.
 * @return the pair (train, test).
 */
fun trainTestSplit(path: String, fraction: Double = 0.8): Pair<DataFrame, DataFrame> {
    val shuffled = DataFrame.readTSV(path).shuffle()
    val rowCount = shuffled.nrow
    val trainCount = kotlin.math.ceil(fraction * rowCount).toInt()

    // NOTE(review): these ranges only give disjoint, complete parts if
    // DataFrame.slice counts rows from 1 (index 0 then selects nothing) —
    // confirm against the krangl version in use.
    val trainRows = 0..trainCount
    val testRows = (trainCount + 1)..rowCount

    return shuffled.slice(trainRows) to shuffled.slice(testRows)
}

In [6]:
/**
 * Bag-of-words encoder for the `events` column of a data frame.
 *
 * Every `events` cell is converted to a string and split on `" , "`. The
 * vocabulary keeps at most [limit] distinct events, ranked by descending
 * frequency with ties broken by the event string itself; an event's rank is
 * its column index in the feature matrix.
 */
class BoW(private val limit: Int = 100) {

    /** Event string -> column index; empty until [initialize] has run. */
    private var voc: Map<String, Int> = HashMap()

    /** Counts the events found in [df] and rebuilds the vocabulary from them. */
    fun initialize(df: DataFrame) {
        val eventColumn = df["events"]
        val counts: MutableMap<String, Int> = HashMap()
        for (row in 0 until eventColumn.length) {
            eventColumn[row].toString().split(" , ").forEach { token ->
                counts[token] = (counts[token] ?: 0) + 1
            }
        }

        // Most frequent first; the token itself is the tie-breaker so the order is total.
        val ranking = compareBy<String>({ -counts.getValue(it) }, { it })
        val keepCount = minOf(limit, counts.size)
        voc = counts.keys
            .sortedWith(ranking)
            .slice(0 until keepCount)
            .mapIndexed { index, token -> token to index }
            .toMap()
    }

    /**
     * Encodes [df] as a feature matrix plus the list of `category` strings.
     *
     * The matrix has one row per data-frame row and `voc.size + 2` columns:
     * one occurrence count per vocabulary event, then the row's `min` value,
     * then the number of events in the row. Events outside the vocabulary are
     * only reflected in that last column.
     */
    fun transform(df: DataFrame): Pair<koma.matrix.Matrix<Double>, List<String>> {
        val minColumn = voc.size
        val eventCountColumn = voc.size + 1
        val features = zeros(df.nrow, voc.size + 2)
        val labels = ArrayList<String>()

        for (row in 0 until df.nrow) {
            features[row, minColumn] = df["min"][row] as Int
            val tokens = df["events"][row].toString().split(" , ")
            features[row, eventCountColumn] = tokens.size
            labels.add(df["category"][row].toString())
            for (token in tokens) {
                val column = voc[token] ?: continue
                features[row, column] += 1
            }
        }
        return Pair(features, labels)
    }

}

In [7]:
/**
 * Naive Bayes classifier over the count columns of a feature matrix.
 *
 * The last two columns of every matrix passed in are ignored: only columns
 * `0 until cols - 2` are used as event counts. Label strings are lower-cased
 * and split on `", "`, so one row may contribute to several classes.
 *
 * @param alpha initial value of every per-class feature count before the
 *   observed counts are added.
 */
class NaiveBayes(private val alpha: Double = 1e-5) {

    /** Class -> share of training rows carrying that class. */
    private val classProb: MutableMap<String, Double> = HashMap()

    /** Classes in order of first appearance; the index is the column in [logProb]'s result. */
    private val classes: MutableList<String> = ArrayList()

    /** Class -> per-feature probabilities (each array sums to 1 after [fit]). */
    private val probs: MutableMap<String, DoubleArray> = HashMap()

    /**
     * Estimates class shares and per-class feature probabilities from [X] and [y].
     *
     * Any previously fitted state is discarded first, so calling [fit] again
     * retrains from scratch instead of adding raw counts onto already
     * normalised probabilities.
     *
     * @param X feature matrix with one row per entry of [y].
     * @param y label string per row; may hold several classes separated by `", "`.
     */
    fun fit(X: koma.matrix.Matrix<Double>, y: List<String>) {
        classProb.clear()
        classes.clear()
        probs.clear()

        val featureCount = X.shape()[1] - 2
        val priorStep = 1.0 / y.size

        for (i in y.indices) {
            for (cls in y[i].toLowerCase().split(", ")) {
                if (cls !in classProb) {
                    classes.add(cls)
                }
                classProb[cls] = (classProb[cls] ?: 0.0) + priorStep

                val counts = probs.getOrPut(cls) { DoubleArray(featureCount) { alpha } }
                for (j in 0 until featureCount) {
                    counts[j] += X[i, j]
                }
            }
        }

        // Turn the smoothed counts of each class into probabilities.
        for (cls in classes) {
            val counts = probs.getValue(cls)
            val den = counts.sum()
            for (j in counts.indices) {
                counts[j] /= den
            }
        }
    }

    /**
     * Computes, for every row of [X] and every fitted class, the log class share
     * plus the count-weighted sum of log feature probabilities.
     *
     * @return matrix with one row per row of [X] and one column per class, in
     *   the order the classes were first seen by [fit].
     */
    fun logProb(X: koma.matrix.Matrix<Double>): koma.matrix.Matrix<Double> {
        val rowCount = X.shape()[0]
        val featureCount = X.shape()[1] - 2
        val ans = zeros(rowCount, classes.size)

        for (i in 0 until rowCount) {
            for ((j, cls) in classes.withIndex()) {
                val featureProbs = probs.getValue(cls)
                var score = kotlin.math.ln(classProb.getValue(cls))
                for (k in 0 until featureCount) {
                    val count = X[i, k]
                    // A zero count contributes nothing; skipping it also avoids
                    // 0 * ln(0) = NaN when alpha is 0.
                    if (count != 0.0) {
                        score += count * kotlin.math.ln(featureProbs[k])
                    }
                }
                ans[i, j] = score
            }
        }
        return ans
    }

    /**
     * Returns, for every row of [X], the single class with the highest [logProb] score.
     * The returned class names are lower-case because [fit] lower-cases labels.
     */
    fun predict(X: koma.matrix.Matrix<Double>): List<String> {
        val scores = logProb(X)
        val ans = ArrayList<String>()

        scores.forEachRow { ans.add(classes[it.argMax()]) }
        return ans
    }
}

In [13]:
// Split the session TSV into the train/test frames used by the evaluation cells,
// relying on trainTestSplit's default fraction.
val (train, test) = trainTestSplit("../data/sh_sessions_multi.tsv")

In [14]:
/**
 * Measures recall and precision for one category over a sweep of vocabulary sizes.
 *
 * For every limit a fresh [BoW] is built on the notebook-level `train` frame,
 * a [NaiveBayes] (alpha = 1e-4) is fitted on it and evaluated on `test`.
 * A test row counts as positive when its lower-cased label contains the
 * category; a prediction counts as positive when the predicted class contains it.
 *
 * @param cat category to evaluate; compared case-insensitively against the labels.
 * @param limits vocabulary limits to try, in order.
 * @return (recall values, precision values, limits used), index-aligned.
 *   A ratio is NaN when its denominator is zero (no positives / no positive predictions).
 */
fun testPrecisionRecall(
    cat: String = "coding",
    limits: IntProgression = 10 until 450 step 10
): Triple<ArrayList<Double>, ArrayList<Double>, ArrayList<Int>> {
    // Labels and predicted classes are lower-case, so the category must be too.
    val wanted = cat.toLowerCase()
    val rec = ArrayList<Double>()
    val prec = ArrayList<Double>()
    val x = ArrayList<Int>()

    for (limit in limits) {
        val bow = BoW(limit)
        bow.initialize(train)

        val (trainFeatures, trainLabels) = bow.transform(train)
        val (testFeatures, testLabels) = bow.transform(test)
        val classifier = NaiveBayes(1e-4)
        classifier.fit(trainFeatures, trainLabels)
        val predicted = classifier.predict(testFeatures)

        var truePositives = 0.0
        var falseNegatives = 0.0
        var falsePositives = 0.0
        for (row in testLabels.indices) {
            val isActual = testLabels[row].toLowerCase().contains(wanted)
            val isPredicted = predicted[row].contains(wanted)
            when {
                isActual && isPredicted -> truePositives += 1
                isActual -> falseNegatives += 1
                isPredicted -> falsePositives += 1
            }
        }

        rec.add(truePositives / (truePositives + falseNegatives))
        prec.add(truePositives / (truePositives + falsePositives))
        x.add(limit)
    }
    return Triple(rec, prec, x)
}

In [15]:
// Accumulators for the vocabulary-size sweep: `res` receives one score per
// limit and `x` the limit that produced it.
val res = ArrayList<Double>()
val x = ArrayList<Int>()

In [16]:
// For each vocabulary limit, train on `train`, predict on `test`, and record the
// share of test rows whose lower-cased label contains the predicted class.
for (limit in 10 until 450 step 10) {
    val bow = BoW(limit)
    bow.initialize(train)

    val (X_bow, y_bow) = bow.transform(train)
    val (X_bow_test, y_bow_test) = bow.transform(test)
    val classificator = NaiveBayes(1e-4)
    classificator.fit(X_bow, y_bow)

    val ans = classificator.predict(X_bow_test)
    // A prediction is a hit when it appears anywhere in the row's label string.
    val hits = y_bow_test.indices.count { row ->
        y_bow_test[row].toLowerCase().contains(ans[row])
    }

    res.add(hits.toDouble() / y_bow_test.size)
    x.add(limit)
}

In [17]:
// Line plot of the scores in `res` against the vocabulary limits in `x`.
val df = mapOf("x" to x, "y" to res )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [18]:
// Recall/precision sweep for the "coding" category; rebinds `x` to the limits it returns.
var (rec, prec, x) = testPrecisionRecall("coding")

In [19]:
// Line plot of precision (`prec`) against vocabulary limit (`x`) from the latest sweep.
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [20]:
// Line plot of recall (`rec`) against vocabulary limit (`x`) from the latest sweep.
val df = mapOf<String, Any>("x" to x, "y" to rec )
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [21]:
// Recall/precision sweep for the "ide start" category.
var (rec, prec, x) = testPrecisionRecall("ide start")

In [22]:
// Line plot of precision (`prec`) against vocabulary limit (`x`) from the latest sweep.
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [23]:
// Line plot of recall (`rec`) against vocabulary limit (`x`) from the latest sweep.
val df = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [24]:
// Recall/precision sweep for the "ide close" category.
var (rec, prec, x) = testPrecisionRecall("ide close")

In [25]:
// Line plot of recall (`rec`) against vocabulary limit (`x`) from the latest sweep.
val df = mapOf("x" to x, "y" to rec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()

In [26]:
// Line plot of precision (`prec`) against vocabulary limit (`x`) from the latest sweep.
val df = mapOf("x" to x, "y" to prec)
val p = lets_plot(df) {x = "x"; y = "y"}
p + geom_line()