In [1]:
%use kotlin-statistics, krangl, kravis, lets-plot

In [2]:
import kotlin.math.*

In [3]:
@file:Repository("https://dl.bintray.com/kyonifer/maven")
@file:DependsOn("com.kyonifer:koma-core-ejml:0.12")
@file:DependsOn("com.kyonifer:koma-plotting:0.12")

In [4]:
import koma.extensions.*
import koma.*

In [5]:
/**
 * Reads a TSV file, shuffles its rows and cuts the shuffled frame into two parts.
 *
 * @param path location of the TSV file passed to `DataFrame.readTSV`
 * @param fraction share of the rows used for the first (training) part; the row count is rounded up
 * @return the training part and the test part, in that order
 */
fun trainTestSplit(path: String, fraction: Double = 0.8): Pair<DataFrame, DataFrame> {
    val shuffled = DataFrame.readTSV(path).shuffle()
    val rowTotal = shuffled.nrow
    val trainRows = kotlin.math.ceil(fraction * rowTotal).toInt()

    // NOTE(review): the ranges start at 0 and end at nrow, which only partitions the
    // frame without overlap or loss if `slice` uses 1-based row numbers — confirm.
    val trainPart = shuffled.slice(0..trainRows)
    val testPart = shuffled.slice(trainRows + 1..rowTotal)
    return trainPart to testPart
}

In [6]:
/**
 * Bag-of-n-grams vectorizer over the "events" column of a data frame.
 *
 * Each "events" cell is split on `" , "` into tokens; every run of [n] consecutive
 * tokens (joined with the default `joinToString` separator) is one vocabulary candidate.
 *
 * @param limit an n-gram enters the vocabulary only if it occurs more than `limit` times
 * @param n number of consecutive tokens per n-gram; must be at least 1
 */
class BoW(private val limit: Int = 5, private val n: Int = 1) {

    /** n-gram -> column index, rebuilt by [initialize]. */
    var voc: Map<String, Int> = HashMap()

    /** Raw occurrence count of every n-gram seen by the last [initialize] call. */
    var eventsMap: MutableMap<String, Int> = HashMap()

    /** Column index -> n-gram; ordered by descending count, ties broken by the n-gram text. */
    lateinit var id2word: List<String>

    init {
        require(n >= 1) { "n must be at least 1, got $n" }
    }

    /** Splits one "events" cell into its tokens. */
    private fun tokenize(cell: Any?): List<String> = cell.toString().split(" , ")

    /** All runs of [n] consecutive tokens, each joined into a single string. */
    private fun ngrams(tokens: List<String>): List<String> =
        tokens.windowed(n) { window -> window.joinToString() }

    /**
     * Builds the vocabulary from the "events" column of [df].
     *
     * Counts are reset first, so calling this again re-fits from scratch instead of
     * adding to the counts of a previous call.
     */
    fun initialize(df: DataFrame) {
        eventsMap.clear()

        val events = df["events"]
        for (i in 0 until events.length) {
            for (gram in ngrams(tokenize(events[i]))) {
                eventsMap[gram] = (eventsMap[gram] ?: 0) + 1
            }
        }

        id2word = eventsMap.entries
            .filter { it.value > limit }
            .sortedWith(compareByDescending<Map.Entry<String, Int>> { it.value }.thenBy { it.key })
            .map { it.key }

        voc = id2word.withIndex().associate { it.value to it.index }
    }

    /**
     * Turns [df] into a count matrix plus the list of labels.
     *
     * Columns `0 until voc.size` hold the n-gram counts of each row, column `voc.size`
     * holds the row's "ms" value and column `voc.size + 1` the number of tokens in its
     * "events" cell. Labels are the string form of the "Category" column.
     *
     * @return the matrix and the labels, row-aligned
     */
    fun transform(df: DataFrame): Pair<koma.matrix.Matrix<Double>, List<String>> {
        val mat = zeros(df.nrow, voc.size + 2)
        val label = ArrayList<String>()

        // Look each column up once instead of once per row.
        val msCol = df["ms"]
        val eventsCol = df["events"]
        val categoryCol = df["Category"]

        for (i in 0 until df.nrow) {
            val tokens = tokenize(eventsCol[i])

            // Accept any numeric column type rather than insisting on Int.
            mat[i, voc.size] = (msCol[i] as Number).toDouble()
            mat[i, voc.size + 1] = tokens.size.toDouble()
            label.add(categoryCol[i].toString())

            for (gram in ngrams(tokens)) {
                val col = voc[gram] ?: continue // n-grams outside the vocabulary are ignored
                mat[i, col] += 1
            }
        }
        return Pair(mat, label)
    }
}

In [7]:
val (train, test) = trainTestSplit("../data/test/test_data.tsv")

In [8]:
// Unigram vectorizer whose vocabulary is built from the training frame.
val bow = BoW(n = 1).also { it.initialize(train) }

In [66]:
/**
 * Two-class naive Bayes over clipped count features.
 *
 * Class 0 is "label contains [cat]" (case-insensitive), class 1 is everything else.
 * Every feature value is truncated to an integer and clipped to `0..10`, giving 11
 * bins per feature. The last two columns of every input matrix are ignored by all
 * methods.
 *
 * @param alpha additive smoothing applied to every (feature, bin) count
 * @param cat substring that marks a label as belonging to class 0
 */
class NaiveBayes(private val alpha: Double = 1e-5, val cat: String = "coding") {

    /** Share of training labels that fell into class 0; set by [fit]. */
    var p: Double = 0.0

    /** `probs[cls][feature, bin]`: smoothed bin probability per class; set by [fit]. */
    var probs: ArrayList<koma.matrix.Matrix<Double>> = ArrayList()

    /** Maps a feature value to its bin: truncated value below the cap, otherwise the cap. */
    private fun bin(value: Double): Int =
        if (value < MAX_COUNT.toDouble()) value.toInt() else MAX_COUNT

    private fun requireFitted() {
        check(probs.size >= 2) { "fit() must be called before predicting" }
    }

    /**
     * Estimates the class prior and the per-feature bin probabilities.
     *
     * Any state from an earlier call is discarded, so the model can be refitted.
     *
     * @param X one row per sample; all columns except the last two are used
     * @param y labels, row-aligned with [X]
     */
    fun fit(X: koma.matrix.Matrix<Double>, y: List<String>) {
        val featureCount = X.shape()[1] - 2

        probs.clear()
        repeat(2) { probs.add(ones(featureCount, BIN_COUNT) * alpha) }

        var positives = 0
        for (i in y.indices) {
            // Case-insensitive match without the deprecated, locale-sensitive toLowerCase().
            val isPositive = y[i].contains(cat, ignoreCase = true)
            if (isPositive) positives++
            val cls = if (isPositive) 0 else 1

            for (j in 0 until featureCount) {
                probs[cls][j, bin(X[i, j])] += 1
            }
        }
        p = positives.toDouble() / y.size

        for (cls in 0 until 2) {
            // Every sample adds exactly one count to each feature row, so all rows share
            // the same total (class size + BIN_COUNT * alpha). Dividing by it makes each
            // row sum to 1 and stays non-zero for an empty class as long as alpha > 0.
            val den = probs[cls].getRow(0).elementSum()
            probs[cls] = probs[cls] / den
        }
    }

    /**
     * Posterior probability of class 0 for every row of [X].
     *
     * Despite the name, the returned values are probabilities, not logarithms: the two
     * class log-scores are combined as `1 / (1 + exp(score1 - score0))`.
     */
    fun logProb(X: koma.matrix.Matrix<Double>): List<Double> {
        requireFitted()
        val (rows, cols) = X.shape()
        val featureCount = cols - 2

        return List(rows) { i ->
            var positive = kotlin.math.ln(p)
            var negative = kotlin.math.ln(1 - p)
            for (k in 0 until featureCount) {
                val ind = bin(X[i, k])
                positive += kotlin.math.ln(probs[0][k, ind])
                negative += kotlin.math.ln(probs[1][k, ind])
            }
            1.0 / (1.0 + kotlin.math.exp(negative - positive))
        }
    }

    /**
     * Per-feature log-likelihood ratio (class 0 minus class 1) for every row of [X].
     *
     * @return one list per row, holding one value per used feature column
     */
    fun logProbExpl(X: koma.matrix.Matrix<Double>): ArrayList<List<Double>> {
        requireFitted()
        val (rows, cols) = X.shape()
        val featureCount = cols - 2

        val result = ArrayList<List<Double>>(rows)
        for (i in 0 until rows) {
            result.add(List(featureCount) { k ->
                val ind = bin(X[i, k])
                kotlin.math.ln(probs[0][k, ind]) - kotlin.math.ln(probs[1][k, ind])
            })
        }
        return result
    }

    private companion object {
        /** Feature values at or above this are all placed in the last bin. */
        const val MAX_COUNT = 10

        /** Bins 0..MAX_COUNT. */
        const val BIN_COUNT = MAX_COUNT + 1
    }
}

In [67]:
val classificator = NaiveBayes(cat="reading")

In [11]:
val (X_bow, y_bow) = bow.transform(train)

In [12]:
val (X_bow_test, y_bow_test) = bow.transform(test)

In [68]:
classificator.fit(X_bow, y_bow)

In [69]:
val pred = classificator.logProbExpl(X_bow)

In [70]:
val pred2 = classificator.logProb(X_bow)

In [113]:
val cv = ArrayList<Pair<String, Double>>()

In [114]:
// Pair every vocabulary entry with the matching value from row 1 of `pred`.
bow.id2word.forEachIndexed { idx, word ->
    cv.add(Pair(word, pred[1][idx]))
}

In [112]:
pred2[1]

0.999999733862698

In [115]:
cv.sortBy {-it.second}

In [123]:
cv.slice(cv.size - 5 until  cv.size)

[(tooltip.action.events_execute_execute, -0.42744401482694006), (actions_action.invoked_EditorBackSpace, -0.6898082792944309), (file.types.usage_edit_JAVA, -0.8104362670830456), (actions_action.invoked_EditorChooseLookupItem, -1.2159013751912102), (productivity_feature.used_editing.completion.basic, -1.52605630349505)]

In [98]:
bow.voc.toSortedMap(compareBy({ pred[0][bow.voc.getOrDefault(it, 0)] }))

{vcs_fetch.started_fetch.started=65, actions_action.invoked_GotoDeclaration=26, file.types.usage_open_JAVA=23, productivity_feature.used_editing.select.word=6, actions_action.invoked_Find=45, actions_action.invoked_Back=60, ui.dialogs_close_com.intellij.find.impl.FindPopupPanel$2=59, actions_action.invoked_EditorCopy=14, find_search.session.started_FindInFile=47, find_search.session.started_FindInPath=71, actions_action.invoked_FindInPath=72, productivity_feature.used_SearchEverywhere=77, actions_action.invoked_NextOccurence=35, actions_action.invoked_third.party=62, searchEverywhere_contributorItemChosen_contributorItemChosen=86, actions_custom.action.invoked_DoubleShortcut=117, searchEverywhere_dialogClosed_dialogClosed=94, gutter.icon.click_clicked_implementedMethod=239, actions_action.invoked_com.intellij.find.SearchReplaceComponent$11=85, actions_action.invoked_com.intellij.codeInsight.daemon.NavigateAction=158, actions_action.invoked_com.intellij.ui.ScrollingUtil$12=142, actions_

In [71]:
// Series filled by the threshold sweep below: `nexs` receives z[0], `pexs` receives z[1].
val nexs: ArrayList<Double> = arrayListOf()
val pexs: ArrayList<Double> = arrayListOf()

In [73]:
val a =  ArrayList<Pair<Double, Int>>()

In [74]:
// Attach a 0/1 flag to every score: 1 when the label mentions "reading", 0 otherwise.
y_bow.forEachIndexed { idx, category ->
    val flag = if (category.toLowerCase().contains("reading")) 1 else 0
    a.add(Pair(pred2[idx], flag))
}

In [75]:
a.sortBy { it.first }

In [76]:
// `c` is the moving score threshold; `z` holds two running counters indexed by the 0/1 flag.
var c: Double = 0.01
var z: DoubleArray = doubleArrayOf(0.0, 0.0)

In [77]:
// Start both series with an initial 0.0 entry.
nexs.add(0.0)
pexs.add(0.0)

true

In [78]:
// Walk the (score, flag) pairs in their current order. Whenever a score exceeds the
// threshold, raise the threshold by one step and record both counters as they stood
// before this pair; the pair is then counted under its flag.
for ((score, flag) in a) {
    if (score > c) {
        c += 0.01
        nexs.add(z[0])
        pexs.add(z[1])
    }
    z[flag]++
}

In [79]:
// Append the final value of each running counter after the sweep.
nexs.add(z[0])
pexs.add(z[1])

true

In [80]:
// Last recorded value of each series, i.e. the totals over all pairs.
var pe = pexs[pexs.lastIndex]
var ne = nexs[nexs.lastIndex]

In [81]:
// Replace each entry by the total minus that entry.
pexs.indices.forEach { idx ->
    pexs[idx] = pe - pexs[idx]
}

In [82]:
// Replace each entry by the total minus that entry.
nexs.indices.forEach { idx ->
    nexs[idx] = ne - nexs[idx]
}

In [83]:
// Recall and precision series, filled by the loops that follow.
val rec: ArrayList<Double> = arrayListOf()
val prec: ArrayList<Double> = arrayListOf()

In [84]:
// Recall: each entry of `pexs` divided by its first entry.
pexs.forEach { value ->
    rec.add(value / pexs[0])
}

In [85]:
// Precision: pexs / (pexs + nexs), defined as 1.0 where both are zero.
for (idx in pexs.indices) {
    val selected = pexs[idx] + nexs[idx]
    prec.add(if (selected == 0.0) 1.0 else pexs[idx] / selected)
}

In [121]:
rec::class

class java.util.ArrayList

In [120]:
df::class

class java.util.LinkedHashMap

In [86]:
// Line plot of precision against recall.
val df = mapOf(
    "recall" to rec,
    "precision" to prec
)
var p = lets_plot(df) {
    x = "recall"
    y = "precision"
}
p += geom_line()

In [87]:
p

In [92]:
prec

[0.23469387755102042, 0.5528455284552846, 0.5666666666666667, 0.5714285714285714, 0.5862068965517241, 0.5964912280701754, 0.6098654708520179, 0.6153846153846154, 0.6267281105990783, 0.6355140186915887, 0.6367924528301887, 0.6398104265402843, 0.6428571428571429, 0.6650246305418719, 0.6716417910447762, 0.675, 0.678391959798995, 0.6818181818181818, 0.6923076923076923, 0.6958762886597938, 0.6994818652849741, 0.7068062827225131, 0.7105263157894737, 0.7142857142857143, 0.7180851063829787, 0.7219251336898396, 0.7258064516129032, 0.75, 0.7486033519553073, 0.7570621468926554, 0.7657142857142857, 0.7701149425287356, 0.7745664739884393, 0.7790697674418605, 0.783625730994152, 0.788235294117647, 0.7928994082840237, 0.7976190476190477, 0.8023952095808383, 0.8072289156626506, 0.806060606060606, 0.8109756097560976, 0.8159509202453987, 0.8209876543209876, 0.8198757763975155, 0.81875, 0.8176100628930818, 0.8227848101265823, 0.8280254777070064, 0.8333333333333334, 0.8387096774193549, 0.8376623376623377, 

In [88]:
/**
 * Trapezoidal sum over the points `(x[i], y[i])` with the sign flipped, so the result
 * is positive when [x] is decreasing.
 *
 * @param x abscissas; indices `0 until y.size` are read
 * @param y ordinates; their count determines how many segments are summed
 * @return the negated trapezoid sum, `0.0` when [y] has fewer than two points
 */
fun integrate(x: ArrayList<Double>, y: ArrayList<Double>): Double =
    (0 until y.size - 1).fold(0.0) { area, idx ->
        val width = x[idx + 1] - x[idx]
        val heights = y[idx + 1] + y[idx]
        area - width * heights / 2
    }

In [91]:
integrate(rec, prec)

0.9566777051200045

In [128]:
pred2.getRow(41)

mat[ 0,00013816248317,  0,99986183751683 ]

In [69]:
val vl: MutableMap<String, Double> = HashMap()

In [129]:
y_bow[41]

RDB

In [70]:
// Map every vocabulary entry to the matching value from row 1 of `pred`.
bow.id2word.forEachIndexed { idx, word ->
    vl[word] = pred[1][idx]
}

In [71]:
vl.toSortedMap(compareBy({ vl[it]?.times(-1) }, { it }))

{file.types.usage_edit_JAVA=14.759013853363316, actions_action.invoked_EditorUp=14.43050978639128, completion_finished_finished=14.025044678283114, actions_action.invoked_EditorEnter=12.926432389615005, productivity_feature.used_editing.completion.variable.name=11.540138028495114, actions_action.invoked_EditorBackSpace=3.8114021974431465, actions_action.invoked_EditorDown=2.9716515426913253, productivity_feature.used_editing.completion.basic=2.735262764627095, actions_action.invoked_EditorChooseLookupItem=2.4251078363232557, actions_action.invoked_EditorPaste=1.723661852948615, tooltip.action.events_execute_execute=1.6366504759589855, actions_action.invoked_EditorCopy=1.1060222248968148, productivity_feature.used_editing.select.word=0.7691499082542621, actions_action.invoked_GotoDeclaration=0.18973149302265968, productivity_feature.used_navigation.goto.declaration=0.18973149302265968, ui.dialogs_show_com.intellij.openapi.progress.util.ProgressDialog$MyDialogWrapper=0.0869317982265071, 

In [86]:
y_bow_test[7]

coding, reading

In [88]:
pred.getCol(0) - pred.getCol(1) 

mat[ 0,99874451496689  end
    -0,99997151031942  end
    -0,00679700815238  end
     0,00000000043445  end
     0,01017132468425  end
     0,01017132468425  end
    -1,00              end
     0,02740376274127  end
     0,01017132468425  end
     0,00975731423155  end
     0,01017132468425  end
    -0,00124898344487  end
    -1,00              end
    -0,02149639871889  end
    -1,00              end
     0,06888302545698  end
     0,99850331741914  end
     0,99999999985941  end
     0,04788329856067  end
     0,00000000000002  end
    -0,00001643993483  end
    -0,00              end
     0,00000029323433  end
    -0,00001148326648  end
     0,00000000953435  end
    -0,00000424495427  end
    -0,00002217130782  end
    -1,00              end
     0,0000000000004   end
    -0,00000004698286  end
    -1,00              end
    -0,00000021244324  end
     0,01017132468425  end
    -1,00              end
     0,01017132468425  end
    -0,99999996756759  end
     0,01017132468425  end
 

In [70]:
pred.getCol(0).toList()

[1.5898703786415755E-10, 1.6624047526703611E-15, 1.0, 1.0, 7.490350787394148E-5, 0.9999999997297138, 1.0, 1.0, 1.0, 1.0, 5.8159047591118066E-11, 3.680435685973332E-5, 6.293971204697163E-5, 0.0019099319838452992, 0.99999974317813, 6.267870783419005E-10, 6.293971204697163E-5, 0.999999855200257, 6.447355462017577E-10, 1.0, 1.0, 1.856220170940937E-5, 9.032305370312738E-15, 1.0, 1.0, 6.293971204697163E-5, 3.707665259914707E-4, 1.0280850870327827E-4, 0.9921305995090737, 2.472681044953678E-8, 6.293971204697163E-5, 5.0094153284637565E-5, 1.0, 4.627613288412803E-28, 1.0, 1.0, 8.863875480793987E-6, 0.9999999999999993, 6.293971204697163E-5, 0.9999999999985849, 6.424595656660162E-10, 0.9999999167365994, 1.2893275155334411E-14, 1.0, 0.9999999913929348, 1.0, 6.969625172065914E-11, 1.0, 0.980470440890968, 1.0, 4.512260059343055E-5, 2.5079506143465025E-5, 4.299647828242927E-9, 1.0, 3.028109206673753E-17, 6.293971204697163E-5, 0.9999999198369669, 1.0, 1.0, 8.406293257306347E-10, 4.751071278886882E-4, 0