In [77]:
%use kotlin-statistics, krangl, kravis, klaxon, lets-plot

In [3]:
import kotlin.math.*

In [4]:
@file:Repository("https://dl.bintray.com/kyonifer/maven")
@file:DependsOn("com.kyonifer:koma-core-ejml:0.12")
@file:DependsOn("com.kyonifer:koma-plotting:0.12")

In [5]:
@file:Repository("https://repo1.maven.org/maven2")
@file:DependsOn("com.github.haifengl:smile-core:2.2.2")

In [6]:
import koma.extensions.*
import koma.*

In [7]:
import smile.math.distance.DynamicTimeWarping

In [8]:
import smile.clustering.HierarchicalClustering
import smile.clustering.linkage.CompleteLinkage

In [9]:
open class Action(val time: Long, val group_id: String, val event_id: String, val dt: String) {
    fun print() {
        println("$time $group_id $event_id $dt")
    }

    fun getEvent(): String {
        return "$group_id; $event_id; $dt"
    }
}


In [10]:
class Session(val id: String, val startTime: Long = 0) {
    var actions: ArrayList<Action> = ArrayList()

    fun add(time: Long, group: String, event: String, action: String) {
        actions.add(Action(time,group,event, action))
    }

    fun print() {
        println(id)
        for (action in actions) {
            action.print()
        }
    }

    fun time(): Long {
        return actions[actions.size - 1].time
    }

    fun events(): String {
        val events = ArrayList<String>()
        for (action in actions)
            events.add(action.getEvent())
        return events.joinToString(separator = " , ")
    }



    fun times(): String {
        val times = ArrayList<String>()
        for (action in actions)
            times.add(action.time.toString())
        return times.joinToString(separator = " , ")
    }
}


In [11]:
fun sessionsFromTSV(path: String): ArrayList<Session> {
    val df = DataFrame.readTSV(path)
    val ses = ArrayList<Session>()
    for (i in 0 until df.nrow) {
        val events = df["events"][i].toString().split(" , ")
//        if (events.size < 3)
//            continue
        val times = df["ms"][i].toString().split(" , ")
        ses.add(Session(df["session_id"][i].toString()))
        for (j in events.indices) {
            val params = events[j].split("_", limit=3)
//             ses[ses.size - 1].add(times[j].toLong(), params[0], params[1], params[2])
            ses[ses.size - 1].add(-1, params[0], params[1], params[2])
        }
    }
    return ses
}

In [12]:
class ActionDist : smile.math.distance.Distance<Action> {
    override fun d(x: Action, y: Action): Double {
        if (x.group_id != y.group_id)
            return 1.0
        if (x.event_id != y.event_id)
            return 1.0
        if (x.dt != y.dt)
            return 1.0
        return 0.0
    }

}

In [13]:
val dtw = DynamicTimeWarping(ActionDist())

fun DTWSession(s1: Session, s2: Session): Double {
    val a1 = Array(s1.actions.size) { i -> s1.actions[i]}
    val a2 = Array(s2.actions.size) { i -> s2.actions[i]}
    
    if (a1.size > 10 && a2.size > 10)
        return 0.0
    if (a1.size > 10 || a2.size > 10)
        return 1.0
    
    return dtw.apply(a1, a2) / (a1.size + a2.size)
//     return dtw.apply(a1, a2) / sqrt(a1.size + a2.size)
}

In [14]:
class DWTDist<T> : smile.math.distance.Distance<T> {
        
    override fun d(x: T, y: T): Double {
        if (x is Session && y is Session)
            return DTWSession(x, y)
        return 1.0
    }

}
val t = DWTDist<Session>()

In [130]:
fun dist(s1: Session, s2: Session, tresh: Int=10): Double {
    val a1 = ArrayList<String>()
    val a2 = ArrayList<String>()

    for (a in s1.actions) {
        a1.add(a.event_id + " " + a.group_id + " " + a.dt)
    }

    for (a in s2.actions) {
        a2.add(a.event_id + " " + a.group_id + " " + a.dt)
    }

    val h1 = a1.groupingBy { it }.eachCount()
    val h2 = a2.groupingBy { it }.eachCount()

    val keys = HashSet<String>()
    keys.addAll(h1.keys)
    keys.addAll(h2.keys)

    val h3 = HashMap<String, Int>()
    for (k in keys) {
        h3[k] = min(h1.getOrDefault(k, 0), h2.getOrDefault(k, 0))
    }

    val v1 = h1.values.sum()
    val v2 = h2.values.sum()
    val v3 = h3.values.sum()

//     val f1: Double = (1.0 * v3) / v1
//     val f2: Double = (1.0 * v3) / v2

//     if (f1 + f2 == 0.0)
//         return 0.0
//     return 2 * f1 * f2 / (f1 + f2)
    if (v1 > tresh && v2 > tresh)
        return 0.0
    if (v1 > tresh || v2 > tresh)
        return 1.0
    
    return 1.0 * (v1 + v2 - 2* v3) / (v1 + v2)

}

In [16]:
val ses = sessionsFromTSV("../data/test/test_data.tsv")
val df = DataFrame.readTSV("../data/test/test_data.tsv")

In [43]:
ses.shuffle()

In [44]:
val d = Array<DoubleArray> (ses.size) {DoubleArray(ses.size)}

In [84]:
val tresh = 100

In [131]:
for (i in ses.indices) {
    for (j in ses.indices) {
        d[i][j] = dist(ses[i], ses[j], 100)
    }
}

In [132]:
val clusters = HierarchicalClustering.fit(CompleteLinkage.of(d))

In [141]:
val clust = IntCol("clust", clusters.partition(16))

In [142]:
val ndf =df.addColumn("clust") {clust}

In [50]:
ndf.filter {it["clust"] eq 0}

session_id,ms,Category,events,clust
010320174dc3848-2e16-4311-b1d9-c375cdc5e870_0,9522,ide_start,"event.log_whitelist.loaded_whitelist.loaded , l...",0
080519115073e28-5728-49ef-9c2b-32cf73599561_47,73478,"ide_close, ide_start, vcs",ui.dialogs_show_com.intellij.openapi.ui.message...,0
2301203c6654363-c5e7-4353-83ed-d4dc97a35802_23,110010,ide_start,"event.log_whitelist.loaded_whitelist.loaded , l...",0
1802202292b047e-5416-4f1a-b91e-75e6b63518c4_6,70497,notifications,actions_action.invoked_WelcomeScreen.ImportProj...,0
0602201c6baf4ff-c013-40a0-960e-58b7c6c3b523_72,63052,ide_start,"event.log_whitelist.loaded_whitelist.loaded , l...",0
2312191e0386684-bedf-43c4-b2ba-35c7e99673e6_6,76995,"coding, ide_start","actions_action.invoked_EditorEnter , file.types...",0
2708191ab7da435-dddd-4e85-903e-bed08fd86f42_0,39066,"ide_start, reading","event.log_whitelist.loaded_whitelist.loaded , l...",0
2502201f9e8e9f6-2864-4683-8656-ff505737faf5_0,33824,ide_start,"event.log_whitelist.loaded_whitelist.loaded , l...",0
2008192a4cb0091-0b85-49f5-98ba-6422fde8c848_0,46680,ide_start,"lifecycle_ide.start_ide.start , event.log_white...",0
1606191517c5450-298c-4291-9291-3506c726fba5_42,28625,ide_start,"event.log_whitelist.loaded_whitelist.loaded , l...",0


In [143]:
// val gr = ndf.filterByRow {(it["clust"] as Int) != 0}.groupBy("clust").groups()
val gr = ndf.groupBy("clust").groups()


var vl = HashMap<Int, Pair<String, Double>>()
var good = 0.0
var tot = 0.0
for (g in gr) {
    val ht = HashMap<String, Int>()
    for (i in 0 until g.nrow) {
        val cats = g["Category"][i].toString().toLowerCase().split(", ")
        for (cat in cats) {
            if (ht.contains(cat)) {
                ht[cat] = ht.getOrDefault(cat, 0) + 1
            }
            else {
                ht[cat] = 0
            }
        }
    }
    var k:String = "lol"
    for (e in ht.keys) {
        val pm:Int = ht.getOrDefault(k, 0)
        val cv:Int = ht.getOrDefault(e, 0)
        if (cv > pm) {
            k = e
        }
    }
    if (k != "lol") {
        vl[g["clust"][0] as Int] = Pair(k, 1.0 * ht.getOrDefault(k, 0) / g.nrow)
        tot += g.nrow
        good += ht.getOrDefault(k, 0)
    }
}

In [144]:
vl // все

{0=(vcs, 0.9), 1=(run_build_debug, 0.75), 2=(coding, 0.7777777777777778), 3=(coding, 0.8888888888888888), 4=(coding, 0.8), 5=(coding, 0.8333333333333334), 6=(coding, 0.9411764705882353), 7=(coding, 0.6), 8=(coding, 0.9787234042553191), 9=(coding, 0.9629629629629629), 10=(run_build_debug, 0.9411764705882353), 11=(ide_start, 0.5833333333333334), 12=(reading, 0.9285714285714286), 13=(coding, 0.4), 14=(coding, 0.3548387096774194), 15=(run_build_debug, 0.9772727272727273)}

In [145]:
good / tot

0.7212276214833759

In [119]:
vl // только короткие

{0=(ide_start, 0.5555555555555556), 1=(coding, 0.971830985915493), 2=(run_build_debug, 0.9622641509433962), 3=(vcs, 0.9285714285714286), 4=(reading, 0.39634146341463417), 5=(coding, 0.9032258064516129)}

In [107]:
good / tot

0.5692307692307692

In [139]:
var res = ArrayList<Double>()
val x = ArrayList<Int>()
for (t in 5 until 100 step 3) {
    for (i in ses.indices) {
        for (j in ses.indices) {
            d[i][j] = dist(ses[i], ses[j], t)
        }
    }
    val clusters = HierarchicalClustering.fit(CompleteLinkage.of(d))
    val clust = IntCol("clust", clusters.partition(1+5))
    val ndf =df.addColumn("clust") {clust}
    
    val gr = ndf.filterByRow {(it["clust"] as Int) != 0}.groupBy("clust").groups()
    var good = 0.0
    var tot = 0.0
    for (g in gr) {
        val ht = HashMap<String, Int>()
        for (i in 0 until g.nrow) {
            val cats = g["Category"][i].toString().toLowerCase().split(", ")
            for (cat in cats) {
                if (ht.contains(cat)) {
                    ht[cat] = ht.getOrDefault(cat, 0) + 1
                }
                else {
                    ht[cat] = 0
                }
            }
        }
        var k:String = "lol"
        for (e in ht.keys) {
            val pm:Int = ht.getOrDefault(k, 0)
            val cv:Int = ht.getOrDefault(e, 0)
            if (cv > pm) {
                k = e
            }
        }
        if (k != "lol") {
            tot += g.nrow
            good += ht.getOrDefault(k, 0)
        }
    }
    res.add(good / tot)
    x.add(t)
}

In [140]:
val pl = mapOf("x" to x, "y" to res)
val p = lets_plot(pl) {x = "x"; y = "y"}
p + geom_line()