In [147]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import json
import re
from datetime import datetime
import random

from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer

import nltk

from difflib import SequenceMatcher

In [26]:
np.random.seed(42)

In [3]:
grep_df = pd.read_csv('/root/data/lexical/grep_findings_0_499.csv')

In [204]:
snippets = random.choices(grep_df['text'], k=100)
snippets

['\t\thexdumpWords(b, b+n, func(p uintptr) byte {\n',
 '\t\t\t\td1.pc = frame.fn.entry + uintptr(frame.fn.deferreturn)\n',
 '\tadjustpointer(adjinfo, unsafe.Pointer(&gp.sched.bp))\n',
 'func makeheapobjbv(p uintptr, size uintptr) bitvector {\n',
 "// Package reflect's Value methods named Pointer and UnsafeAddr return type uintptr\n",
 'func (fd *FD) RawRead(f func(uintptr) bool) error {\n',
 '\treturn strhash(noescape(unsafe.Pointer(&s)), seed)\n',
 '\tsearchAddr uintptr\n',
 'func (v Value) pointer() unsafe.Pointer {\n',
 '\t\tmheap_.specialprofilealloc.free(unsafe.Pointer(sp))\n',
 '\tif len(x) > 4 && datap.etext <= uintptr(str.str) && uintptr(str.str) < datap.end {\n',
 '\t\tuadd += unsafe.Sizeof(uncommontype{})\n',
 '\t\t\t\t\tunsafe.Offsetof(finalizer{}.ot) != 4*sys.PtrSize) {\n',
 '\t\tval := *(*uintptr)(unsafe.Pointer(p + i))\n',
 '// so not all have these arguments. Mark them uintptr so that the GC\n',
 '\t_, _, e1 := Syscall(SYS_KEYCTL, uintptr(cmd), uintptr(arg2), 0)\n',
 'fu

## Extract Features

In [86]:
token_fd = nltk.FreqDist(set.union(*[tokenize(snippet) for snippet in snippets]))
all_tokens = [token for token, _ in token_fd.most_common(50)]

In [66]:
def tokenize(snippet):
    return set(list(re.split('[\s\(\)]+', snippet)))

def extract_features(snippet, all_tokens):
    tokens = tokenize(snippet)
    return {"contains_{}".format(token): token in tokens for token in all_tokens}

In [87]:
data = [extract_features(snippet, all_tokens) for snippet in snippets]

In [88]:
all_tokens

['',
 'h',
 'dataOffset+inserti*8',
 'buckets',
 '_cgo_mmap',
 'integer,',
 'datap.edata',
 'err',
 'selectnbrecv',
 'int',
 'mallocgc',
 'sp.str,',
 'base',
 'named',
 'ret',
 'dumpobj',
 '*arraytype',
 'strhash',
 'reflect_chansend',
 'selectnbsend',
 'frame',
 '&bv,',
 'oldbucket+newbit',
 'allocs,',
 'may',
 'cpuprof.extra[i]',
 'local_scan',
 'mapaccessK',
 'memhash',
 'memEnd',
 'uintptr',
 'bucketMask',
 'dumpint',
 "uintptr's",
 'new',
 'gp._panic',
 'handles',
 'dataOffset+i*8',
 'chanLock',
 'stack',
 'Loadp',
 'inheap',
 'uint64',
 'd',
 'old',
 'return',
 'minLegalPointer',
 'datap.bss',
 'or',
 'callCgoMmap']

## K-Means Clustering

In [89]:
v = DictVectorizer(sparse=False)

X = v.fit_transform(data)

In [90]:
cls = KMeans(init='k-means++', n_clusters=8).fit(X)

In [112]:
def vector_to_string(tokens):
    for key, value in tokens.items():
        if value > 0.001:
            print(key[len("contains_"):])

In [116]:
vector_to_string(v.inverse_transform(cls.cluster_centers_)[3])


new
old
return
uintptr


## Aligned Hamming Distance Clustering

In [136]:
def aligned_distance(a, b):
    s = SequenceMatcher()

    s.set_seq1(a)
    s.set_seq2(b)

    a_str = ""
    op_str = ""
    b_str = ""

    codes = s.get_opcodes()
    for code in codes:
        a_str += a[code[1]:code[2]]
        b_str += b[code[3]:code[4]]

        if code[4]-code[3] > code[2]-code[1]:
            a_str += " " * ((code[4]-code[3]) - (code[2]-code[1]))
        elif code[4]-code[3] < code[2]-code[1]:
            b_str += " " * ((code[2]-code[1]) - (code[4]-code[3]))

    return hamming(a_str, b_str)

def hamming(a, b):
    distance = 0
    
    for i in range(max(len(a), len(b))):
        if i >=len(a) or i >=len(b):
            distance += 1
            continue
        if a[i] != b[i]:
            distance += 1
            
    return distance

In [137]:
a = "r1, r2, err := getConsoleCursorInfoProc.Call(handle, uintptr(unsafe.Pointer(cursorInfo)), 0)"
b = "r1, r2, err := scrollConsoleScreenBufferProc.Call(handle, uintptr(unsafe.Pointer(&scrollRect)), uintptr(unsafe.Pointer(&clipRect)), coordToPointer(destOrigin), uintptr(unsafe.Pointer(&char)))"

print(aligned_distance(a, b))

125


In [138]:
a = "Hallo Welt!"
b = "Hallo, du schönes Geld!"

print(aligned_distance(a, b))
print(hamming(a, b))

12
18


In [205]:
distances = np.array([[aligned_distance(a, b) for b in snippets] for a in snippets])

In [206]:
distances

array([[ 0, 56, 63, ..., 69, 35, 33],
       [51,  0, 58, ..., 81, 49, 44],
       [63, 79,  0, ..., 59, 35, 44],
       ...,
       [79, 80, 52, ...,  0, 43, 61],
       [41, 55, 35, ..., 43,  0, 25],
       [33, 39, 44, ..., 52, 23,  0]])

In [207]:
max(distances.flatten())

197

In [213]:
THRESHOLD = 100

clusters = []
unused_indices = set(range(len(snippets)))

while len(unused_indices) > 0:
    next_index = list(unused_indices)[0]
    cluster_indices = set([i for i, d in enumerate(distances[next_index]) if d < THRESHOLD])
    
    clusters.append([snippets[i] for i in cluster_indices])
    
    unused_indices -= cluster_indices

In [214]:
for cluster in clusters:
    print(len(cluster))

91
57
6
1
45


In [215]:
for cluster in clusters:
    for snippet in cluster:
        print(snippet.strip())
    print("")
    print("-----------------------------")
    print("")

hexdumpWords(b, b+n, func(p uintptr) byte {
d1.pc = frame.fn.entry + uintptr(frame.fn.deferreturn)
adjustpointer(adjinfo, unsafe.Pointer(&gp.sched.bp))
func makeheapobjbv(p uintptr, size uintptr) bitvector {
// Package reflect's Value methods named Pointer and UnsafeAddr return type uintptr
func (fd *FD) RawRead(f func(uintptr) bool) error {
return strhash(noescape(unsafe.Pointer(&s)), seed)
searchAddr uintptr
func (v Value) pointer() unsafe.Pointer {
mheap_.specialprofilealloc.free(unsafe.Pointer(sp))
if len(x) > 4 && datap.etext <= uintptr(str.str) && uintptr(str.str) < datap.end {
uadd += unsafe.Sizeof(uncommontype{})
unsafe.Offsetof(finalizer{}.ot) != 4*sys.PtrSize) {
val := *(*uintptr)(unsafe.Pointer(p + i))
// so not all have these arguments. Mark them uintptr so that the GC
_, _, e1 := Syscall(SYS_KEYCTL, uintptr(cmd), uintptr(arg2), 0)
func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) {
func reflectlite_resolveTypeOff(rtype unsafe.Pointer, off int32) unsafe.