In [1]:
import pyspark
import datetime

In [2]:
sc = pyspark.SparkContext(master="spark://10.0.0.3:6060")
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

http://10.0.0.3:4040/

In [3]:
from pyspark.sql import functions as F
import pyspark
import numpy as np
import sys

def seedKernel(dataB, dataIdValue, centroids, k, metric):
    data = dataB.value
    point = dataIdValue[1]
    minD = sys.maxsize 
    for j in range(len(centroids)): 
        distance = metric(point, data[centroids[j]]) 
        minD = min(minD, distance) 
    return int(minD)

def seedClusters(dataB, dataFrame, k, metric):
    data = dataB.value
    centroids = list(np.random.choice(data.shape[0], 1, replace=False))
    for i in range(k - 1):
        dist = []
        mK = dataFrame.rdd.map(lambda dataIdValue: seedKernel(dataB, dataIdValue, centroids, k, metric))
        mK_collect = mK.collect()
        dist = np.array(mK_collect) 
        next_centroid = np.argmax(dist)
        centroids.append(next_centroid) 
        dist = []
    return centroids 

def nearestCenteroidKernel(dataIdValue, centeroidIdValues, metric):
    dataId, dataValue = dataIdValue
    dataNp = np.asarray(dataValue)
    distances = []
    for centeroidId, centeroidValue in centeroidIdValues:
        centeroidNp = np.asarray(centeroidValue)
        distance = metric(dataNp, centeroidNp)
        distances.append(distance)
    distances = np.asarray(distances)
    closestCenteroid = np.argmin(distances)
    return int(closestCenteroid)

def optimiseClusterMembershipSpark(data, dataFrame, n, metric, intitalClusterIndices=None):
    dataShape = data.shape
    dataRDD = dataFrame.rdd
    lengthOfData = dataShape[0]
    if intitalClusterIndices is None:
        index = np.random.choice(lengthOfData, n, replace=False)
    else:
        index = intitalClusterIndices
    listIndex = [int(i) for i in list(index)]
    centeroidIdValues = [(i,data[index[i]]) for i in range(len(index))]
    dataRDD = dataRDD.filter(lambda dataIdValue: int(dataIdValue["id"]) not in listIndex)
    associatedClusterPoints = dataRDD.map(lambda dataIdValue: (dataIdValue[0],nearestCenteroidKernel(dataIdValue, centeroidIdValues, metric)))
    clusters = associatedClusterPoints.toDF(["id", "bestC"]).groupBy("bestC").agg(F.collect_list("id").alias("cluster"))
    return index, clusters

def costKernel(dataB, testCenteroid, clusterData, metric):
    data = dataB.value
    cluster = np.asarray(clusterData)
    lenCluster = cluster.shape[0]
    lenFeature = data.shape[1]
    testCenteroidColumn = np.zeros(shape=(lenCluster, lenFeature), dtype=data.dtype)
    newClusterColumn = np.zeros(shape=(lenCluster, lenFeature), dtype=data.dtype)
    for i in range(0, lenCluster):
        newClusterColumn[i] = data[cluster[i]]
        testCenteroidColumn[i] = data[int(testCenteroid)] 
    pairwiseDistance =  metric(newClusterColumn, testCenteroidColumn)# (np.absolute(newClusterColumn-testCenteroidColumn).sum(axis=1))# metric(newClusterColumn, testCenteroidColumn)
    cost = np.sum(pairwiseDistance)
    return float(cost) #newClusterColumn.shape[1]

def optimiseCentroidSelectionSpark(dataB, dataFrame, centeroids, clustersFrames, metric):
    data = dataB.value
    dataRDD = dataFrame.rdd
    dataShape = data.shape
    newCenteroidIds = []
    totalCost = 0
    totalTime = None
    for clusterIdx in range(len(centeroids)):
        clusterStartTime = datetime.datetime.now()
        oldCenteroid = centeroids[clusterIdx]
        clusterFrame = clustersFrames.filter(clustersFrames.bestC == clusterIdx).select(F.explode(clustersFrames.cluster))
        clusterData = clusterFrame.collect()
        if clusterData:
            clusterData = [clusterData[i].col for i in range(len(clusterData))]
        else:
            clusterData = []
        cluster = np.asarray(clusterData)
        costData = clusterFrame.rdd.map(lambda pointId: (pointId[0], costKernel(dataB, pointId[0], clusterData, metric)))
        cost = costData.map(lambda pointIdCost: pointIdCost[1]).sum()
        pointResult = costData.takeOrdered(1, key = lambda pointId_Cost: pointId_Cost[1])
        totalCost = totalCost + cost
        if (pointResult):
            bestPoint = pointResult[0][0]
        else:
            bestPoint = oldCenteroid
        newCenteroidIds.append(bestPoint)
        clusterTime = datetime.datetime.now() - clusterStartTime
        if totalTime is not None:
            totalTime = totalTime + clusterTime
        else:
            totalTime = clusterTime
    return (newCenteroidIds, totalCost)

#vector metrics
def hammingVector(stack1, stack2):
    return (stack1 != stack2).sum(axis=1)
def euclideanVector(stack1, stack2):
    return (np.absolute(stack2-stack1)).sum(axis=1)
# point metrics
def euclideanPoint(p1, p2): 
    return np.sum((p1 - p2)**2) 
def hammingPoint(p1, p2): 
    return np.sum((p1 != p2))

def fit(sc, data, nRegions = 2, metric = "euclidean", seeding = "heuristic"):
    if metric == "euclidean":
        pointMetric = euclideanPoint
        vectorMetric = euclideanVector
    elif metric == "hamming":
        pointMetric = hammingPoint
        vectorMetric = hammingVector
    else:
        print("unsuported metric")
        return

    dataN = np.asarray(data)
    dataB = sc.broadcast(dataN)
    seeds = None
    dataFrame  = sc.parallelize(data).zipWithIndex().map(lambda xy: (xy[1],xy[0])).toDF(["id", "vector"]).cache()
    if (seeding == "heuristic"):
        seeds = list(seedClusters(dataB, dataFrame, nRegions, pointMetric))
    elif (seeding == "random"):
        seeds = None
    else:
        seeds = seeding
    lastCenteroids, lastClusters = optimiseClusterMembershipSpark(dataN, dataFrame, nRegions, pointMetric, seeds)
    lastCost = float('inf')
    iteration = 0
    escape = False
    while not escape:
        iteration = iteration + 1
        currentCenteroids, currentCost = optimiseCentroidSelectionSpark(dataB, dataFrame, lastCenteroids, lastClusters, vectorMetric)
        currentCenteroids, currentClusters = optimiseClusterMembershipSpark(dataN, dataFrame, nRegions, pointMetric, currentCenteroids)
        if (currentCost<lastCost):
            lastCost = currentCost
            lastCenteroids = currentCenteroids
            lastClusters = currentClusters
        else:
            escape = True
    bc = lastClusters.collect()
    clusterObj = {}
    for i in range(len(bc)):
        clusterObj[str(bc[i].bestC)] = bc[i].cluster
    unpackedClusters = []
    for i in range(len(lastCenteroids)):
        stri = str(i)
        if stri in clusterObj:
            unpackedClusters.append(clusterObj[stri])
        else:
            unpackedClusters.append([])
    return (lastCenteroids, unpackedClusters)

In [4]:
import numpy as np #maths
visualFeatureVocabulary = None
visualFeatureVocabularyList = None
with open("data/ORBvoc.txt", "r") as fin:
    extractedFeatures = list(map(lambda x: x.split(" ")[2:-2], fin.readlines()[1:]))
    dedupedFeatureStrings = set()
    for extractedFeature in extractedFeatures:
        strRep = ".".join(extractedFeature)
        dedupedFeatureStrings.add(strRep)
    finalFeatures = []
    for dedupedFeatureStr in list(dedupedFeatureStrings):
        finalFeatures.append([int(i) for i in dedupedFeatureStr.split(".")])
    visualFeatureVocabulary = np.asarray(finalFeatures, dtype=np.uint8)
    visualFeatureVocabularyList  = list(finalFeatures)
print(visualFeatureVocabulary.shape)

(1062686, 32)


In [5]:
from random import sample
visualFeatureVocabularyListSample = sample(visualFeatureVocabularyList, 50000)
len(visualFeatureVocabularyListSample)

50000

In [6]:
class Layer:
    levelData = None
    children = []
    parent = None
    wordLayer = False
    def __init__(self, levelData, wordLayer = False, children=None, parent=None):
        self.parent = parent
        self.wordLayer = wordLayer
        self.children = children
        self.levelData = np.asarray(levelData)
#global_clusterIndices = None
#temp = None
def createLayer(sc, centroidIndices, clustersIndicies, data, parent=None, k=110, metric="hamming"):
    #global global_clusterIndices
    #global temp
    centroidData = np.asarray([data[i] for i in centroidIndices])
    children = []
    parentLayer = Layer(centroidData, False, children, parent)
    for i in range(len(centroidIndices)):
        clusterIndices = clustersIndicies[i]
        #global_clusterIndices = clusterIndices
        clusterData = [data[i] for i in clusterIndices]
        lenCluster = len(clusterIndices)
        #print("lenCluster, k", lenCluster, k)
        if lenCluster > k:
            # we should fit again
            childCenteroids, childClusters = fit(sc, clusterData, k, metric)
            clusterLayer = createLayer(sc, childCenteroids, childClusters, clusterData, parentLayer, k, metric),
            children.append(clusterLayer)
        else:
            clusterLayer = Layer(clusterData, True, None, parentLayer)
            children.append(clusterLayer)
    parentLayer.children = children
    return parentLayer

def doAll(sc, data, k=4, metric="hamming"):
    bestCentroidsORB, bestClustersORB = fit(sc, data, k, metric)
    return createLayer(sc, bestCentroidsORB, bestClustersORB, data, None, k, metric)

In [None]:
%%time
import pickle
from os import path

layers = doAll(sc, visualFeatureVocabularyListSample)

In [37]:
layers.levelData

array([[ 63,  95, 107, 207, 118, 155, 255, 252,  77, 232, 254, 230, 121,
         95, 183, 228, 126, 253, 151, 239, 205, 178,  45, 255, 118, 125,
        175, 128, 127, 191, 121, 253],
       [254, 188, 246, 186, 237, 253, 254, 239, 191, 223, 252,  63, 238,
        254, 126, 223, 239, 251, 255, 151, 190, 111, 218, 243, 191, 223,
        223, 255, 159, 125, 239, 223],
       [237, 112, 189, 103,  40,  78,  71, 152,  99, 250, 167,  24, 215,
        119,  73,  59, 145, 212, 124, 106, 105, 100, 251,   0, 197, 251,
        101,  52,  66, 243,  70, 114],
       [ 88,  89, 126, 236,  25,  94,  83, 112, 165, 140, 238,   0, 127,
        215,  35, 114, 240, 244, 253,  98,  72, 131, 176, 140, 243, 253,
         32,  27,  34, 166, 108, 123]])

In [38]:
test = sample(visualFeatureVocabularyList, 1)[0]
test = np.asarray(test)
test = np.asarray([210,  13,  80, 128, 155, 224,  72, 120, 234, 104,  92,  75, 232, 141, 216, 238,  34, 172,
 198, 218, 191, 113, 205, 136,  88, 150, 203,  57, 204,  91, 129,  83])

In [39]:
def compareLevelData(levelData, vec, metric=hammingVector):
    lenLevelData = levelData.shape[0]
    #print(levelData, levelData.shape)
    lenFeature = levelData.shape[1]
    testColumn = np.zeros(shape=(lenLevelData, lenFeature), dtype=levelData.dtype)
    for i in range(0, lenLevelData):
        testColumn[i] = vec
    pairwiseDistance =  metric(levelData, vec)
    minIndex = np.argmin(pairwiseDistance)
    return int(np.argmin(pairwiseDistance))

def findWordInternal(_layers, vec, path=[], level=-1):
    levelData = _layers.levelData
    closestChild = compareLevelData(levelData, vec)
    path.append(closestChild)
    closestChildLayer = _layers.children[closestChild]
    if type(closestChildLayer) is tuple:
        closestChildLayer = closestChildLayer[0]
    if closestChildLayer.wordLayer:
        #print("llll", closestChildLayer.levelData)
        if closestChildLayer.levelData.shape[0] != 0:
            closestGrandChild = compareLevelData(closestChildLayer.levelData, vec)
            path.append(closestGrandChild)
            return path
        else:
            return path
    else:
        return findWordInternal(closestChildLayer, vec, path)

    
def findWord(_layers, vec):
    returnData = findWordInternal(_layers, vec, [])
    #print(returnData)
    #print("#####################")
    #print(returnData)
    #print("#####################")
    #if isinstance(returnData, list): 
    #    return "-".join([str(i) for i in returnData])
    #else:
    return "-".join([str(i) for i in returnData])
    #return returnData
findWord(layers, test)

'0-0-0-0-3-0-0-1-0-0'

In [40]:
def traverseAllWords(_layers, above=[], mapp={}, Z=0):
    if _layers.children is not None:
        #iterate children
        children = []
        for i in range(len(_layers.children)):
            if type(_layers.children[i]) is tuple:
                child = _layers.children[i][0] # traverseAllWords(_layers.children[i][0])
            else:
                child = _layers.children[i] # traverseAllWords()
            children.append(child)
        #print(Z, (children), above)
        if len(children) == 0:
            paths = [above + [i] for i in range(_layers.levelData.shape[0])]
            for path in paths:
                key = "-".join([str(j) for j in path])
                mapp[key] = True
            return
        for i in range(len(children)):
            Y = Z + 1
            traverseAllWords(children[i], above + [i], mapp, Y)
    else:
        paths = [above + [i] for i in range(_layers.levelData.shape[0])]
        #print(Z, "no Children", paths, above, _layers.levelData)
        if _layers.levelData.shape[0] != 0:
            for path in paths:
                key = "-".join([str(j) for j in path])
                mapp[key] = True
        else:
            mapp["-".join([str(j) for j in above])] = True
    outputMap = {}
    dictKeys = list(mapp.keys())
    for i in range(len(dictKeys)):
        outputMap[dictKeys[i]] = i
    return outputMap

wordToVecMap = traverseAllWords(layers)

In [41]:
def WordToVec(vec, _wordToVecMap, _layers, bowSize, weights=None):
    wordId = findWord(_layers, vec)
    #print(wordId)
    vecId = _wordToVecMap[wordId]
    #print(vecId)
    feature = np.zeros(shape=(1, bowSize), dtype=vec.dtype)
    #print(feature.shape)
    #weight!
    if weights is not None:
        weight = 1
        if wordId in weights:
            weight = weights[wordId]            
        feature[0][vecId] = weight   
    else:
        feature[0][vecId] = 1
    return feature
bowVec = WordToVec(test, wordToVecMap, layers, len(wordToVecMap.keys()))

In [42]:
np.count_nonzero(bowVec)

1

In [43]:
#get phrase weight over whole data set
def getWordWeights(_data, _layers):
    weights = {}
    total = 0
    for point in _data:
        total += 1
        wordId = findWord(_layers, point)
        #print(wordId)
        if wordId not in weights:
            weights[wordId] = 1
        else:
            weights[wordId] = weights[wordId] + 1
    #print(total, weights)
    for key in weights.keys():
        weights[key] = weights[key] / total
    return weights
weights = getWordWeights(visualFeatureVocabularyListSample, layers)
#print(weights)
    

In [44]:
# get frame and feature stack

In [45]:
import cv2 #open computer vision
from matplotlib import pyplot as plt #plotting library

In [46]:
"""if 'cap' not in globals():
    cap = cv2.VideoCapture(0)

#Init orb feature detector/computer
if 'orb' not in globals():
    orb = cv2.ORB_create(edgeThreshold=2, patchSize=100, nlevels=15, fastThreshold=4, nfeatures=1000000, scoreType=cv2.ORB_FAST_SCORE, firstLevel=0)
"""

"if 'cap' not in globals():\n    cap = cv2.VideoCapture(0)\n\n#Init orb feature detector/computer\nif 'orb' not in globals():\n    orb = cv2.ORB_create(edgeThreshold=2, patchSize=100, nlevels=15, fastThreshold=4, nfeatures=1000000, scoreType=cv2.ORB_FAST_SCORE, firstLevel=0)\n"

In [47]:
#ret, input_image = cap.read() 
#plt.imshow(input_image[:,:,::-1]) #[:,:,::-1] converts BGR to RGB

In [48]:
#Detect and compute orb keypoints and keypoint descriptors from input image.
#kp1, des1 = orb.detectAndCompute(input_image, None)

In [49]:
#draw keypoints onto image 
#output_image = cv2.drawKeypoints(input_image,kp1,color=(0,255,0), outImage=None, flags=0)

In [50]:
#plot result
#plt.imshow(output_image[:,:,::-1])

In [51]:
#des1.shape[0]

In [52]:
def frameTOBowVec(bowSize):
    ret, input_image = cap.read() 
    kp1, des1 = orb.detectAndCompute(input_image, None)
    bowVec = np.zeros(shape=(1, bowSize), dtype=des1.dtype)
    for i in range(des1.shape[0]):
        wordVec = WordToVec(des1[i], wordToVecMap, layers, bowSize)
        bowVec  = bowVec + wordVec
    #bowVec = bowVec / des1.shape[0]
    #normalise
    sumBow = np.sum(bowVec)
    bowVec = bowVec / sumBow
    return bowVec



In [53]:
%%time
#phrase = frameTOBowVec(len(wordToVecMap.keys()))
#print(np.count_nonzero(phrase), phrase.shape)



CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [54]:
# plot phrase
def plotPhrase(phrase):
    lenFeatures = phrase.shape[1]
    x = []
    y = []
    for i in range(lenFeatures):
        y.append(phrase[0][i])
        x.append(i)
    plt.plot(x, y)
#plotPhrase(phrase)

In [55]:
#todo fix findWord and fix serialisation
class LayersLoader:
    def __init__(self):
        pass

In [56]:
#layers.children[71][0].children[94].children is None #71, 94

In [57]:
#layers.children[3][0].children[33].levelData.shape[0]


### 

In [58]:
#wordToVecMap, weights
# generate leveldata map
layers
def getLevelDict(_layers, outputMap = {}, above=[]):
    if len(above) == 0:
        outputMap["root"] = _layers.levelData.tolist()
    if _layers.children is not None:
        children = []
        lenChildren = len(_layers.children)
        for i in range(lenChildren):
            if type(_layers.children[i]) is tuple:
                child = _layers.children[i][0]
            else:
                child = _layers.children[i]
            children.append(child)
        if lenChildren == 0: #does nothing
            for i in range(_layers.levelData.shape[0]):
                path = above + [i]
                key = "-".join([str(j) for j in path])
                outputMap[key] = children[i].levelData.tolist()
            return
        for i in range(lenChildren):
            path = above + [i]
            key = "-".join([str(j) for j in path])
            outputMap[key] = children[i].levelData.tolist()
            getLevelDict(children[i], outputMap, above + [i])
    else:
        outputMap["-".join([str(j) for j in above])] = _layers.levelData.tolist()
    return outputMap

levelDataDict = getLevelDict(layers)
#print(levelDataDict.keys())

In [59]:
#levelDataDict["109"]

In [60]:
#layers.children[109][0].children[0].levelData

In [61]:
# generate children map
#root => [0, .., 109]
#0 => [0, ... 109] aka ["0-0", "0-1"]
def generateChildrenIdMap(_layers, parent=None, outputMap={}):
    if _layers.children is not None:
        children = []
        lenChildren = len(_layers.children)
        for i in range(lenChildren):
            if type(_layers.children[i]) is tuple:
                child = _layers.children[i][0] # traverseAllWords(_layers.children[i][0])
            else:
                child = _layers.children[i] # traverseAllWords()
            children.append(child)
        if parent is None:
            outputMap["root"] = [str(i) for i in range(lenChildren)]
            for i in range(lenChildren):
                generateChildrenIdMap(children[i], [i], outputMap)
        else:
            # parent  [0, 0, 3]
            myLevelKey = "-".join([str(j) for j in parent])
            outputMap[myLevelKey] = []
            for i in range(lenChildren):
                childPath = parent + [i]
                childKey = "-".join([str(j) for j in childPath])
                outputMap[myLevelKey].append(childKey)
                generateChildrenIdMap(children[i], [i], outputMap)
    return outputMap
#dont need this i dont think
childMap = generateChildrenIdMap(layers)
print(childMap)
        
    

{'root': ['0', '1', '2', '3'], '0': ['0-0', '0-1', '0-2', '0-3'], '1': ['1-0', '1-1', '1-2', '1-3'], '2': ['2-0', '2-1', '2-2', '2-3', '2-3', '2-3'], '3': ['3-0', '3-1', '3-2', '3-3']}


In [62]:
# we can finally serialise our model....
# you ready for this?

model = {
    "data": levelDataDict,
    "children": childMap, # depricate
    "wordIndex": wordToVecMap,
    "wordWeights": weights
}

model["wordWeights"]

{'1-3-1-1-3-1-0-0': 8e-05,
 '0-1-3-0-0-1-1-0': 4e-05,
 '2-1-0-0-1-0-1-0-0-1-3': 6e-05,
 '0-2-0-0-3-0-1-0-3-0': 4e-05,
 '1-1-1-1-3-0-0-2': 2e-05,
 '1-3-1-0-2-1-3-0': 2e-05,
 '2-1-0-0-1-0-0-2-0-0-1-1': 8e-05,
 '2-2-2-1-0-2-0-0': 4e-05,
 '1-0-0-0-1-0-1-0-0': 4e-05,
 '0-0-1-1-0-1-1-0-2-0': 4e-05,
 '0-3-0-2-0-0-1-2-0': 4e-05,
 '0-0-0-1-0-1-0-3-1': 2e-05,
 '0-2-1-0-2-3-0': 4e-05,
 '1-3-2-1-1-2-0': 2e-05,
 '2-1-0-3-0-1-1': 4e-05,
 '0-0-2-0-1-0-0-3-3-0': 2e-05,
 '2-0-1-2-1-0-3-0-0-3': 8e-05,
 '0-0-2-1-1-2-1': 2e-05,
 '0-1-3-2-0-0-0-0': 4e-05,
 '0-0-0-3-1-0-0-0-1-0': 4e-05,
 '1-0-1-0-1-1-1-0': 4e-05,
 '1-0-0-2-1-0-0-1-1': 2e-05,
 '0-0-0-0-0-1-1-2-1-0': 4e-05,
 '2-1-0-1-1-3-0-0-0': 4e-05,
 '3-0-0-3-1-1-1': 6e-05,
 '0-2-0-2-0-2-3-0': 6e-05,
 '0-1-1-1-1-1-0-0-0-3': 2e-05,
 '1-0-0-3-0-0-0-1-0-0-0': 8e-05,
 '0-1-2-1-3-0-2': 4e-05,
 '1-3-2-1-3-1-2': 4e-05,
 '0-3-1-2-0-2-2-0': 4e-05,
 '0-0-0-3-2-1-0-1-3': 2e-05,
 '1-1-0-0-1-0-3-2-0': 6e-05,
 '0-1-3-0-2-0-3-0': 4e-05,
 '0-1-3-0-0-0-1-1-0': 4e-05,
 '0-0

In [63]:
import pickle
with open('visionModelK4N50k-2.pickle', 'wb') as handle:
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

import json
with open('visionModelK4N50k-2.json', 'w') as handle:
    json.dump(model, handle)

In [64]:
layers.children[0]

(<__main__.Layer at 0x7f8f203491d0>,)

In [65]:
layers.children[1][0].children[1][0].children
type(layers.children[1][0].children[1][0])
layers.children[1][0].children[1][0].children[1].wordLayer

AttributeError: 'tuple' object has no attribute 'wordLayer'