### Copy files from the hooklog pool into classified family directories
1. Read the MD5s from the given pickle dictionary (generated by Mike's AvVendorReport.py).
2. Check whether a trace for each MD5 exists in the hooklog pool.
3. If it does, copy the trace to the target family directory (i.e. classify it).

In [None]:
import pickle
import os
import shutil

dataPickle = "Mike_1026/01.pickle"
family_root_dir = "hooklogs_by_family_7137HK/"
hkPoolDir = "hooklogs/7137HK/"

# Read the target pickle file; it is iterated below as {family name: md5 collection}.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# that come from a trusted source.
with open(dataPickle, 'rb') as handle:
    dataContent = pickle.load(handle)

# Walk the pool once and remember (full path, file name, md5) for every trace.
# os.path.join is required here: below the top level, os.walk yields roots
# without a trailing separator, so plain string concatenation builds a wrong path.
poolEntries = []
for root, dirs, files in os.walk(hkPoolDir):
    for fEntry in files:
        # The md5 is the part of the file name before the first underscore.
        poolEntries.append((os.path.join(root, fEntry), fEntry, fEntry.split("_")[0]))

# Copy every pooled trace whose md5 belongs to a family into that family's directory.
for famName, md5s in dataContent.items():
    familyDir = os.path.join(family_root_dir, famName)
    for path, fEntry, md5 in poolEntries:
        if md5 in md5s:
            # Created lazily so families without any pooled trace get no directory.
            os.makedirs(familyDir, exist_ok=True)
            shutil.copyfile(path, os.path.join(familyDir, fEntry))

### Classify the generations of a variant
1. Check whether a variant forks a child process and a grandchild process.
2. Define the relationships between these processes.
3. Separate these processes into different directories inside the family directory.

In [None]:
def getOriginalFileName(pid, originalFileNames):
    """Return the first file name whose embedded id equals *pid*, else None.

    The id of a file name is the text after its last underscore, cut at the
    first dot of that tail (e.g. ``"abc_123.trace"`` has the id ``"123"``).
    The comparison is a plain ``==``, so *pid* must be a string to match.
    """
    matches = (
        candidate
        for candidate in originalFileNames
        if pid == candidate.rsplit('_', 1)[-1].partition('.')[0]
    )
    return next(matches, None)

In [None]:
# Key API Name = CreateProcessInternal
# Key Attribute = dwProcessId

def _readSpawnedPids(tracePath, keyAPI, keyAttribute):
    """Return the process ids a trace file reports creating, in first-seen order.

    A record starts with a line beginning with '#', followed by a line holding
    the API name. For records whose API is *keyAPI*, up to five following lines
    are searched for a line starting with *keyAttribute*; the text after its
    '=' is collected. The search of a record stops early when a "Return" line
    with a value other than "SUCCESS" is met.
    """
    pids = []
    # Binary mode with a per-line decode keeps the original newline handling.
    with open(tracePath, 'rb') as handle:
        while True:
            raw = handle.readline()
            if not raw:  # readline() returns b'' only at end of file
                break
            line = raw.decode("windows-1252").strip()  # MIKE: 20170616, for python 3
            # Blank lines and non-header lines are skipped instead of ending the scan.
            if not line.startswith('#'):
                continue
            api = handle.readline().decode("windows-1252").strip()  # see api name
            if api != keyAPI:
                continue
            for _ in range(5):  # bounded look-ahead, defends against an endless scan
                newLine = handle.readline().decode("windows-1252").strip()
                if newLine.startswith("Return") and newLine.split('=')[1] != "SUCCESS":
                    break
                if newLine.startswith(keyAttribute):
                    pid = newLine.split('=')[1]
                    if pid not in pids:
                        pids.append(pid)
                    break
    return pids


def getSampleRelation(familyPath, sampleMD5Dict):
    """Work out which traces of a sample are main, child and grandchild processes.

    Parameters:
        familyPath: directory prefix of the trace files; it is concatenated
            directly with each file name, so it must end with a path separator.
        sampleMD5Dict: {md5: [trace file names]}. Groups holding exactly one
            file are skipped without being opened.

    Returns:
        {main trace: [entry, ...]} where an entry is either a child trace name
        (no grandchild found) or {child trace: [grandchild trace, ...]}.
        Main traces without any resolvable child are omitted. Only two
        generations below a main trace are reported.

    A trace is linked to a pid through its file name: the text after the last
    underscore, cut at the first dot. A trace counts as a main process only if
    no other trace of the same md5 group spawned it, so the result does not
    depend on the order of the file list, and each trace is reported as a
    descendant at most once.
    """
    keyAPI = "CreateProcessInternal"
    keyAttribute = "dwProcessId"

    result = dict()
    for traceFiles in sampleMD5Dict.values():
        if len(traceFiles) == 1:  # if single file, skip it.
            continue

        spawnedPids = dict()  # trace -> pids it created
        pidToTrace = dict()   # pid taken from the file name -> trace (first one wins)
        for trace in traceFiles:  # trace all files if they have same md5
            spawnedPids[trace] = _readSpawnedPids(familyPath + trace, keyAPI, keyAttribute)
            pidToTrace.setdefault(trace.split('_')[-1].split('.')[0], trace)

        # Resolve spawned pids to trace files of this group.
        # Maybe a child process isn't in the family directory (due to profiling bug);
        # such pids are dropped, as are self references.
        childFiles = dict()
        for trace in traceFiles:
            names = []
            for pid in spawnedPids[trace]:
                name = pidToTrace.get(pid)
                if name is not None and name != trace and name not in names:
                    names.append(name)
            childFiles[trace] = names

        hasParent = set()
        for names in childFiles.values():
            hasParent.update(names)

        placed = set()  # traces already reported as child or grandchild
        for trace in traceFiles:
            if trace in hasParent:
                continue
            entries = []
            for childFile in childFiles[trace]:
                if childFile in placed:
                    continue
                placed.add(childFile)
                grandChildren = []
                for gcFile in childFiles[childFile]:
                    if gcFile in placed:
                        continue
                    placed.add(gcFile)
                    grandChildren.append(gcFile)
                # A dict entry carries the grandchildren; a bare name means none were found.
                if grandChildren:
                    entries.append({childFile: grandChildren})
                else:
                    entries.append(childFile)
            if entries:
                result[trace] = entries

    return result

In [None]:
def moveOtherMainProcs(familyPath, mainDir):
    """Move every regular file left directly inside *familyPath* into *mainDir*.

    Sub-directories of *familyPath* are left untouched. *mainDir* is created
    on demand, i.e. only when there is at least one file to move.
    """
    for mainProc in os.listdir(familyPath):
        source = os.path.join(familyPath, mainProc)
        # The entry has to be tested with its directory prefix: a bare name
        # would be resolved against the current working directory.
        if not os.path.isfile(source):
            continue
        # Without an existing target directory shutil.move would treat
        # mainDir as the new file name instead of moving the file into it.
        os.makedirs(mainDir, exist_ok=True)
        shutil.move(source, mainDir)

In [None]:
def separateProcessByGeneration(familyPath, relationDict):
    """Sort the traces of one family into 'main/', 'child/' and 'grand child/'.

    Parameters:
        familyPath: family directory prefix; it is concatenated directly with
            file and sub-directory names.
        relationDict: {main trace: [entry, ...]} where an entry is either a
            child trace name or a dict {child trace: [grandchild traces]}.

    Each key is moved to 'main/'. A plain entry is moved to 'child/'. For a
    dict entry both 'child/' and 'grand child/' are created, then the keys are
    moved to 'child/' and the listed values to 'grand child/'. Finally
    moveOtherMainProcs is called for the family directory and 'main/'.
    """
    mainDir = familyPath + 'main/'
    childDir = familyPath + 'child/'
    grandChildDir = familyPath + 'grand child/'

    def ensureDir(path):
        # Create the generation directory the first time it is needed.
        if not os.path.isdir(path):
            os.makedirs(path)

    print(familyPath)

    for mainProc, descendants in relationDict.items():
        ensureDir(mainDir)
        shutil.move(familyPath + mainProc, mainDir)

        for entry in descendants:
            if type(entry) is not dict:
                # A plain name is a child without grandchildren.
                ensureDir(childDir)
                shutil.move(familyPath + entry, childDir)
                continue

            ensureDir(childDir)
            ensureDir(grandChildDir)
            for childName, grandChildNames in entry.items():
                shutil.move(familyPath + childName, childDir)
                for grandChildName in grandChildNames:
                    shutil.move(familyPath + grandChildName, grandChildDir)

    moveOtherMainProcs(familyPath, mainDir)

In [None]:
# For every family directory: group its trace files by md5, work out the
# process relations and sort the traces into generation directories.
familyDirs = os.listdir(family_root_dir)
for familyDir in familyDirs:
    familyPath = os.path.join(family_root_dir, familyDir)
    if not os.path.isdir(familyPath):
        continue

    sampleMD5Dict = dict()
    # Only files lying directly in the family directory are considered.
    # Walking recursively would also pick up traces already sorted into
    # sub-directories by an earlier run, and those bare names could no longer
    # be opened relative to the family directory.
    for fEntry in os.listdir(familyPath):
        if fEntry == '.DS_Store':  # MacOS file system file.
            continue
        if not os.path.isfile(os.path.join(familyPath, fEntry)):
            continue

        # The md5 is the part of the file name before the first underscore.
        md5 = fEntry.split("_")[0]
        sampleMD5Dict.setdefault(md5, []).append(fEntry)

    # Both helpers concatenate the path with file names, hence the trailing '/'.
    relationDict = getSampleRelation(familyPath + '/', sampleMD5Dict)
    separateProcessByGeneration(familyPath + '/', relationDict)