In [1]:
# Imports
from openpyxl import load_workbook
import os
import networkx as nx
import matplotlib.pyplot as plt
from bokeh.models import Range1d, Circle, MultiLine, ColorBar, LinearColorMapper, LogColorMapper
from bokeh.models.layouts import TabPanel, Tabs
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.transform import linear_cmap, log_cmap
from bokeh.palettes import Inferno256
from bokeh.io import show, output_notebook
import json
import os
import time
import hashlib
from methods import (printProgressBar, build_graphs, build_plots, 
                     import_text, component_graphs, merge_datasets, tabbed_plot, 
                     plot_graph_bokeh, cleanList, get_sheetnames_xlsx,
                     sortDict, plotVariablesByFreq)


In [2]:
# Folder holding the raw workbooks (.xlsx); scanned for sheet names in the next cell.
RawDataPath = r'C:\DC24_1\dataset_1_original_combined'

# Folder holding the parsed datasets (.json); the Audio/Video/Documents cells read
# one .json file per entry of this folder.
ParsedDataPath = r'C:\DC24_1\dataset_parsed_combined'

# Folders holding the media files whose MD5 hashes are calculated later on.
# NOTE(review): imagePath is not referenced by any cell visible in this notebook.
audioPath = r'C:\DC24_1\audio'
videoPath = r'C:\DC24_1\video'
docsPath =  r'C:\DC24_1\doc'
imagePath = r'C:\DC24_1\image'

# Intended folder for the generated output data (.txt).
# NOTE(review): the cells visible in this notebook write to the relative 'data/'
# folder and never read dataOut -- confirm which location is intended.
dataOut = r'James_Code\data'

In [3]:
# Maps each readable workbook's file name to the list of its sheet names.
xlsxDicionary = {}

# For every file in RawDataPath, try to obtain the sheet names, then update the
# dictionary. Files that cannot be read as .xlsx (corrupted files or legacy .xls
# workbooks) are reported and skipped. In this particular dataset only 3 files
# were of this nature, which was also the reasoning to pivot towards the parsed
# datasets. Reformatting such files is still to be implemented.

# List the folder once so the progress-bar total always matches the loop.
rawFileNames = os.listdir(RawDataPath)
l = len(rawFileNames)

for p, fileName in enumerate(rawFileNames):
    filepath = RawDataPath + '/' + fileName

    try:
        sheetnames = list(get_sheetnames_xlsx(filepath))
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupts still work.
        print(fileName + " is not the proper format (.xlsx). It is either corrupted or needs to be reformatted.")
    else:
        # Store the result only when the read succeeded. Storing it unconditionally
        # would record the previous file's sheet names under the unreadable file
        # (or raise NameError when the very first file is unreadable).
        xlsxDicionary[fileName] = sheetnames

    # Short pause that is part of the progress-bar graphic.
    time.sleep(0.1)
    printProgressBar(p+1, l, prefix='Progress:', suffix='Complete', length=50)


file_30.xls is not the proper format (.xlsx). It is either corrupted or needs to be reformatted.
file_652.xls is not the proper format (.xlsx). It is either corrupted or needs to be reformatted.
file_711.xls is not the proper format (.xlsx). It is either corrupted or needs to be reformatted.
Progress: |████████████████████████████████████████████████--| 97.9% Complete

Below, the sheet names found in the Excel files are plotted by how frequently they appear in the dataset.

In [None]:
# Plot the kinds of data in the files by how frequently they occur.
# xlsxDicionary maps each workbook file name to its list of sheet names (built in the cell above).
plotVariablesByFreq(xlsxDicionary)

Audio Section:
=============

In [None]:
# audioList holds the 'Audio' entries of the record most recently scanned;
# masterDictAudio maps each parsed .json file name to its cleaned list of audio MD5 hashes.
audioList = []
masterDictAudio = {}

# List the folder once so the progress-bar total and the loop always agree.
parsedFileNames = os.listdir(ParsedDataPath)
l = len(parsedFileNames)
printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)

# Each .json file in the path represents a device. Navigate to the Audio files
# stored on the device and collect all of their MD5 hash values.
for j, file in enumerate(parsedFileNames):
    hashList = []
    hasAudio = False
    with open(ParsedDataPath + '/' + file) as json_file:
        json_data = json.load(json_file)

    for item in json_data:
        entries = item.get('Audio')
        if not entries:
            continue
        audioList = entries
        hasAudio = True
        for entry in audioList:
            md5Value = entry.get('md5')
            if md5Value:
                hashList.append(md5Value)

    # Devices without any audio entries get no key, as before. The list is
    # cleaned once per device instead of once per audio entry.
    # NOTE(review): assumes cleanList has no side effects beyond its return value.
    if hasAudio:
        masterDictAudio[file] = cleanList(hashList)

    # Short pause that is part of the progress-bar graphic.
    time.sleep(0.1)
    printProgressBar(j+1, l, prefix='Progress:', suffix='Complete', length=50)

# Write the audio master dictionary to a .txt file for later use.
# Create the output folder first so a fresh checkout does not fail here.
os.makedirs('data', exist_ok=True)
with open(r'data/audiologs.txt', 'w') as f:
    f.write(str(masterDictAudio))


In [None]:
# Maps each audio file name to a one-element list with its MD5 hex digest:
#   {'audio file name': ['MD5 hash value']}
Audiomd5 = {}

# We have a folder full of audio files. The names of the files may not be helpful
# for linking devices together, as users can easily rename files independently.
# Instead, we look at the MD5 hash value of each file.
for audioName in os.listdir(audioPath):
    # Hash in chunks inside a `with` block: the handle is always closed and a
    # large media file is never loaded into memory in one piece.
    digest = hashlib.md5()
    with open(audioPath + '/' + audioName, 'rb') as mediaFile:
        for chunk in iter(lambda: mediaFile.read(1024 * 1024), b''):
            digest.update(chunk)
    Audiomd5[audioName] = [digest.hexdigest()]

# Write the md5 data to a .txt file for later use (create the folder if needed).
os.makedirs('data', exist_ok=True)
with open(r'data/AudioMD5logs.txt', 'w', encoding='utf-8') as f:
    f.write(str(Audiomd5))

# Importing datasets from .txt files
datasetsAudio = {}
datasetsAudio['Audio to MD5'] = import_text(r'data/AudioMD5logs.txt')
datasetsAudio['All MD5'] = import_text(r'data/audiologs.txt')

# Find the MD5 hash values common to the ones we calculated and the ones found in the dataset.
# hashVal is the list of values calculated from our audio file folder.
hashVal = list(datasetsAudio['Audio to MD5'].values())

# Reduce every calculated value to a bare hash string once, rather than once per
# device. The str()/strip() chain removes the list brackets and quotes around the hash.
calculatedHashes = [
    str(val).strip("]").strip("'").strip("[").strip("'")
    for val in hashVal
]

# deviceWithPosMD5 has format {'.json file name' : [list of matched/common MD5 values]}.
# Every device gets an entry, even when nothing matched (or no hashes were calculated at all),
# so unmatched devices still show up as unlinked nodes in the graphs.
deviceWithPosMD5 = {}
for k, m in datasetsAudio['All MD5'].items():
    deviceWithPosMD5[k] = [h for h in calculatedHashes if h in m]

# Write the deviceWithPosMD5 data to a .txt file for later use.
with open(r'data/devWithPosAudioMD5.txt', 'w') as f:
    f.write(str(deviceWithPosMD5))

# When we draw the graphs, nodes in 'Devices with MD5' with no links attached represent
# devices (.json files) with no common MD5 hash values.
# The 'Audio file -> MD5' graph just links audio file names to their calculated MD5 hash value.
# The merged graph draws both prior graphs onto one pane and connects common nodes.

# New dictionary for drawing the graphs. All we care about are the files/devices
# that share MD5 hash values with our audio files.
finalDatasetAudio = {}

# Importing datasets from .txt files
finalDatasetAudio['Devices with MD5'] = import_text(r'data/devWithPosAudioMD5.txt')
finalDatasetAudio['Audio file -> MD5'] = import_text(r'data/AudioMD5logs.txt')

# Merge the dictionaries into one
finalDatasetAudio['merged'] = merge_datasets(finalDatasetAudio['Audio file -> MD5'], finalDatasetAudio['Devices with MD5'])

# Generating networkx graphs for each dataset
graphsAudio = build_graphs(finalDatasetAudio)
build_plots(graphsAudio)

# Select the largest connected components of the merged graph for plotting.
# NOTE(review): the original note said "the 8 largest components; the 9th largest becomes
# just 1 link between 2 nodes" while components_selection=9 is passed -- confirm the
# meaning of components_selection against component_graphs.
components = component_graphs(graphsAudio['merged'], components_selection=9)
node_highlights = finalDatasetAudio['merged'].keys()

output_notebook()

# NOTE(review): `plot` and `line` are never shown or read in this cell; they look like
# leftover example code and are kept only so the cell's variables stay unchanged.
plot = figure(title='Update Example', x_axis_label='Time', y_axis_label='Value')
line = plot.line([], [])
handle = show(tabbed_plot(finalDatasetAudio['Devices with MD5'], components, node_highlights=node_highlights))

Video Section:
==============

In [None]:
# videoList holds the 'Videos' entries of the record most recently scanned;
# masterDictVideo maps each parsed .json file name to its cleaned list of video MD5 hashes.
videoList = []
masterDictVideo = {}

# List the folder once so the progress-bar total and the loop always agree.
parsedFileNames = os.listdir(ParsedDataPath)
l = len(parsedFileNames)
printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)

# Each .json file in the path represents a device. Navigate to the video files
# stored on the device and collect all of their MD5 hash values.
for j, file in enumerate(parsedFileNames):
    hashList = []
    hasVideo = False
    with open(ParsedDataPath + '/' + file) as json_file:
        json_data = json.load(json_file)

    for item in json_data:
        entries = item.get('Videos')
        if not entries:
            continue
        videoList = entries
        hasVideo = True
        for entry in videoList:
            md5Value = entry.get('md5')
            if md5Value:
                hashList.append(md5Value)

    # Devices without any video entries get no key, as before. The list is
    # cleaned once per device instead of once per video entry.
    # NOTE(review): assumes cleanList has no side effects beyond its return value.
    if hasVideo:
        masterDictVideo[file] = cleanList(hashList)

    # Short pause that is part of the progress-bar graphic.
    time.sleep(0.1)
    printProgressBar(j+1, l, prefix='Progress:', suffix='Complete', length=50)

# Write the video master dictionary to a .txt file for later use.
# Create the output folder first so a fresh checkout does not fail here.
os.makedirs('data', exist_ok=True)
with open(r'data/videologs.txt', 'w') as f:
    f.write(str(masterDictVideo))

In [None]:
# Maps each video file name to a one-element list with its MD5 hex digest:
#   {'video file name': ['MD5 hash value']}
md5Video = {}

# We have a folder full of video files. The names of the files may not be helpful
# for linking devices together, as users can easily rename files independently.
# Instead, we look at the MD5 hash value of each file.
for videoName in os.listdir(videoPath):
    # Hash in chunks inside a `with` block: the handle is always closed and a
    # large video file is never loaded into memory in one piece.
    digest = hashlib.md5()
    with open(videoPath + '/' + videoName, 'rb') as mediaFile:
        for chunk in iter(lambda: mediaFile.read(1024 * 1024), b''):
            digest.update(chunk)
    md5Video[videoName] = [digest.hexdigest()]

# Write the md5 data to a .txt file for later use (create the folder if needed).
os.makedirs('data', exist_ok=True)
with open(r'data/VideoMD5logs.txt', 'w', encoding='utf-8') as f:
    f.write(str(md5Video))

# Importing datasets from .txt files
datasetsVideo = {}
datasetsVideo['Video to MD5'] = import_text(r'data/VideoMD5logs.txt')
datasetsVideo['All MD5'] = import_text(r'data/videologs.txt')

# hashVal is the list of values calculated from our video file folder.
hashVal = list(datasetsVideo['Video to MD5'].values())

# Reduce every calculated value to a bare hash string once, rather than once per
# device. The str()/strip() chain removes the list brackets and quotes around the hash.
calculatedHashes = [
    str(val).strip("]").strip("'").strip("[").strip("'")
    for val in hashVal
]

# deviceWithPosMD5 has format {'.json file name' : [list of matched/common MD5 values]}.
# Every device gets an entry, even when nothing matched (or no hashes were calculated at all),
# so unmatched devices still show up as unlinked nodes in the graphs.
deviceWithPosMD5 = {}
for k, m in datasetsVideo['All MD5'].items():
    deviceWithPosMD5[k] = [h for h in calculatedHashes if h in m]

# Write the deviceWithPosMD5 data to a .txt file for later use.
with open(r'data/devWithPosVideoMD5.txt', 'w') as f:
    f.write(str(deviceWithPosMD5))

# New dictionary for drawing the graphs.
finalDatasetVideo = {}

# Importing datasets from .txt files
finalDatasetVideo['Devices with MD5'] = import_text(r'data/devWithPosVideoMD5.txt')
finalDatasetVideo['Video file -> MD5'] = import_text(r'data/VideoMD5logs.txt')

# Merge the dictionaries into one
finalDatasetVideo['merged'] = merge_datasets(finalDatasetVideo['Video file -> MD5'], finalDatasetVideo['Devices with MD5'])

# Generating networkx graphs for each dataset
graphsVideo = build_graphs(finalDatasetVideo)
build_plots(graphsVideo)

# Select the largest connected components of the merged graph for plotting.
# NOTE(review): the original note ("the 8 largest components; the 9th largest becomes just
# 1 link between 2 nodes") does not match components_selection=10 -- confirm the intended
# value and the meaning of components_selection against component_graphs.
components = component_graphs(graphsVideo['merged'], components_selection=10)
node_highlights = finalDatasetVideo['merged'].keys()

output_notebook()

# NOTE(review): `plot` and `line` are never shown or read in this cell; they look like
# leftover example code and are kept only so the cell's variables stay unchanged.
plot = figure(title='Update Example', x_axis_label='Time', y_axis_label='Value')
line = plot.line([], [])
handle = show(tabbed_plot(finalDatasetVideo['Devices with MD5'], components, node_highlights=node_highlights))

Merged audio and video graphs:
================================

In [None]:
# Combine the Audio and Video link datasets so both can be explored in one graph.
masterDataDict = {
    'Audio': finalDatasetAudio['merged'],
    'Video': finalDatasetVideo['merged'],
}
masterDataDict['Audio and Video'] = merge_datasets(
    masterDataDict['Audio'],
    masterDataDict['Video'],
)

# Re-organise the per-device match dictionaries into the single dictionary that
# tabbed_plot() is given below.
posMD5Dict = merge_datasets(
    finalDatasetVideo['Devices with MD5'],
    finalDatasetAudio['Devices with MD5'],
)

# Build and draw one networkx graph per entry of masterDataDict.
graphsMaster = build_graphs(masterDataDict)
build_plots(graphsMaster)

# Select the largest connected components of the combined graph for plotting.
# NOTE(review): the note carried over from the earlier sections mentions "the 8 largest
# components" while components_selection=10 is passed here -- confirm the intended value.
components = component_graphs(
    graphsMaster['Audio and Video'],
    components_selection=10,
)
node_highlights = masterDataDict['Audio and Video'].keys()

output_notebook()

# NOTE(review): `plot` and `line` are created but never shown or read in this cell.
plot = figure(title='Update Example', x_axis_label='Time', y_axis_label='Value')
line = plot.line([], [])

handle = show(
    tabbed_plot(posMD5Dict, components, node_highlights=node_highlights)
)

Documents Section:
======================

In [None]:
# docsList holds the 'Document' entries of the record most recently scanned;
# masterDictDocs maps each parsed .json file name to its cleaned list of document MD5 hashes.
docsList = []
masterDictDocs = {}

# List the folder once so the progress-bar total and the loop always agree.
parsedFileNames = os.listdir(ParsedDataPath)
l = len(parsedFileNames)
printProgressBar(0, l, prefix='Progress:', suffix='Complete', length=50)

# Each .json file in the path represents a device. Navigate to the document files
# stored on the device and collect all of their MD5 hash values.
for j, file in enumerate(parsedFileNames):
    hashList = []
    hasDocs = False
    with open(ParsedDataPath + '/' + file) as json_file:
        json_data = json.load(json_file)

    for item in json_data:
        entries = item.get('Document')
        if not entries:
            continue
        docsList = entries
        hasDocs = True
        for entry in docsList:
            md5Value = entry.get('md5')
            if md5Value:
                hashList.append(md5Value)

    # Devices without any document entries get no key, as before. The list is
    # cleaned once per device instead of once per document entry.
    # NOTE(review): assumes cleanList has no side effects beyond its return value.
    if hasDocs:
        masterDictDocs[file] = cleanList(hashList)

    # Short pause that is part of the progress-bar graphic.
    time.sleep(0.1)
    printProgressBar(j+1, l, prefix='Progress:', suffix='Complete', length=50)

# Write the documents master dictionary to a .txt file for later use.
# Create the output folder first so a fresh checkout does not fail here.
os.makedirs('data', exist_ok=True)
with open(r'data/docslogs.txt', 'w') as f:
    f.write(str(masterDictDocs))


In [None]:

# Maps each document file name to a one-element list with its MD5 hex digest:
#   {'docs file name': ['MD5 hash value']}
md5Docs = {}

# We have a folder full of document files. The names of the files may not be helpful
# for linking devices together, as users can easily rename files independently.
# Instead, we look at the MD5 hash value of each file.
for docName in os.listdir(docsPath):
    # Hash in chunks inside a `with` block: the handle is always closed and a
    # large file is never loaded into memory in one piece.
    digest = hashlib.md5()
    with open(docsPath + '/' + docName, 'rb') as mediaFile:
        for chunk in iter(lambda: mediaFile.read(1024 * 1024), b''):
            digest.update(chunk)
    md5Docs[docName] = [digest.hexdigest()]

# Write the md5 data to a .txt file for later use (create the folder if needed).
os.makedirs('data', exist_ok=True)
with open(r'data/DocsMD5logs.txt', 'w', encoding='utf-8') as f:
    f.write(str(md5Docs))

# Importing datasets from .txt files
datasetsDocs = {}
datasetsDocs['Docs to MD5'] = import_text(r'data/DocsMD5logs.txt')
datasetsDocs['All MD5'] = import_text(r'data/docslogs.txt')

# hashVal is the list of values calculated from our docs file folder.
hashVal = list(datasetsDocs['Docs to MD5'].values())

# Reduce every calculated value to a bare hash string once, rather than once per
# device. The str()/strip() chain removes the list brackets and quotes around the hash.
calculatedHashes = [
    str(val).strip("]").strip("'").strip("[").strip("'")
    for val in hashVal
]

# deviceWithPosMD5 has format {'.json file name' : [list of matched/common MD5 values]}.
# Every device gets an entry, even when nothing matched (or no hashes were calculated at all),
# so unmatched devices still show up as unlinked nodes in the graphs.
deviceWithPosMD5 = {}
for k, m in datasetsDocs['All MD5'].items():
    deviceWithPosMD5[k] = [h for h in calculatedHashes if h in m]

# Write the deviceWithPosMD5 data to a .txt file for later use.
with open(r'data/devWithPosDocsMD5.txt', 'w', encoding="utf-8") as f:
    f.write(str(deviceWithPosMD5))

# New dictionary for drawing the graphs.
finalDatasetDocs = {}

# Importing datasets from .txt files
finalDatasetDocs['Devices with MD5'] = import_text(r'data/devWithPosDocsMD5.txt')
finalDatasetDocs['Docs file -> MD5'] = import_text(r'data/DocsMD5logs.txt')

# Merge the dictionaries into one
finalDatasetDocs['merged'] = merge_datasets(finalDatasetDocs['Docs file -> MD5'], finalDatasetDocs['Devices with MD5'])

# Generating networkx graphs for each dataset
graphsDocs = build_graphs(finalDatasetDocs)
build_plots(graphsDocs)

# Select the largest connected components of the merged graph for plotting.
# NOTE(review): the original note ("the 8 largest components; the 9th largest becomes just
# 1 link between 2 nodes") does not match components_selection=5 -- confirm the intended
# value and the meaning of components_selection against component_graphs.
components = component_graphs(graphsDocs['merged'], components_selection=5)
node_highlights = finalDatasetDocs['merged'].keys()

output_notebook()

# NOTE(review): `plot` and `line` are never shown or read in this cell; they look like
# leftover example code and are kept only so the cell's variables stay unchanged.
plot = figure(title='Update Example', x_axis_label='Time', y_axis_label='Value')
line = plot.line([], [])
handle = show(tabbed_plot(finalDatasetDocs['Devices with MD5'], components, node_highlights=node_highlights))