In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that the
# '/content/drive/My Drive/...' result directories used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
from openpyxl import load_workbook

# Relative Entropy

In [None]:
def relativeEntropy(matrix1, matrix2, size=None):
  """Return the relative entropy (KL divergence) of matrix1 with respect to matrix2.

  Computes sum(p * ln(p / q)) over the first `size` rows and (at most) the
  first four columns of the two matrices, where p comes from `matrix1` and
  q from `matrix2`.

  Args:
    matrix1: 2-D sequence/array of numbers (the "p" distribution rows).
    matrix2: 2-D sequence/array of numbers (the "q" distribution rows).
    size: number of leading rows to include; defaults to every row of
      `matrix1` when omitted.

  Returns:
    The relative entropy as a float (natural-log units). 0.0 when no rows
    are selected.

  Raises:
    IndexError: if `size` exceeds the number of rows in either matrix.
    ValueError: if the inputs are not 2-D or the compared regions differ in shape.
  """
  p = np.asarray(matrix1, dtype=float)
  q = np.asarray(matrix2, dtype=float)

  if size is None:
    size = len(p)
  if size <= 0:
    return 0.0

  if p.ndim != 2 or q.ndim != 2:
    raise ValueError('relativeEntropy expects two 2-D matrices')
  if size > p.shape[0] or size > q.shape[0]:
    raise IndexError('size exceeds the number of rows in the matrices')

  # Only the first four columns take part in the comparison.
  p = p[:size, :4]
  q = q[:size, :4]
  if p.shape != q.shape:
    raise ValueError('matrices differ in shape: %s vs %s' % (p.shape, q.shape))

  # By convention 0 * ln(0 / q) contributes 0; evaluating it directly gives
  # NaN, which would poison the whole sum, so zero cells of p are skipped.
  nonzero = p != 0
  return float(np.sum(p[nonzero] * np.log(p[nonzero] / q[nonzero])))

In [None]:
# Relative entropy between two motif files: path1 is the base matrix,
# path2 is the reference matrix.
_PSEUDOCOUNT = 0.0001  # epsilon added to every cell so no probability is exactly 0


def _read_motif_rows(path):
  """Parse a motif file into rows holding the first four numbers of each data line."""
  rows = []
  with open(path, 'r') as motif_file:
    for line in motif_file:
      stripped = line.strip()
      # Skip blank lines and the '>' / '<' marker lines.
      if not stripped or stripped.startswith(('>', '<')):
        continue
      # Split on any whitespace run so tabs / repeated spaces are tolerated.
      rows.append([float(token) for token in stripped.split()[0:4]])
  if not rows:
    raise ValueError('No matrix rows found in ' + str(path))
  return rows


def _scale_by_first_row_sum(rows):
  """Divide every cell by the sum of the first row (all rows are expected to share it)."""
  total = sum(rows[0])
  return [[value / total for value in row] for row in rows]


def calc_relative_entropy(path1, path2):
  """Compute the relative entropy between the motif matrices stored in two files.

  Each file is read line by line; lines starting with '>' or '<' (and blank
  lines) are ignored and the first four whitespace-separated numbers of every
  remaining line form one matrix row. Each matrix is scaled by the sum of its
  first row, a small pseudocount is added to every cell to avoid division by
  zero / log(0), and the matrix is rescaled by the first-row sum again before
  being passed to `relativeEntropy`.

  Args:
    path1: path of the base matrix file (e.g. the predicted motif).
    path2: path of the reference matrix file.

  Returns:
    The value returned by `relativeEntropy` over all rows of the base matrix.

  Raises:
    ValueError: if a file contains no matrix rows or a cell is not numeric.
    OSError: if a file cannot be opened.
  """
  smoothed = []
  for path in (path1, path2):
    probabilities = _scale_by_first_row_sum(_read_motif_rows(path))
    with_pseudocount = [[value + _PSEUDOCOUNT for value in row] for row in probabilities]
    # Renormalise so the rows are distributions again after adding epsilon.
    smoothed.append(_scale_by_first_row_sum(with_pseudocount))

  base, reference = smoothed
  return relativeEntropy(base, reference, len(base))

In [None]:
# Disabled scratch driver, kept as a bare string literal so it never executes:
# runs calc_relative_entropy on predictedmotif.txt vs motif.txt for
# sub-directories 1..70 of Dataset-1.2 and prints one entropy per dataset.
'''result_dir = '/content/drive/My Drive/CS466/Project/Result/Dataset-1.2'
for dataset_no in range(1, 71):
  data_dir = result_dir + '/' + str(dataset_no)
  path1 = data_dir + '/predictedmotif.txt' 
  path2 = data_dir + '/motif.txt'
  entropy = calc_relative_entropy(path1, path2)
  print('dataset_no', dataset_no, 'entropy', entropy)'''

# Site Overlap

In [None]:
def calc_site_overlap(path1, path2):
  """Return the percentage of positions at which two site files agree.

  The first line of each file is read as a comma-separated list of integers.
  The lists are compared element by element (index i against index i).

  Args:
    path1: path of the first site file (e.g. predicted sites).
    path2: path of the second site file (e.g. true sites).

  Returns:
    The share of matching positions as a percentage in [0, 100], or -1
    (after printing an error message) when the two lists differ in length.

  Raises:
    ValueError: if an entry on the first line is not an integer.
    OSError: if a file cannot be opened.
  """
  # `with` guarantees both handles are closed, including on parse errors.
  with open(path1, 'r') as file1:
    pos1 = [int(pos) for pos in file1.readline().strip().split(',')]
  with open(path2, 'r') as file2:
    pos2 = [int(pos) for pos in file2.readline().strip().split(',')]

  if len(pos1) != len(pos2):
    print('ERROR! Size mismatch!!')
    return -1

  matches = sum(1 for first, second in zip(pos1, pos2) if first == second)
  return (matches / len(pos1)) * 100


In [None]:
# Disabled scratch driver, kept as a bare string literal so it never executes:
# runs calc_site_overlap on predictedsites.txt vs sites.txt for
# sub-directories 1..70 of Dataset-1.5 and prints one overlap per dataset.
'''result_dir = '/content/drive/My Drive/CS466/Project/Result/Dataset-1.5'
for dataset_no in range(1, 71):
  data_dir = result_dir + '/' + str(dataset_no)
  path1 = data_dir + '/predictedsites.txt' 
  path2 = data_dir + '/sites.txt'
  overlap = calc_site_overlap(path1, path2)
  print('dataset_no', dataset_no, 'overlap', overlap)'''

# Information Content (IC) & Running Time

In [None]:
def calc_info_content_runtime(path1, path2):
  """Read the information content and the running time from two one-value files.

  Args:
    path1: path of the file whose first line holds the information content.
    path2: path of the file whose first line holds the running time.

  Returns:
    A tuple `(ic, rt)` of floats parsed from the first line of each file.

  Raises:
    ValueError: if a first line is not a valid float.
    OSError: if a file cannot be opened.
  """
  # `with` closes each handle even when float() rejects the content.
  with open(path1, 'r') as ic_file:
    ic = float(ic_file.readline().strip())

  with open(path2, 'r') as rt_file:
    rt = float(rt_file.readline().strip())

  return ic, rt


In [None]:
# Disabled scratch driver, kept as a bare string literal so it never executes.
# NOTE(review): stale -- it passes a single 'summary.txt' path, while
# calc_info_content_runtime takes two paths (path1, path2); re-enabling it
# unchanged would raise a TypeError.
'''result_dir = '/content/drive/My Drive/CS466/Project/Result/Dataset-1.5'
for dataset_no in range(1, 71):
  data_dir = result_dir + '/' + str(dataset_no)
  path = data_dir + '/summary.txt' 
  ic, rt = calc_info_content_runtime(path)
  print('dataset_no', dataset_no, 'ic', ic, 'rt', rt)'''

# Evaluation

In [None]:
def evaluate(dataset_name,
             result_root='/content/drive/My Drive/CS466/Project/Result',
             num_datasets=70):
  """Compute all metrics for one dataset family and store them in its workbook.

  For every numbered sub-directory 1..num_datasets of
  `<result_root>/<dataset_name>` this computes the relative entropy
  (predictedmotif.txt vs motif.txt), the site overlap (predictedsites.txt vs
  sites.txt) and reads the information content / running time (ic.txt,
  runtime.txt). One progress line is printed per sub-directory. The collected
  rows are written to the 'Metrics' sheet of the existing workbook
  `<result_root>/<dataset_name>/<dataset_name>.xlsx`; other sheets in that
  workbook are kept and an existing 'Metrics' sheet is replaced.

  Args:
    dataset_name: name of the dataset directory, e.g. 'Dataset-2.4'.
    result_root: directory that contains the dataset directories.
    num_datasets: number of numbered sub-directories to evaluate.

  Returns:
    None.

  Raises:
    FileNotFoundError: if an input file or the target workbook is missing.
  """
  print('Evaluating', dataset_name)

  result_dir = result_root + '/' + dataset_name

  metrics = []

  for dataset_no in range(1, num_datasets + 1):
    data_dir = result_dir + '/' + str(dataset_no)

    entropy = calc_relative_entropy(data_dir + '/predictedmotif.txt',
                                    data_dir + '/motif.txt')

    overlap = calc_site_overlap(data_dir + '/predictedsites.txt',
                                data_dir + '/sites.txt')

    ic, rt = calc_info_content_runtime(data_dir + '/ic.txt',
                                       data_dir + '/runtime.txt')

    metrics.append([dataset_no, entropy, overlap, ic, rt])

    print('dataset_no', dataset_no, 'entropy', entropy, 'overlap', overlap, 'ic', ic, 'rt', rt)

  metric_df = pd.DataFrame(data=metrics, columns=['dataset_no', 'entropy', 'overlap', 'info_cont', 'runtime'])

  workbook_path = result_dir + '/' + dataset_name + '.xlsx'
  # Append mode loads the existing workbook so its other sheets survive.
  # Assigning `writer.book` and calling `writer.save()` no longer work on
  # current pandas; the context manager saves and closes the file on exit.
  with pd.ExcelWriter(workbook_path, engine='openpyxl', mode='a',
                      if_sheet_exists='replace') as writer:
    metric_df.to_excel(writer, sheet_name='Metrics', index=False)

In [None]:
# Dataset families to evaluate in this run. The commented-out list below is an
# alternative selection; swap the two assignments to evaluate those instead.
#dataset_names = ['Dataset-1.1', 'Dataset-1.2', 'Dataset-1.3', 'Dataset-1.5']
dataset_names = ['Dataset-2.4']
# evaluate() prints per-dataset metrics and writes them to the dataset's workbook.
for dataset_name in dataset_names:
  evaluate(dataset_name)

# IC Correlation