In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime; the output directory used by
# the rest of this notebook lives under '/content/drive/My Drive/...'.
drive.mount('/content/drive')

In [None]:
from typing import Sequence
import sys
import os
from random import randrange, shuffle
import numpy as np
from argparse import ArgumentParser, ArgumentError
import pandas as pd
import shutil

In [None]:
# Name of this dataset collection; also reused as the base name of the
# summary spreadsheet written at the end of the run.
working_dir_name = 'Dataset-2'
data_dir = '/content/drive/My Drive/CS466/Project/Data/' + working_dir_name

# Always start from an empty output directory: anything a previous run left
# at this path is deleted before the directory is recreated.
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
os.mkdir(data_dir)

# One row of parameters per generated dataset; main() appends to it.
array = []

In [None]:
#Step 2: Generate SC random sequences over {A, C, G, T}, each of length SL.
def generateScSequences(SC, SL):
  """Return a list of SC random nucleotide strings, each SL characters long.

  Every character is drawn independently with ``randrange(0, 4)`` and used as
  an index into ['A', 'C', 'G', 'T'], so the four letters are equally likely.
  """
  alphabet = ['A', 'C', 'G', 'T']
  sequences = []
  for _ in range(SC):
    letters = [alphabet[randrange(0, 4)] for _ in range(SL)]
    sequences.append(''.join(letters))
  return sequences

#Step 3: Generate SC binding sites from a consensus motif, one per sequence.
def generateMotifs(randomMotif, SC, preferred_nuc_prob, remaining_nuc_prob):
  """Sample SC binding sites around the consensus string ``randomMotif``.

  For each position of ``randomMotif`` a distribution over A, C, G, T is built
  in which the consensus letter gets ``preferred_nuc_prob`` and each of the
  other three letters gets ``remaining_nuc_prob``. SC letters are then drawn
  from that distribution, one ``np.random.choice`` call per letter.

  Returns a pair ``(motifs_seeded, motifs_calc)``:
    * motifs_seeded - one string per motif position, holding the SC letters
      drawn for that position (i.e. the columns of the site matrix);
    * motifs_calc - one string per binding site (SC strings, each as long as
      ``randomMotif``), obtained by transposing ``motifs_seeded``.
  """
  nucleotides = ['A', 'C', 'G', 'T']
  # Index of the favoured letter in `nucleotides`. Any consensus character
  # other than A, C or G falls through to T (index 3).
  preferred_slot = {'A': 0, 'C': 1, 'G': 2}

  motifs_seeded = []
  for consensus_letter in randomMotif:
    weights = [remaining_nuc_prob] * 4
    weights[preferred_slot.get(consensus_letter, 3)] = preferred_nuc_prob
    drawn = [str(np.random.choice(nucleotides, p=weights)) for _ in range(SC)]
    motifs_seeded.append(''.join(drawn))

  # Transpose: site k is made of the k-th letter of every position column.
  motifs_calc = [
      ''.join(column[site] for column in motifs_seeded)
      for site in range(SC)
  ]
  return motifs_seeded, motifs_calc

  
#Step 8: write the per-position nucleotide counts of the binding sites.
def calculatePWM(generatedMotifs, ML, dataset_dir):
  """Write the count matrix of ``generatedMotifs`` and the motif length to disk.

  Two files are created in ``dataset_dir``:
    * motif.txt - a header line ``>MOTIF1 <ML>``, then one line per motif
      position (ML lines) holding the number of sites with A, C, G and T at
      that position, space separated and in that order, and finally a
      closing ``<`` with no trailing newline;
    * motiflength.txt - just ``ML``.

  Args:
    generatedMotifs: sequence of binding-site strings; each must have at
      least ML characters, otherwise indexing raises IndexError.
    ML: number of motif positions to tabulate.
    dataset_dir: existing directory the two files are written into.
  """
  # `with` guarantees the handles are closed even if a write fails.
  with open(dataset_dir + "/motif.txt", "w") as motif_file:
    motif_file.write('>MOTIF1 ' + str(ML) + '\n')
    for position in range(ML):
      column = [site[position] for site in generatedMotifs]
      # Counts are emitted in the fixed order A, C, G, T.
      counts = [column.count(nucleotide) for nucleotide in 'ACGT']
      motif_file.write(' '.join(map(str, counts)) + '\n')
    motif_file.write('<')

  with open(dataset_dir + '/motiflength.txt', 'w') as motif_file_len:
    motif_file_len.write(str(ML))
  
def plant_motif_binding(sampleSites, sequenceList, delta, motifOG, dataset_dir):
  """Plant one binding site into every sequence and record where it went.

  The sites are paired with the sequences through a random permutation, so
  sequence i receives ``sampleSites[perm[i]]``. The site overwrites the
  characters of the sequence starting at a position drawn with
  ``randrange(0, delta)``; the sequence length is unchanged as long as the
  site fits.

  Files written into ``dataset_dir``:
    * sites.txt - the start positions, comma separated, in sequence order;
    * debug.txt - a two-line explanation, the (start, end, site) tuples
      concatenated on one line, then ``motifOG``;
    * sequences.fa - the modified sequences as ``>seq<i>`` / sequence pairs.

  Args:
    sampleSites: binding-site strings; needs at least as many entries as
      ``sequenceList``.
    sequenceList: list of sequence strings. It is modified in place.
    delta: exclusive upper bound for the start position.
    motifOG: the consensus motif, appended to debug.txt.
    dataset_dir: existing directory the files are written into.

  Returns:
    ``(sequenceList, start_loc_hist, start_and_end_loc_motif_used_hist)``:
    the (mutated) input list, the list of start positions, and the list of
    ``(start, end, site)`` tuples.

  Raises:
    ValueError: from ``randrange`` when ``delta <= 0`` and there is at least
      one sequence.
  """
  start_loc_hist = []
  start_and_end_loc_motif_used_hist = []

  # Random one-to-one assignment of sites to sequences.
  choiceList = list(range(len(sequenceList)))
  shuffle(choiceList)

  for i, site_index in enumerate(choiceList):
    motif_to_bind = sampleSites[site_index]

    # NOTE(review): the caller passes delta = SL - ML, so the last position
    # at which the site would still fit (start == delta) is never drawn and
    # SL == ML fails; confirm whether randrange(0, delta + 1) was intended.
    start_loc = randrange(0, delta)
    end_loc = start_loc + len(motif_to_bind)
    start_loc_hist.append(start_loc)
    start_and_end_loc_motif_used_hist.append((start_loc, end_loc, motif_to_bind))
    sequenceList[i] = sequenceList[i][:start_loc] + motif_to_bind + sequenceList[i][end_loc:]

  # Every file is opened with `with` so it is flushed and closed before the
  # function returns, including sequences.fa.
  with open(dataset_dir + '/sites.txt', 'w') as sites_file:
    sites_file.write(",".join(map(str, start_loc_hist)))

  with open(dataset_dir + '/debug.txt', 'w') as debug_file:
    debug_file.write('Please find the format of this file (<START_POS>, <END_POS>, <MOTIF>), showing all planted motifs.\n'+
                     'At the end, is the real motif, which is used to generate these other motifs via sampling with approriate probability\n')
    debug_file.write(''.join(map(str, start_and_end_loc_motif_used_hist)) + '\n')
    debug_file.write(motifOG)

  with open(dataset_dir + '/sequences.fa', 'w') as sequences_file:
    for count, sequence in enumerate(sequenceList):
      sequences_file.write('>seq' + str(count) + '\n' + sequence + '\n')

  return sequenceList, start_loc_hist, start_and_end_loc_motif_used_hist

def main(ICPC, ML, SC, SL, turn, dataset_no):
  """Generate one synthetic motif-finding dataset under data_dir/<dataset_no>.

  Steps: record the parameters, create the dataset directory, write the
  parameter files, generate SC random sequences of length SL, draw a random
  consensus motif of length ML, sample SC binding sites around it, plant one
  site per sequence and finally write the count matrix of the sites.

  Args:
    ICPC: information content per column; must be one of 1, 1.5 or 2 (the
      keys of the ICPC -> probability table below), otherwise the lookup
      raises KeyError.
    ML: motif length.
    SC: number of sequences (and of binding sites).
    SL: length of each sequence.
    turn: replicate number; only recorded in the module-level ``array``.
    dataset_no: identifier used as the dataset's directory name.

  Side effects: appends a row to the module-level ``array``, prints the
  parameters, and creates ``data_dir/<dataset_no>`` (``os.mkdir`` raises if
  it already exists) together with the files written by this function and
  by plant_motif_binding / calculatePWM.
  """
  array.append([dataset_no, ICPC, ML, SC, SL, turn])
  print('Dataset No.', dataset_no, 'ICPC', ICPC, 'ML', ML, 'SC', SC, 'SL', SL)

  dataset_dir = data_dir + '/' + str(dataset_no)
  os.mkdir(dataset_dir)

  # Human-readable summary, one "<name> <value>" pair per line. The first
  # line now gets its newline too, so it no longer runs into the ICPC line.
  with open(dataset_dir + '/params.txt', 'w') as params_file:
    params_file.write('Dataset No. ' + str(dataset_no) + '\n')
    params_file.write('ICPC ' + str(ICPC) + '\n')
    params_file.write('ML ' + str(ML) + '\n')
    params_file.write('SC ' + str(SC) + '\n')
    params_file.write('SL ' + str(SL) + '\n')

  # One single-value file per parameter: ICPC.txt, ML.txt, SC.txt.
  for param_name, param_value in (('ICPC', ICPC), ('ML', ML), ('SC', SC)):
    with open(dataset_dir + '/' + param_name + '.txt', 'w') as param_file:
      param_file.write(str(param_value))

  #Step 2: random background sequences.
  sequenceList = generateScSequences(SC, SL)

  #Step 3: probability of the preferred nucleotide at each motif position for
  #the requested ICPC; the other three nucleotides share the remainder equally.
  ICPC_TO_P_MAPPING = {1: .8105, 1.5: .9245, 2: 1}
  preferred_nuc_prob = ICPC_TO_P_MAPPING[ICPC]
  remaining_nuc_prob = (1 - preferred_nuc_prob) / 3.0

  #Random consensus motif of length ML, uniform over A, C, G, T.
  randomMotif = generateScSequences(1, ML)[0]

  #Step 4: sample SC binding sites (each of length ML) around the consensus.
  _, generatedMotifs = generateMotifs(randomMotif, SC, preferred_nuc_prob, remaining_nuc_prob)

  #Step 5: plant one binding site in each sequence; this also writes
  #sites.txt, debug.txt and sequences.fa.
  plant_motif_binding(generatedMotifs, sequenceList, SL - ML, randomMotif, dataset_dir)

  #Step 8: nucleotide counts per motif position (columns A C G T), written to
  #motif.txt together with motiflength.txt.
  calculatePWM(generatedMotifs, ML, dataset_dir)



#Benchmark driver: the parameters are fixed below (nothing is read from the
#command line). Each configuration is generated 10 times.
if __name__ == "__main__":

  default_ICPC = 2
  default_ML = 8
  default_SC = 10
  default_SL = 500

  ICPC_choice = [1, 1.5]
  ML_choice = [6, 7]
  SC_choice = [5, 20]

  # The default configuration first, then one parameter at a time is moved
  # away from its default while the others stay fixed: ICPC, then ML, then SC.
  configurations = [(default_ICPC, default_ML, default_SC)]
  configurations += [(choice, default_ML, default_SC)
                     for choice in ICPC_choice if choice != default_ICPC]
  configurations += [(default_ICPC, choice, default_SC)
                     for choice in ML_choice if choice != default_ML]
  configurations += [(default_ICPC, default_ML, choice)
                     for choice in SC_choice if choice != default_SC]

  # Datasets are numbered consecutively from 1 across all configurations.
  dataset_no = 1
  for icpc, ml, sc in configurations:
    for i in range(1, 11):
      main(icpc, ml, sc, default_SL, i, dataset_no)
      dataset_no += 1

  # Summary spreadsheet with one row per generated dataset. The context
  # manager saves and closes the workbook; ExcelWriter.save() no longer
  # exists in pandas 2.x.
  df = pd.DataFrame(array, columns=['Dataset_No.', 'ICPC', 'ML', 'SC', 'SL', 'Turn'])
  with pd.ExcelWriter(data_dir + '/' + working_dir_name + '.xlsx') as writer:
    df.to_excel(writer, sheet_name='Params', index=False)