# Generate array data

In [1]:
from datetime import date
import sys
import random
import os
import re

# Run configuration and the YAML front matter placed at the top of the Rmd file
__author__ = "Jurre Hageman and Michiel Noback"
title = "Loading data to dataframes"
today = date.today()
number_of_files = 25  # number of data files main() generates
number_of_lines = 50  # data rows main() writes to each file
files_dir = "files"
rmd_file = "rcode.Rmd"
rmd_file_stripped = "rcode_stripped.Rmd"
helper_path = os.path.join("helper_files", "body.txt")

# Front matter: title, author and date lines fenced by "---" markers
head = "\n".join((
    "---",
    'title: "{}"'.format(title),
    'author: "{}"'.format(__author__),
    'date: "{}"'.format(today),
    "---",
))


# Load the fixed text that is written right after the front matter
with open(helper_path) as body_handle:
    body = body_handle.read()

# Template for one R code chunk: an opening ```{r} fence, twelve lines of
# code (one format slot per line) and a closing fence.
R_block = "```{{r}}\n" + "{}\n" * 12 + "```\n"

# Opening lines of an R snippet that reads one file from the "files"
# directory; the single format slot receives the file's base name. The
# read.table( call is deliberately left open at the end of the template.
# NOTE(review): the slot is suffixed with ".Rmd" while the generated data files
# end in ".txt" -- confirm which is intended before using this template.
R_code_head = \
"""
my_dir <- "files"
my_file <- "{}.Rmd"
my_path <- paste0(my_dir, "/", my_file)
my_data <- read.table(my_path,

"""

In [2]:
# Functions that generate the file values
def get_comment_string():
    """Return a random block of comment lines, or None for "no comment".

    Four candidate blocks are built; each is a single comment line, prefixed
    with "@", "#" or "$", repeated one to five times. The result is drawn
    from those four candidates plus None.
    """
    candidates = []
    for _ in range(4):
        marker = random.choice(["@", "#", "$"])
        repeats = random.randint(1, 5)
        candidates.append("{}THIS IS A COMMENT STRING\n".format(marker) * repeats)
    candidates.append(None)
    return random.choice(candidates)

def get_na_string():
    """Pick at random one of the five markers used for missing values."""
    return random.choice(["ND", "-", "no data", "?", "-1"])

def get_dec(field_sep):
    """Choose the decimal mark for a file whose fields are split by *field_sep*.

    A comma-separated file always gets "."; for any other separator the mark
    is picked at random from "." and ",".
    """
    if field_sep != ",":
        return random.choice((".", ","))
    # presumably a decimal comma would clash with the comma field separator
    return "."

def get_header():
    """Return the list of column names, or None for "no header line".

    The outcome is drawn from four copies of the column list plus None.
    The names come from splitting one literal on ",", so two of them keep a
    leading space and the last one carries the trailing newline.
    """
    columns = "transcript ID,fold induction,protein ID, protein length, family,location\n".split(",")
    options = [list(columns) for _ in range(4)]
    options.append(None)
    return random.choice(options)

def get_location():
    """Return one randomly selected location name."""
    return random.choice("nucleus,cytosol,mitochondria,peroxisome,ER,golgi".split(","))

def get_NP_num():
    """Return a random protein ID: "NP_0" followed by five random digits."""
    digits = "".join(str(random.randint(0, 9)) for _ in range(5))
    return "NP_0" + digits

def get_protein_length():
    """Return a random integer length between 300 and 1500, both inclusive."""
    shortest, longest = 300, 1500
    return random.randint(shortest, longest)
    
def get_family():
    """Return one randomly selected family name."""
    return random.choice("nuclear receptor,TGF-B,MAPK,TNF,JUN,FOS,TIMP,P53,LIP,KIN,TER,KAR".split(","))

def get_NM_num():
    """Return a random transcript ID: "NM_0" followed by five random digits."""
    digits = "".join(str(random.randint(0, 9)) for _ in range(5))
    return "NM_0" + digits

def get_fold_induction(dec):
    """Return a random fold-induction value as text.

    The value is an integer part from 1 to 99, the decimal mark *dec*, and a
    single fractional digit from 1 to 9.
    """
    whole = str(random.randint(1, 99))
    tenths = str(random.randint(1, 9))
    return "".join((whole, dec, tenths))

def get_field_sep():
    """Pick the field separator at random: comma, semicolon or tab."""
    return random.choice([",", ";", "\t"])

def generate_line(dec):
    """Build one data row as a list of six strings.

    The fields are, in order: transcript ID, fold induction (using decimal
    mark *dec*), protein ID, protein length, family and location.
    """
    fields = (
        get_NM_num(),
        get_fold_induction(dec),
        get_NP_num(),
        get_protein_length(),
        get_family(),
        get_location(),
    )
    # the protein length is an int; everything is returned as text
    return list(map(str, fields))

def add_na_values(line, na):
    """Replace one random field of *line* with *na*, one time in five.

    Args:
        line: list of field values; it is modified in place.
        na: the missing-value marker to insert.

    Returns:
        The same list object that was passed in.
    """
    # Pick the position directly: looking a chosen value up again with
    # list.index() always hits the first duplicate, and fails on an empty row.
    if line and random.randint(1, 5) == 1:
        line[random.randrange(len(line))] = na
    return line
    


In [3]:
# File IO
def get_format_file_name(file_num, num_of_files):
    """Return the name of the data file with zero-based index *file_num*.

    The number in the name is one-based and zero-padded to the number of
    digits in *num_of_files*.
    """
    width = len(str(num_of_files))
    padded = str(file_num + 1).zfill(width)
    return "file{}.txt".format(padded)

def write_file(rows, file_name):
    """Write the strings in *rows* to *file_name*, replacing any existing file.

    Nothing is added between rows; each row supplies its own line ending.
    """
    with open(file_name, "w") as out:
        out.writelines(rows)

def generate_markdown_template(rmd_path, head, body):
    """Append *head* and then *body* to the file at *rmd_path*.

    The file is opened in append mode, so existing content is kept.
    """
    with open(rmd_path, "a") as rmd:
        rmd.writelines((head, body))
               
def generate_markdown_code(rmd_path, filename, comm_flag, comm_start, header_flag, field_sep, dec, na):
    """Append a section with the R code that reads *filename* to the Rmd file.

    The section is a "## <filename>" heading followed by one R chunk built
    from the R_block template. The chunk calls read.table() with arguments
    that describe the file: comment character (only when *comm_flag* is
    set), header flag, field separator, decimal mark and NA string.
    """
    comment_arg = 'comment.char = "{}",'.format(comm_start) if comm_flag else ""
    header_arg = 'header = T,' if header_flag else 'header = F,'
    # a tab must appear in the R source as the two characters \t
    sep_text = "\\t" if field_sep == "\t" else field_sep
    chunk_lines = (
        'my_dir <- "files"',
        'my_file <- "{}"'.format(filename),
        'my_path <- paste0(my_dir, "/", my_file)',
        'my_data <- read.table(my_path,',
        comment_arg,
        header_arg,
        'sep = "{}",'.format(sep_text),
        'dec = "{}",'.format(dec),
        'na.strings = "{}",'.format(na),
        'as.is = c(1, 3),',
        ')',
        'my_data',
    )
    with open(rmd_path, "a") as rmd:
        rmd.write("\n## {}\n\n".format(filename))
        rmd.write(R_block.format(*chunk_lines))

        
def strip_rmarkdown(input_file, output_file):
    """Write a copy of an R Markdown file with its code chunks blanked out.

    Each fenced chunk is replaced by a "#<YOUR CODE HERE>" placeholder,
    except chunks whose first line is "##KEEPIT##"; those are kept, with the
    marker line removed. For every heading of the form
    "#### Question 1 (10 points)" in the input, the question number and its
    points are printed.

    Args:
        input_file: path of the R Markdown file to read.
        output_file: path of the stripped copy; overwritten if it exists.
    """
    # Context managers close every handle, so the output is flushed on return.
    with open(input_file, 'r') as input_stream:
        original = input_stream.read()

    ## first mark KEEPIT chunks so the next substitution skips them
    data = re.sub(r'```(\{[^`]+?\})\n##KEEPIT##([^`]+?)```',
                  r'AAAA```\1\2```BBBB',
                  original,
                  flags=re.DOTALL)
    ## replace all other chunks with placeholders
    data = re.sub(r'[^A]```(\{.+?\}).+?```[^B]',
                  r'\n```\1\n#<YOUR CODE HERE>\n```\n',
                  data,
                  flags=re.DOTALL)
    ## finally remove markers of first round
    data = re.sub(r'A+```([^`]+?)```B+',
                  r'\n```\1```',
                  data,
                  flags=re.DOTALL)

    with open(output_file, "w") as output_stream:
        output_stream.write(data)

    ## report points per question, e.g. "#### Question 1 (10 points)"
    ## NOTE: the pattern requires a leading "#", so bold-style lines such as
    ## "**Question 4 a (2 points)**" are not reported.
    for line in original.split("\n"):
        m_q = re.search(r"(#+) *([Qq]uestion *(\d+)) *\((\d+)", line)
        if m_q:
            question = m_q.group(3)
            points = int(m_q.group(4))
            print("Question:", question, "-- points:", points)
    
    


In [4]:
# Main
def main():
    """Generate the data files and the full and stripped R Markdown files.

    For each of the number_of_files files, random settings are drawn (NA
    marker, optional comment block, field separator, decimal mark, optional
    header), the data file is written into files_dir, and a matching R code
    section is appended to rmd_file. The stripped copy is produced once at
    the end.
    """
    os.makedirs(files_dir, exist_ok=True)
    # Delete a previous Rmd file if it exists: the writers below append to it
    if os.path.exists(rmd_file):
        os.remove(rmd_file)
    generate_markdown_template(rmd_file, head, body)
    for file_num in range(number_of_files):
        rows = []
        na = get_na_string()
        comm = get_comment_string()
        field_sep = get_field_sep()
        dec = get_dec(field_sep)
        comm_flag = bool(comm)
        # the first character of the comment block is its comment marker
        comm_start = comm[0] if comm_flag else None
        if comm_flag:
            rows.append(comm)
        header = get_header()
        header_flag = bool(header)
        if header_flag:
            rows.append(field_sep.join(header))
        for _ in range(number_of_lines):
            line = generate_line(dec)
            rows.append(field_sep.join(add_na_values(line, na)) + "\n")
        filename = get_format_file_name(file_num, number_of_files)
        write_file(rows, os.path.join(files_dir, filename))
        generate_markdown_code(rmd_file, filename, comm_flag, comm_start, header_flag, field_sep, dec, na)
    # Strip once, after the last section is appended. Stripping inside the
    # loop re-processed the whole, growing Rmd file for every data file and
    # overwrote the same output each time.
    strip_rmarkdown(rmd_file, rmd_file_stripped)
    print("done")


# Run the generator only when executed directly (script or notebook cell),
# not when this code is imported as a module.
if __name__ == "__main__":
    main()

done
