In [1]:
# Primitive automated Stata dofile I-O mapper
# Code by Isaac Liu with help from M Z on Stack Overflow

# Currently supported

# Input detectors
# using
# use
# import delimited
# insheet

# Output detectors
# save
# outsheet

# Intermediates
# Check if the file is in the input list and output list

# Automatically skips lines beginning with *

# Future features:
# temp files?
# skip /* ... */ comment blocks

In [2]:
# Set path to common directory of dofiles
# (it is prepended to each filename as-is, so include the trailing separator)
path = ''

# Set the filesDoc to location of a .txt containing files to analyze, one per line
# Use only the filename.
filesDoc = ''
with open(filesDoc, 'r') as f:
    dfs = f.readlines()

# Strip surrounding whitespace (including the newline) and skip blank lines,
# so an empty line in the list does not turn into an empty filename
# that fails when the dofiles are opened.
dofile_names = [entry.strip() for entry in dfs if entry.strip()]

In [3]:
# Set up the structure for dofile objects (a collection of lists)
class dofile:
    """Record for one Stata dofile and the file references detected in it.

    Every instance gets its own fresh lists, so appending to one record
    never affects another.

    Args:
        name: Filename of the dofile. Optional; defaults to "" so existing
            code that creates the object first and assigns ``name``
            afterwards keeps working.
    """

    def __init__(self, name=""):
        self.name = name          # filename of the dofile
        self.lines = []           # raw lines read from the dofile
        self.inputs = []          # files detected as inputs
        self.outputs = []         # files detected as outputs
        self.intermediates = []   # files found in both inputs and outputs
    
# Basic classes syntax: https://www.learnpython.org/en/Classes_and_Objects

In [4]:
# Code structure note: for the purposes of debugging, I am making many small loops.
# I understand that having one big loop probably runs faster.

In [5]:
# Build one empty dofile record per listed filename, in the listed order
dofiles = []
for script_name in dofile_names:
    record = dofile()
    record.name = script_name
    dofiles += [record]

In [6]:
# Read in the files, line by line.
# The loop variable is not called `dofile` on purpose: that would rebind the
# class name at notebook scope and break re-running the object-creation cell.
for do_file in dofiles:
    with open(path + do_file.name, 'r') as f:
        do_file.lines = f.readlines()

In [7]:
# Clean out all commented lines (lines whose first character is *)
for do_file in dofiles:
    # Build a new list instead of calling remove() while iterating: removing
    # during iteration skips the element after each removal, so consecutive
    # comment lines were left behind.
    do_file.lines = [line for line in do_file.lines if not line.startswith('*')]

# At the moment it appears far harder to detect comment blocks w/ /* notation, will fix later.

In [8]:
# Prep for RegEx detection
# Guide: https://developers.google.com/edu/python/regular-expressions

import re

In [9]:
# I structure the detected i/o as tuples, in case we later want to go back and retrieve the kind of input,
# i.e. using versus use, etc.

In [10]:
# Input detectors

In [11]:
for do_file in dofiles:
    for line in do_file.lines:
        # using
        # `outsheet ... using file` names a file being written, so the file
        # after `using` on such a line must not be recorded as an input.
        if re.search(r'\boutsheet\b', line):
            continue
        # \b keeps words such as "causing" from matching; \s+ allows more than
        # one space after the keyword, like the other detectors.
        using_tups = re.findall(r'\b(using)\s+([^\s,]+)', line)
        for match in using_tups:
            # match is (keyword, filename); keep each filename only once
            if match[1] not in do_file.inputs:
                do_file.inputs.append(match[1])

In [12]:
for do_file in dofiles:
    for line in do_file.lines:
        # use
        # \b requires `use` to start a word, so text like "house price" or
        # "because x" no longer produces a false input.
        use_tups = re.findall(r'\b(use)\s+([^\s,]+)', line)
        for match in use_tups:
            # match is (keyword, filename); keep each filename only once
            if match[1] not in do_file.inputs:
                do_file.inputs.append(match[1])

In [13]:
for do_file in dofiles:
    for line in do_file.lines:
        # import delimited
        # The optional `using` keyword is skipped so that the filename, not the
        # literal word "using", is captured.
        id_tups = re.findall(r'\b(import\s+delimited)\s+(?:using\s+)?([^\s,]+)', line)
        for match in id_tups:
            # match is (keyword, filename); keep each filename only once
            if match[1] not in do_file.inputs:
                do_file.inputs.append(match[1])

In [14]:
for do_file in dofiles:
    for line in do_file.lines:
        # insheet
        # The filename follows `using` (optionally after a variable list), so
        # capture the token after `using` rather than the token after `insheet`.
        is_tups = re.findall(r'\b(insheet)\b.*?\busing\s+([^\s,]+)', line)
        for match in is_tups:
            # match is (keyword, filename); keep each filename only once
            if match[1] not in do_file.inputs:
                do_file.inputs.append(match[1])

In [15]:
# Output detectors

In [16]:
for do_file in dofiles:
    for line in do_file.lines:
        # save
        # \b requires `save` to start a word so it is not matched inside a
        # longer token.
        save_tups = re.findall(r'\b(save)\s+([^\s,]+)', line)
        for match in save_tups:
            # match is (keyword, filename); keep each filename only once
            if match[1] not in do_file.outputs:
                do_file.outputs.append(match[1])

In [17]:
for do_file in dofiles:
    for line in do_file.lines:
        # outsheet
        # The filename follows `using` (optionally after a variable list), so
        # capture the token after `using` rather than the token after `outsheet`.
        os_tups = re.findall(r'\b(outsheet)\b.*?\busing\s+([^\s,]+)', line)
        for match in os_tups:
            # match is (keyword, filename); keep each filename only once
            if match[1] not in do_file.outputs:
                do_file.outputs.append(match[1])

In [20]:
# Handle intermediates- file is in the input list and output list
for do_file in dofiles:
    # Collect the shared entries first. Removing from inputs while looping over
    # it skips the entry after each removal, so some intermediates were missed.
    shared = [item for item in do_file.inputs if item in do_file.outputs]
    for item in shared:
        if item not in do_file.intermediates:
            do_file.intermediates.append(item)
    # Intermediates are reported separately, so drop them from both lists.
    do_file.inputs = [item for item in do_file.inputs if item not in shared]
    do_file.outputs = [item for item in do_file.outputs if item not in shared]

In [21]:
# Write the overall list of inputs and outputs to a .txt file
# Every line starts with "* ", matching the prefix used for the header lines.
# Loop names avoid `dofile` and `input` so the class and the builtin stay usable.
with open('dropbox_io-list.txt', 'w') as f:
    f.write("* Automatically generated Stata dofile I-O mappings\n")
    f.write("* code by Isaac Liu with help from M Z on Stack Overflow\n")
    f.write("\n")
    for do_file in dofiles:
        # One section per dofile: name, then inputs, outputs and intermediates
        f.write("* " + do_file.name + "\n")
        f.write("* " + "Inputs:\n")
        for in_file in do_file.inputs:
            f.write("* " + in_file + "\n")
        f.write("* " + "Outputs:\n")
        for out_file in do_file.outputs:
            f.write("* " + out_file + "\n")
        f.write("* " + "Intermediates:\n")
        for mid_file in do_file.intermediates:
            f.write("* " + mid_file + "\n")
        f.write("\n")