diff --git a/trs2txt.py b/trs2txt.py index 4bd194b..58a6e3a 100644 --- a/trs2txt.py +++ b/trs2txt.py @@ -6,8 +6,9 @@ import glob import datetime import re +import io -config = open('trs2txt.cfg', 'r') # open the config file located in the same directory +config = io.open('trs2txt.cfg', 'r') # open the config file located in the same directory ref = '' # create a blank string to store the 'ref' tag from the config file for processing tbeg = '' # create a blank string to store the 'tbeg' tag from the config file for processing @@ -41,7 +42,7 @@ ver += line[29:-1]+' ' # append it to the 'text' string if line[0:22] == 'text for translation: ': # read the tag that identifies the translation subtitles trans += line[22:-1]+' ' # append it to the 'trans' string - + filenames = [] # create an empty list called 'filenames' to keep track of the .txt files in the directory for index, file in enumerate(glob.glob("*.trs")): # use glob to create an enumerated list of the .txt files in the directory @@ -51,9 +52,9 @@ trsfile = open(infile,'r') # Open each .trs file in 'read' mode textfile = open(str(infile[0:-3])+'txt','w') # create a corresponding .txt file in 'write' mode to store the values we want from the .trs file - + count = 0 # create a count value to keep track of lists - + timecodes = [] # create a list value to keep track of timecodes speaker = [] # create a list value to keep track of speakers speakturn = '' # create a string value to keep track of speaker turns @@ -62,78 +63,99 @@ lines = [] # create a list value to keep track of text lines textfile.write(str('\\_sh v3.0 400 ELAN\n\\_DateStampHasFourDigitYear\n\n')) # write the header of the Toolbox file - - for s in trsfile: # get all the lines we want from the .trs file and write them into a corresponding (new) .txt file + + for line in trsfile: # get all the lines we want from the .trs file and write them into a corresponding (new) .txt file try: # these 'try' loops are basically to ensure that if there are any lines that don't exist in the .trs file, they get ignored. Without these loops, those .trs files where different speakers weren't annotated would break the program. - if audiostart in s: # get the filename for the audio - result = re.search('%s(.*)%s' % (audiostart, audioend), s).group(1) + if audiostart in line: # get the filename for the audio + result = re.search('%s(.*)%s' % (audiostart, audioend), line).group(1) audioname = result.replace(' ', '_') # replace spaces in the audio filename with underscores textfile.write('\\id '+result+'\n') # write the \id of the file using the audio filename except: pass try: - if section in s: # get the endtime of the sound file - complete = re.search('%s(.*)%s' % (endtime, endings), s).group(1) # story it in the 'complete' variable + if section in line: # get the endtime of the sound file + complete = re.search('%s(.*)%s' % (endtime, endings), line).group(1) # story it in the 'complete' variable except: pass try: - if turnbeg+starttime in s: # get the first speaker's turn - spone = re.search('%s(.*)%s' % (starttime, endings+' '), s).group(1) # get the start of the speaker's turn + if turnbeg+starttime in line: # get the first speaker's turn + spone = re.search('%s(.*)%s' % (starttime, endings+' '), line).group(1) # get the start of the speaker's turn speakvalstart.append(spone) # store it in a list - sptwo = re.search('%s(.*)%s' % (endtime, endings+' '), s).group(1) # get the end of the speaker's turn + sptwo = re.search('%s(.*)%s' % (endtime, endings+' '), line).group(1) # get the end of the speaker's turn speakvalend.append(sptwo) # store it in another list - speak = re.search('%s(.*)%s' % (speakstart, endings+'>\n'), s).group(1) # get the name of the speaker + speak = re.search('%s(.*)%s' % (speakstart, endings+'>\n'), line).group(1) # get the name of the speaker speakturn = speak # set the current value of the string variable 'speakturn' to the speaker name except: pass try: - if turnbeg+speakstart in s: # get a non-first speaker's turn - speak = re.search('%s(.*)%s' % (speakstart, endings+' start'), s).group(1) # get the name of the speaker + if turnbeg+speakstart in line: # get a non-first speaker's turn + speak = re.search('%s(.*)%s' % (speakstart, endings+' start'), line).group(1) # get the name of the speaker speakturn = speak # set the current value of the string variable 'speakturn' to the speaker name - spone = re.search('%s(.*)%s' % (starttime, endings+' '), s).group(1) # get the start of the speaker's turn + spone = re.search('%s(.*)%s' % (starttime, endings+' '), line).group(1) # get the start of the speaker's turn speakvalstart.append(spone) # store it in a list - sptwo = re.search('%s(.*)%s' % (endtime, endings), s).group(1) # get the end of the speaker's turn + sptwo = re.search('%s(.*)%s' % (endtime, endings), line).group(1) # get the end of the speaker's turn speakvalend.append(sptwo) # store it in another list except: pass try: - if '