# Overview

## The purpose of this project was to use Python's built-in string processing methods and fundamental programming techniques to extract, process, transform, index, and load target units of text from a semi-structured data source (Dracula by Bram Stoker) into useful data stacks in the form of outfiles stored locally. Additionally, a file name was built and formatted for each outfile by leveraging the semi-structured nature of the table of contents lines.




In [1]:
# Read the whole source text into memory as a single string.
# The `with` block closes the file handle even if read() raises.
with open("dracula.txt", "rt") as fhand:
    my_file_string = fhand.read()

### Exploit Python's string processing methods and patterns in the data to identify start and stop markers and extract the target unit of data -- the Table of Contents -- for further processing

In [2]:
# Locate the table of contents: it starts at the "CONTENTS" heading and ends
# just past the first occurrence of "338" (presumably the page number of the
# final entry -- confirm against the source text).
# NOTE: str.find returns -1 when a marker is absent, which would silently
# produce a wrong slice.
toc_start_pos = my_file_string.find("CONTENTS")
toc_end_pos = my_file_string.find("338") + len("338")
toc_string = my_file_string[toc_start_pos:toc_end_pos]  # the "Table of Contents" slice

toc_list = toc_string.split("\n")  # one element per line of the table of contents

target_word_list = []  # building desired word list for use in outfile names
dirty_file_name_word_list = []  # chapter titles, still containing punctuation etc.
for line in toc_list:
    # First show the raw line exactly as it appears in the table of contents.
    print()
    print("line:\n" + line + "\n------\n")
    print()

    # str.strip() returns a new string, so the result must be rebound;
    # otherwise surrounding whitespace survives and the last-character
    # digit test below can miss lines with trailing whitespace.
    line = line.strip()
    print("printing stripped line:")
    print(line)  # stripped line

    # Target lines end with a page number; every other line is skipped.
    if line and line[-1].isdigit():
        mini_word_list = line.split()
        # Drop the trailing page number and rejoin the title words with
        # single spaces -> raw material for the file labels.
        dirty_file_name_word_list.append(" ".join(mini_word_list[:-1]))
        print("dirty_file_name_word_list:", dirty_file_name_word_list)
    else:
        print("not interested in this line")
print()

print()


line:
CONTENTS
------


printing stripped line:
CONTENTS
not interested in this line

line:

------


printing stripped line:

not interested in this line

line:

------


printing stripped line:

not interested in this line

line:
CHAPTER I
------


printing stripped line:
CHAPTER I
not interested in this line

line:
                                                                    Page
------


printing stripped line:
                                                                    Page
not interested in this line

line:

------


printing stripped line:

not interested in this line

line:
Jonathan Harker's Journal                                              1
------


printing stripped line:
Jonathan Harker's Journal                                              1
dirty_file_name_word_list: ["Jonathan Harker's Journal"]

line:

------


printing stripped line:

not interested in this line

line:
CHAPTER II
------


printing stripped line:
CHAPTER II
not interested in this line

### Build functions to preprocess, clean, and finalize the data, then iterate through the extracted data structure to create the set of finalized filenames for the processed data outfiles

In [4]:
import string  # import string module


def clean_line(line_str):
    """Return *line_str* with punctuation removed and spaces turned into underscores.

    Every character listed in ``string.punctuation`` is deleted in a single
    pass, then each space is replaced by "_" so the result can be used as
    part of a file name.

    Args:
        line_str: A "Table of Contents" title line.

    Returns:
        The cleaned string.
    """
    # A translation table with a deletion set removes all punctuation at once;
    # the space replacement then runs exactly once instead of once per
    # punctuation character.
    without_punctuation = line_str.translate(
        str.maketrans("", "", string.punctuation)
    )
    return without_punctuation.replace(" ", "_")


def dress_line(final_cleanline, chapter_number=None):
    """Build the outfile name for one chapter.

    Args:
        final_cleanline: Cleaned chapter title to embed in the file name.
        chapter_number: Chapter number to embed. When omitted, the
            module-level ``chap_count`` is used, so existing single-argument
            calls behave as before.

    Returns:
        A name of the form "Dracula-Chapter-<number>-<title>.txt".
    """
    if chapter_number is None:
        # Backward-compatible fallback to the counter maintained by the caller.
        chapter_number = chap_count
    final_file_name = "Dracula-Chapter-" + str(chapter_number) + "-" + final_cleanline + ".txt"
    return final_file_name

# Accumulates the finished outfile names, one per table-of-contents title.
the_fancied_file_name_list = []
# Chapter counter read by dress_line(); starts at 1 for the first chapter.
chap_count = 1

for file_name in dirty_file_name_word_list:
    print("1) file_name:", file_name)
    print("2) \n-----------\n")

    # Clean the raw title first, then wrap it into the final file name.
    cleaned_title = clean_line(file_name)
    the_fancied_file_name_list.append(dress_line(cleaned_title))

    print("3 ) the_fancied_file_name_list:", the_fancied_file_name_list)
    print("list now:", the_fancied_file_name_list)

    # Advance the counter so the next title gets the next chapter number.
    chap_count = chap_count + 1

print("4) the_fancied_file_name_list:", the_fancied_file_name_list)

1) file_name: Jonathan Harker's Journal
2) 
-----------

3 ) the_fancied_file_name_list: ['Dracula-Chapter-1-Jonathan_Harkers_Journal.txt']
list now: ['Dracula-Chapter-1-Jonathan_Harkers_Journal.txt']
1) file_name: Jonathan Harker's Journal
2) 
-----------

3 ) the_fancied_file_name_list: ['Dracula-Chapter-1-Jonathan_Harkers_Journal.txt', 'Dracula-Chapter-2-Jonathan_Harkers_Journal.txt']
list now: ['Dracula-Chapter-1-Jonathan_Harkers_Journal.txt', 'Dracula-Chapter-2-Jonathan_Harkers_Journal.txt']
1) file_name: Jonathan Harker's Journal
2) 
-----------

3 ) the_fancied_file_name_list: ['Dracula-Chapter-1-Jonathan_Harkers_Journal.txt', 'Dracula-Chapter-2-Jonathan_Harkers_Journal.txt', 'Dracula-Chapter-3-Jonathan_Harkers_Journal.txt']
list now: ['Dracula-Chapter-1-Jonathan_Harkers_Journal.txt', 'Dracula-Chapter-2-Jonathan_Harkers_Journal.txt', 'Dracula-Chapter-3-Jonathan_Harkers_Journal.txt']
1) file_name: Jonathan Harker's Journal
2) 
-----------

3 ) the_fancied_file_name_list: ['Dracul

## Identify unique string patterns to index and store markers from the beginning of the first chapter to the end of the last chapter

In [5]:
# Extract the large target data slice: everything from the start of the first
# chapter through "THE END".
#
# str.index is used instead of str.find so that a missing marker raises
# ValueError immediately; find would return -1 and silently yield a wrong slice.

# End of the table of contents (just past the first "338").
toc_end_pos = my_file_string.index("338") + len("338")

# The chapter text begins after the first "DRACULA" that follows the table
# of contents.
start_here = my_file_string.index("DRACULA", toc_end_pos) + len("DRACULA")

# Search from start_here so an earlier occurrence of "THE END" can never
# place the end of the slice before its start.
end_here = my_file_string.index("THE END", start_here) + len("THE END")

my_chap_substring = my_file_string[start_here:end_here]  # large data slice extracted -- chapters isolated as 1 unit





### Exploit string properties and methods in Python, along with patterns in the data, to create target subsets of the slice. Each chapter can now be indexed and accessed because it is stored as an element in a list.

In [6]:
# Report the boundaries of the chapter slice within the full text.
print("position to start the chap split:", start_here)
print("position to end the chap split:", end_here)

# Cut the slice at every "CHAPTER" marker; each list element is the text
# between two consecutive markers (the marker itself is removed by split).
my_chap_list = my_chap_substring.split(sep="CHAPTER")

position to start the chap split: 3904
position to end the chap split: 846083


In [None]:
# str.split removed the "CHAPTER" separator, so put it back in front of each
# fragment. Fragments no longer than the separator itself are skipped.
chap_list = [
    "CHAPTER" + chapter
    for chapter in my_chap_list
    if len(chapter) > len("CHAPTER")
]
chap_count = len(chap_list)  # number of chapter fragments kept

print(chap_list)
print(len(my_chap_list))

# Write each chapter to the outfile whose name sits at the same index.
for i, chapter_text in enumerate(chap_list):
    # Raises IndexError if there are more chapters than prepared file names.
    out_name = the_fancied_file_name_list[i]
    # The `with` block closes (and therefore flushes) each outfile as soon as
    # its chapter has been written.
    with open(out_name, "w") as fhandout:
        print(chapter_text, file=fhandout)
    print(chapter_text, "has been written to", out_name)
