<h1> Create TSV-DRO </h1>

A simple Notebook to encapsulate a text file as TSV-DRO

James E. Dobson (james.e.dobson@dartmouth.edu)<br>
Dartmouth College<br>
http://www.dartmouth.edu/~jed <br>



In [1]:
# load required modules
from tsvdro import tsvdro
import csv
import nltk
import os

In [2]:
def preprocess(text_object, options = "default"):
        from nltk.corpus import stopwords
        stopwords = stopwords.words('english')

        # *step one* (default): drop to lowercase
        pp_text = [word.lower() for word in text_object]

        # *step two* (default): remove non-alpha characters,
        # punctuation, and as many other "noise" elements as
        # possible. If dealing with a single character word,
        # drop non-alphabetical characters. This will remove
        # most punctuation but preserve many words containing
        # marks such as the '-' in 'self-emancipated'

        tmp_text=list()
        for word in pp_text:
                if len(word) == 1:
                        if word.isalpha == True:
                                tmp_text.append(word)
                else:
                        tmp_text.append(word)

        pp_text = tmp_text
        tmp_text=list()

        # now remove leading and trailing quotation marks,
        # hyphens and  dashes

        drop_list = [u'“'.encode('utf-8'),'"',u'”'.encode('utf-8'),'-','—']
        for word in pp_text:
                if word[0].encode('utf-8') in drop_list:
                        word = word[1:]
                if word[-1:].encode('utf-8') in drop_list:
                        word = word[:-1]
                # catch any zero-length words remaining
                if len(word) > 0:
                        tmp_text.append(word)

        pp_text = tmp_text

        # preprocessing function: preserve *ONLY* NLTK stopwords
        if options == "onlystop":
                pp_text = [word for word in pp_text if word in stopwords]
                return(pp_text)

        # *step three* (default): remove stopwords
        # enable an option for preserving stopwords
        if options != "nostop":
                # add additional stopwords, also containing some remainders from
                # tokenizing
                custom_stopwords="""like go going gone one said says would got still really get 's 'll n't"""
                stopwords += custom_stopwords.split()
                
                pp_text = [word for word in pp_text if word not in stopwords]

        return(pp_text)

In [3]:
# Describe this object to the best of your ability

file_name = 'New_England_Girlhood.txt'
author_name = 'Lucy Larcom'
title = 'A New England Girlhood'
publisher = ''
publisher_location = ''
publication_date = ''

In [4]:
tsvdro_object = dict()
tsvdro_object['header'] = tsvdro.build_header()
tsvdro_object['header']['bibliographic_data']['file_uri'] = ''
tsvdro_object['header']['bibliographic_data']['author_name'] = author_name
tsvdro_object['header']['bibliographic_data']['title'] = title

# produce TSV token count
raw_text = open(file_name,encoding="utf-8").read()
tokens = nltk.word_tokenize(raw_text)
text = nltk.Text(tokens)

# update raw token count
tsvdro_object['header']['workflow']['token_count']  = len(text)
text = preprocess(text)

# update vocab count
tsvdro_object['header']['workflow']['vocab_count'] = len(set(text))

# now build TSV
tsvdro_object['data'] = dict()

for token in set(text):
    tsvdro_object['data'][token] = text.count(token)

output = file_name.replace(".txt",".dro")
tsvdro.save(tsvdro_object,output)

In [5]:
tsvdro_object['header']

{'bibliographic_data': {'author_name': 'Lucy Larcom',
  'file_uri': '',
  'pages': None,
  'publication_date': None,
  'publisher': None,
  'publisher_location': None,
  'title': 'A New England Girlhood',
  'volumes': None},
 'tsvdro_ver': '1.0',
 'workflow': {'created_by': 'tsvdro_reference_implementation',
  'created_date': '2018-08-04 15:43',
  'created_system': 'parergon.local',
  'data_option': None,
  'data_type': 1,
  'last_updated': '2018-08-04 15:44',
  'token_count': 73704,
  'vocab_count': 7400}}