Given a directory or a file, this program computes the type-token ratio (TTR) over a rolling window of n words (default n=500), averages the window values for each text, and writes the results to a .tsv file.

In [1]:
import os
from collections import Counter

In [8]:
# Input locations: `sdir` is the corpus directory that the batch cell scans
# for .txt files; `fn` is the path of a single text file.
# NOTE(review): `fn` is not referenced by the cells visible in this notebook.
sdir = "/Users/jd/Documents/DH/ShortStories/Corpus"
fn = "/Users/jd/Documents/DH/ShortStories/Corpus/ss_Hemingway_CleanWellLightedPlace.txt"
# Output location: path of the tab-separated results file written by dict2file.
ofn = "/Users/jd/Documents/DH/ShortStories/Corpus_rolling_ttr.tsv"

In [9]:
def dir2filelist(directory, path=False):
    """Return the entries of *directory* as reported by ``os.listdir``.

    When *path* compares equal to ``False`` (the default) the bare entry
    names are returned; for any other value each name is joined onto
    *directory* with ``os.path.join``.
    """
    names = os.listdir(directory)
    if path == False:
        return names
    return [os.path.join(directory, name) for name in names]

def dir2files(directory, path=False, type=".txt"):
    """Return the entries of *directory* whose name ends with *type*.

    The listing itself comes from ``dir2filelist(directory, path)``, so
    *path* controls whether bare names or joined paths are returned.
    *type* is the suffix to keep (default ``".txt"``).
    """
    return [
        entry
        for entry in dir2filelist(directory, path)
        if entry.endswith(type)
    ]

def file2text(filename):
    """Read *filename* as UTF-8 text and return its whole content as a string.

    The file is opened in a ``with`` block so the handle is closed even
    when reading fails (for example on a decoding error). Errors raised
    by ``open`` or ``read`` are not caught here.
    """
    with open(filename, encoding="utf-8") as handle:
        return handle.read()

def cleanword(w, lower=True):
    """Return *w* with non-alphabetic characters trimmed from both ends.

    With *lower* true (the default) the word is lowercased first; pass
    ``lower=False`` to keep the original casing, e.g. for names.
    Characters inside the word are left untouched, so the apostrophe in
    a possessive such as "dog's" survives. A word with no alphabetic
    character at all comes back as the empty string.
    """
    token = w.lower() if lower else w
    start = 0
    stop = len(token)
    # Move the left edge forward past leading non-letters.
    while start < stop and not token[start].isalpha():
        start += 1
    # Move the right edge back past trailing non-letters.
    while stop > start and not token[stop - 1].isalpha():
        stop -= 1
    return token[start:stop]

def text2words(sometext, clean=True):
    """Split *sometext* on whitespace and return the resulting word list.

    With *clean* true (the default) every token is passed through
    ``cleanword``. Tokens that clean down to the empty string (those
    with no alphabetic character, such as a free-standing dash or a
    number) are dropped, because an empty string is not a word and would
    otherwise be counted as a token and a type. With ``clean=False`` the
    raw whitespace-separated tokens are returned unchanged.
    """
    words = sometext.split()
    if clean:
        cleaned = (cleanword(w) for w in words)
        words = [w for w in cleaned if w]
    return words

def file2words(filename, clean=True):
    """Return the words of the text file *filename* as a list.

    The file is read with ``file2text`` and split with ``text2words``;
    *clean* is forwarded to ``text2words`` unchanged.
    """
    return text2words(file2text(filename), clean=clean)

def get_shortname(somepath):
    """Return the last component of *somepath* (the bare file name).

    The path is normalised first, so a trailing separator does not
    produce an empty result.
    """
    normalized = os.path.normpath(somepath)
    _, tail = os.path.split(normalized)
    return tail

In [13]:
def get_ttr(somewords):
    """Return the type-token ratio (TTR) of the word list *somewords*.

    The TTR is the number of distinct words divided by the total number
    of words. An empty list yields ``0.0`` rather than raising
    ``ZeroDivisionError``.
    """
    total = len(somewords)
    if total == 0:
        return 0.0
    return len(set(somewords)) / total

def rolling_ttr_list(somewords, window=500):
    """Return the type-token ratio of every *window*-word span of a text.

    The window slides one word at a time across *somewords*, so a text
    of ``wc`` words yields ``wc - window + 1`` ratios, including the
    final window that ends on the last word.

    If the text has fewer than *window* words, the result is the
    one-element list ``["text shorter than window"]``.

    Raises:
        ValueError: if *window* is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    wc = len(somewords)
    if wc < window:
        return ["text shorter than window"]

    # Keep a running count of the words inside the current window so each
    # step costs O(1) instead of rebuilding a set of `window` words.
    counts = Counter(somewords[:window])
    ttrs = [len(counts) / window]
    # Pair each word leaving the window with the word entering it.
    for leaving, entering in zip(somewords, somewords[window:]):
        counts[entering] += 1
        counts[leaving] -= 1
        if counts[leaving] == 0:
            # Drop exhausted words so len(counts) stays the number of types.
            del counts[leaving]
        ttrs.append(len(counts) / window)
    return ttrs

def get_rolling_ttr(somewords, window=500):
    """Return the mean rolling type-token ratio of *somewords*.

    The per-window ratios come from ``rolling_ttr_list``. String entries
    in that list (the "text shorter than window" marker it emits) are
    ignored. If no numeric ratio remains, the string
    ``"text shorter than window"`` is returned instead of a number.
    A text that yields exactly one window returns that window's ratio
    rather than being reported as too short.
    """
    ttr_list = rolling_ttr_list(somewords, window=window)
    ratios = [t for t in ttr_list if not isinstance(t, str)]
    if not ratios:
        return "text shorter than window"
    return sum(ratios) / len(ratios)

def dict2file(somedict, outputname, sep="\t"):
    """Write *somedict* to *outputname* as a two-column delimited text file.

    Each key/value pair becomes one line, ``key<sep>value``, in the
    dictionary's iteration order. Keys and values are both converted
    with ``str``, so non-string keys are accepted. The file is written
    as UTF-8, matching the encoding the texts are read with. No quoting
    or escaping is applied to fields that contain *sep*.

    Prints a confirmation message once the file has been written.
    """
    with open(outputname, "w", encoding="utf-8") as output:
        for key, value in somedict.items():
            output.write(sep.join((str(key), str(value))) + "\n")
    print(f"Wrote the file {outputname}")

In [11]:
# Builds ttr_dict, mapping each .txt file in `sdir` (keyed by its bare file
# name) to the result of get_rolling_ttr for that file's words. The value is
# either a number or the string "text shorter than window".
files = dir2files(sdir,path=True)
ttr_dict = {}
for f in files:
    # Read and clean the text, then average its rolling-window TTR
    # using the default window size.
    words = file2words(f)
    shortname = get_shortname(f)
    ttr = get_rolling_ttr(words)
    ttr_dict[shortname] = ttr

In [14]:
# Writes ttr_dict to the path in `ofn`, one "name<TAB>value" line per text
# (dict2file's default separator is a tab).
dict2file(ttr_dict,ofn)

Wrote the file /Users/jd/Documents/DH/ShortStories/Corpus/rolling_ttr.tsv
