This notebook reads every `.txt` file in a directory and writes part-of-speech tags for each token to a single `.tsv` file, using spaCy. It assumes you already have spaCy and its `en_core_web_sm` English model installed.

In [1]:
import os,spacy as sp

In [2]:
# Load the spaCy pipeline once; later cells call nlp(text) on each file's contents.
# The "en_core_web_sm" model must already be installed for this to succeed.
nlp = sp.load("en_core_web_sm")

In [9]:
# Choose your directory here: every file in it ending in ".txt" gets tagged
# NOTE(review): this is an absolute, machine-specific path -- edit before running elsewhere
sdir = "/Users/jd/Documents/DH/Translation/2019-2020/IndividualStories"
# And name your output file here (tab-separated, one row per token)
ofn = "/Users/jd/Documents/DH/Translation/2019-2020/IndividualStories_pos_tags.tsv"

In [35]:
# Lists the files in a directory
def dir2filelist(directory,path=False):
    """Return the entries of ``directory`` in sorted order.

    Parameters
    ----------
    directory : str
        Directory to list.
    path : bool, optional
        If False (the default), return bare entry names. Otherwise return
        each name joined onto ``directory`` with ``os.path.join``.

    Returns
    -------
    list of str
        Sorted names (or joined paths). As with ``os.listdir``, the result
        includes subdirectories as well as regular files.
    """
    # os.listdir makes no ordering promise; sort so repeated runs (and
    # different machines) process the files in the same order.
    names = sorted(os.listdir(directory))
    # `== False` (rather than `not path`) is kept deliberately so that a
    # value such as None still selects the joined-path branch, as before.
    if path == False:
        return names
    return [os.path.join(directory, name) for name in names]

# Culls the files in a directory to those of a particular type (default is .txt)
def dir2files(directory,path=False,type = ".txt"):
    """Return the entries of ``directory`` whose names end with ``type``.

    ``directory`` and ``path`` are passed straight to ``dir2filelist``;
    ``type`` is the suffix handed to ``str.endswith`` (".txt" by default).
    """
    return [entry for entry in dir2filelist(directory,path) if entry.endswith(type)]

# Turns a .txt file into a text string
def file2text(filename):
    """Read ``filename`` as UTF-8 text and return its entire contents.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    str
        The full decoded contents of the file.
    """
    # The context manager closes the handle even if read() raises
    # (for example on a decoding error), so no file descriptor is leaked.
    with open(filename,encoding="utf-8") as f:
        return f.read()

# Cleans a word in various ways
def cleanword(w,lower=True):
    """Trim non-alphabetic characters from both ends of ``w``.

    When ``lower`` is truthy the word is lowercased first (the usual case;
    turn it off to keep the capitalisation of names). Characters between the
    first and last alphabetic character are left untouched, so an internal
    apostrophe -- including a possessive "'s" -- is kept. A word with no
    alphabetic characters becomes the empty string.
    """
    candidate = w.lower() if lower else w
    # Positions of every alphabetic character; only the first and last matter.
    alpha_at = [pos for pos, ch in enumerate(candidate) if ch.isalpha()]
    if not alpha_at:
        return ""
    return candidate[alpha_at[0]:alpha_at[-1] + 1]

# Turns a string into a list of words
def text2words(sometext,clean=True):
    """Split ``sometext`` on whitespace and return the pieces as a list.

    When ``clean`` is truthy each piece is passed through ``cleanword`` with
    its default settings; otherwise the raw pieces are returned unchanged.
    """
    pieces = sometext.split()
    if not clean:
        return pieces
    return list(map(cleanword, pieces))

# Turns a .txt file into a list of words
def file2words(filename,clean=True):
    """Read ``filename`` with ``file2text`` and split it with ``text2words``.

    ``clean`` is forwarded unchanged to ``text2words``.
    """
    return text2words(file2text(filename), clean=clean)

# Easy way to turn a long filepath into just the filename
def get_shortname(somepath):
    """Return the final component of ``somepath``.

    The path is normalised first, so a trailing separator does not produce
    an empty result.
    """
    _, tail = os.path.split(os.path.normpath(somepath))
    return tail

# Simple way to write a list of lists out as a spreadsheet
# Has a few silly hacks to help Excel understand the sheet
def lol2sheet(somelol,outputname,sep="\t"):
    """Write ``somelol`` to ``outputname`` as delimiter-separated text.

    Parameters
    ----------
    somelol : iterable of iterables
        Rows to write; every cell is converted with ``str``.
    outputname : str
        Path of the file to create (overwritten if it exists).
    sep : str, optional
        Cell separator, a tab by default.

    Notes
    -----
    A cell containing a newline, a tab, or a comma is replaced wholesale by
    a placeholder string. The checks run in that order against the original
    cell text, so when several apply the last one wins (comma over tab over
    newline). The file is written as UTF-8, matching how ``file2text`` reads
    its input, so non-ASCII tokens do not depend on the platform default
    encoding.
    """
    with open(outputname,'w',encoding="utf-8") as output:
        for row in somelol:
            row = [str(i) for i in row]
            for n,i in enumerate(row):
                # Each test looks at the original cell text `i`, not at a
                # placeholder written by an earlier test.
                if "\n" in i:
                    row[n] = "[return character(s)]"
                if "\t" in i:
                    row[n] = "[tab character(s)]"
                if "," in i:
                    row[n] = "[comma]"
            ostr = sep.join(row) + "\n"
            output.write(ostr)
    print("Wrote the file " + outputname)

In [22]:
# POS tags everything, turning the results into a list of lists
# If you have a lot of files, this might take a while
# Each row is [file name, token text, lemma, coarse POS, fine-grained tag, dependency label]
files = dir2files(sdir,path=True)
all_pos = []
for f in files:
    text = file2text(f)
    doc = nlp(text)
    # The short name is the same for every token in the file, so compute it once
    shortname = get_shortname(f)
    for token in doc:
        row = [shortname,token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
        all_pos.append(row)

In [36]:
# Writes the list of lists (one row per token) as a .tsv file at the path in ofn
lol2sheet(all_pos,ofn)

Wrote the file /Users/jd/Documents/DH/Translation/2019-2020/IndividualStories_pos_tags.tsv
