# Process raw text files

The code in this notebook processes raw text files (court opinions stored as JSON). The first section converts a set of documents into a bag-of-sentences file, i.e. M documents become one file with one sentence per line. The second section is optional.

In [2]:
import glob, os
import subprocess
import numpy as np

from bs4 import BeautifulSoup

from nltk.tokenize import sent_tokenize, word_tokenize
import string

import sys, os
sys.path.append(os.getcwd() + '/code/')
from courtlistener import json_to_dict

# Data locations -- you need to point these at your own copy of the data!
# Either set the SCOTUS_DATA_DIR environment variable before starting the
# kernel, or edit the fallback path below.
# Both directories keep a trailing separator because other cells build file
# paths by plain string concatenation (e.g. op_dir + "*.json").
scotus_data_root = os.environ.get('SCOTUS_DATA_DIR',
                                  '/Users/iaincarmichael/data/word_embed/scotus/')
op_dir = os.path.join(scotus_data_root, 'opinions', '')  # where to read the opinion file from
sentence_dir = os.path.join(scotus_data_root, 'sentences', '')  # where to write the sentence file

In [4]:

# Select a random subset of the opinion files to process.
# This makes things go faster -- set n_sample_files = None to process every file.
n_sample_files = 5000
sample_seed = 0  # fixed so that restarting the kernel picks the same subset

# sorted() because glob returns paths in arbitrary order, which would defeat the seed
json_files = sorted(glob.glob(op_dir + "*.json"))

if n_sample_files is not None and len(json_files) > n_sample_files:
    # Sample *without* replacement: np.random.choice defaults to replace=True,
    # which would put the same opinion into the sample more than once.
    # Indices are drawn (rather than the paths themselves) so json_files stays
    # a plain list of Python strings.
    picked = np.random.RandomState(sample_seed).choice(
        len(json_files), size=n_sample_files, replace=False)
    json_files = [json_files[i] for i in sorted(picked)]
# otherwise there are no more than n_sample_files files: keep them all, once each

# convert files to bag of sentences

Takes the SCOTUS opinion files selected above, does some light preprocessing, then writes them to a single file with one sentence on each line:

- lower case words
- tokenize into sentences using nltk
- remove \n characters (new lines)
- remove punctuation
- remove sentences with fewer than 5 characters

In [5]:
%%time

# use to remove punctuation from text
# (maps each punctuation code point to None, i.e. "delete", for translate())
kill_punct = dict((ord(char), None) for char in string.punctuation)

# shortest cleaned sentence (in characters) that is still written out
min_sentence_chars = 5

# 'wb' rather than 'ab': re-running this cell regenerates the file instead of
# appending a second copy of the corpus to the previous run's output.
with open(sentence_dir + 'scotus_sentences_5000.txt', 'wb') as write_file:
    for f in json_files:

        # read json file, parse html and get the text
        text = BeautifulSoup(json_to_dict(f)['html_with_citations'], 'lxml').get_text()

        # lowercase text
        text = text.lower()

        # tokenize text into sentences
        for s in sent_tokenize(text):

            # remove punctuation, then collapse every run of whitespace into a
            # single space. A sentence can span several lines of the source
            # text; strip('\n') alone would leave the interior newlines in
            # place and break the one-sentence-per-line format.
            s = ' '.join(s.translate(kill_punct).split())

            # drop short sentences *after* cleaning, so that e.g. a
            # punctuation-only "sentence" does not end up as a blank line
            if len(s) < min_sentence_chars:
                continue

            # write one sentence per line; the newline is encoded together
            # with the sentence because the file is opened in binary mode
            write_file.write((s + '\n').encode('utf-8'))

CPU times: user 1min 4s, sys: 548 ms, total: 1min 5s
Wall time: 1min 5s


# clean files (optional)

This optional section processes each document and creates a new, processed document, i.e. M documents to M documents. This could be useful for making a bag-of-words/TF-IDF matrix or for window contexts.

- lower case all words
- remove punctuation
- some light preprocessing with word_tokenize()
- write a new file with cleaned text

In [None]:
# directory the cleaned per-document files would be written to (one file per document)
processed_text_dir = '/Users/iaincarmichael/data/word_embed/scotus/processed_text_files/'

# code is commented so you don't accidentally run something that will eat a lot of time/memory!
#
# NOTE(review) before un-commenting:
#   - raw_text_files is not defined in any cell shown in this notebook; it
#     presumably should be a list of paths to .txt files -- TODO confirm.
#   - kill_punct comes from the bag-of-sentences cell, so that cell must be
#     run first.
#   - open(tf).read().decode("utf8") relies on Python 2 str semantics; on
#     Python 3 a text-mode read() returns a str, which has no .decode().
#   - the output file is opened with 'ab' (append), so running the loop twice
#     would append the cleaned text to the same file a second time.

# for tf in raw_text_files:
#     # read in document from raw text file
#     text = open(tf).read().decode("utf8")

#     # lowercase text
#     text = text.lower()

#     # remove punctuation
#     text = text.translate(kill_punct)

#     # this removes a lot of annoying stuff
#     words = word_tokenize(text)
#     text = ' '.join(words)

#     op_id = tf.split('/')[-1].split('.txt')[0]
#     with open(processed_text_dir + '%s.txt' % op_id,'ab') as wf:
#         wf.write(text.encode('utf-8'))