In [1]:
# import packages
import os
from glob import glob
import pandas as pd
import numpy as np
import re
import nltk

## MODIFY THIS
# get path to your folder that holds the txt files
source_files = "C:/Users/jacqu/Downloads/Court Case PDFs/Court Case TXTs"
# outputs a list of all the txt files in the folder
source_file_list = sorted(glob(f"{source_files}/*.txt"))

# creates a list of tuples with an elememt for the source path and
# for the file title
file_data = []
for source_file_path in source_file_list:
    # split might be different, recommend checking with INFO.sample() or .head()
    file_title = source_file_path.split('\\')[-1].split(".txt")[0]
    file_data.append((source_file_path, file_title))

# creating df with the file title as the index and source path as a col
INFO = pd.DataFrame(file_data, columns=['txt_path','file_title'])\
    .set_index('file_title').sort_index()
# attempt at dropping any duplicate files with same file name
# this only works if same file has the SAME NAME
# See Notes below
INFO = INFO[~INFO.index.duplicated(keep='first')]

**Notes** I envision the drop dups code piece to be helpful in the case that a file is downloaded on two different environments (hence, they will have the same name) and then the code is run. This will NOT take care of dups that have DIFFERENT file names...

In [2]:
INFO.sample(10)

Unnamed: 0_level_0,txt_path
file_title,Unnamed: 1_level_1
"State v. Wanninger, 2023 Iowa App. LEXIS 945",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"Curiam, 2023 Fla. LEXIS 1801",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"People v. Mahjoob, 2022 Cal. App. Unpub. LEXIS 2073",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Raniere, 55 F.4th 354",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"D.B. v. IE Hotel Grp., LLC, 2023 U.S. Dist. LEXIS 17945",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"Solis v. Okeechobee Shooting Sports, LLC, 2023 U.S. Dist. LEXIS 216887",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Streeter, 2023 U.S. App. LEXIS 32220",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"In re A.F., 2023-Ohio-4423",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"Samsung Fire _ Marine Ins. Co., Ltd. v. UFVS Mgmt. Co., LLC, 2023 U.S. Dist. LEXIS 46508",C:/Users/jacqu/Downloads/Court Case PDFs/Court...
"United States v. Smith, 2023 U.S. Dist. LEXIS 217838",C:/Users/jacqu/Downloads/Court Case PDFs/Court...


In [3]:
INFO.size

77

In [4]:
# opening the file in read mode 
my_file = open(INFO.txt_path[0], "r") 
# reading the file 
narrative = my_file.read() 

In [5]:
SENTS = nltk.sent_tokenize(narrative)
SENTS[10:20]

['On June 13, 2023,  Defendant Summit Hotel TRS 085, LLC ("Summit  Hotel") filed a motion to dismiss Plaintiff\'s complaint in  its entirety with [*2]  prejudice for failure to state a claim.',
 'ECF 17 at 1.',
 "In the alternative, Defendant moves to  strike portions of Plaintiff's complaint or the complaint in  its entirety.",
 'Id.',
 'at 1-2.',
 'Plaintiff filed a Response in  Opposition on June 27, 2023, ECF 22, and Defendant  filed a Reply on July 11, 2023, ECF 25.',
 'On August 10,  2023, Plaintiff filed a Notice of Supplemental Authority.',
 'ECF 29.',
 "Before this Court is Defendant Summit Hotel's Motion to  Dismiss.",
 'ECF 17.']

In [6]:
df = pd.DataFrame()
df['sent_str'] = nltk.sent_tokenize(narrative)
df.sent_str = df.sent_str.str.strip()
df.index.name = "sent_num"
df.sample(10)

Unnamed: 0_level_0,sent_str
sent_num,Unnamed: 1_level_1
274,at Â¶ 93.
257,2.
285,3d at 938.
287,This Court finds that Plaintiff has failed to...
87,"While Plaintiff was with ""buyers,"" Plaintiff'..."
226,"Plaintiff alleges that the ""the Residence Inn ..."
103,Defendant's Alleged Knowledge of Sex Traffick...
15,Plaintiff filed a Response in Opposition on J...
80,Id.
280,See 18 U.S.C.


In [7]:
# df.sent_str.apply(lambda x: x.split())

df1 = df.sent_str\
.apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)), dtype='object'))\
.stack().to_frame("token_pos")
df1.index.names = ["sent_num", "token_num"]
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,token_pos
sent_num,token_num,Unnamed: 2_level_1
0,0,"(OPINION, NN)"
0,1,"(AND, CC)"
0,2,"(ORDER, NNP)"
0,3,"(GRANTING, NNP)"
0,4,"(DEFENDANT, NNP)"


In [8]:
#df['token_str'] = 
df1['token_str'] = df1.token_pos.apply(lambda x: x[0].strip())
df1['term_str'] = df1.token_pos.apply(lambda x: x[0].lower().strip())
df1['pos_tag'] = df1.token_pos.apply(lambda x: x[1])
df1 = df1.drop(columns="token_pos")
df1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,token_str,term_str,pos_tag
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,OPINION,opinion,NN
0,1,AND,and,CC
0,2,ORDER,order,NNP
0,3,GRANTING,granting,NNP
0,4,DEFENDANT,defendant,NNP


In [9]:
ngram_order = 3 # 3 means trigrams
pads = ["<s>", "</s>"]
widx = [f"w{i}" for i in range(ngram_order)]
ohco = ['sent_num', 'token_num']

In [10]:
from nltk.lm import MLE
from nltk.lm import Vocabulary
from nltk.lm import NgramCounter
from nltk.lm.preprocessing import padded_everygram_pipeline
from collections import Counter

In [29]:
ngram_args = dict(pad_right=True, pad_left=True, left_pad_symbol=pads[0], right_pad_symbol=pads[1])
train_ngrams = [[] for i in range(ngram_order)]
for j in range(ngram_order):
    train_ngrams[j] = [nltk.ngrams(sent, n=j+1, **ngram_args) for sent in df.sent_str]

In [30]:
len(train_ngrams[0]), len(train_ngrams[1]), len(train_ngrams[2])

(290, 290, 290)

In [34]:
enumerate(train_ngrams[0])

<enumerate at 0x2373b0a7ec0>

In [13]:
ng_cols = ['token_num', 'word_pos', 'token']
ng_data = [[] for n in range(ngram_order)]
ng_df = [None for n in range(ngram_order)]

for n in range(ngram_order):
    for i, z in enumerate(train_ngrams[n]):
        # print(i, z)
        for j, x in enumerate(z):
            print(j, x)
            for k, token in enumerate(list(x)):
                # print(k, token)
                ng_data[n].append((i, j, f"w{k}", token))
                
    ng_df[n] = pd.DataFrame(ng_data[n], columns=ng_cols).set_index(ng_cols[:-1]).unstack()
    ng_df[n].columns = ng_df[n].columns.droplevel(0)

0 ('<s>', 'O')
1 ('O', 'P')
2 ('P', 'I')
3 ('I', 'N')
4 ('N', 'I')
5 ('I', 'O')
6 ('O', 'N')
7 ('N', ' ')
8 (' ', 'A')
9 ('A', 'N')
10 ('N', 'D')
11 ('D', ' ')
12 (' ', 'O')
13 ('O', 'R')
14 ('R', 'D')
15 ('D', 'E')
16 ('E', 'R')
17 ('R', ' ')
18 (' ', 'G')
19 ('G', 'R')
20 ('R', 'A')
21 ('A', 'N')
22 ('N', 'T')
23 ('T', 'I')
24 ('I', 'N')
25 ('N', 'G')
26 ('G', ' ')
27 (' ', 'D')
28 ('D', 'E')
29 ('E', 'F')
30 ('F', 'E')
31 ('E', 'N')
32 ('N', 'D')
33 ('D', 'A')
34 ('A', 'N')
35 ('N', 'T')
36 ('T', ' ')
37 (' ', ' ')
38 (' ', 'S')
39 ('S', 'U')
40 ('U', 'M')
41 ('M', 'M')
42 ('M', 'I')
43 ('I', 'T')
44 ('T', ' ')
45 (' ', 'H')
46 ('H', 'O')
47 ('O', 'T')
48 ('T', 'E')
49 ('E', 'L')
50 ('L', ' ')
51 (' ', 'T')
52 ('T', 'R')
53 ('R', 'S')
54 ('S', ' ')
55 (' ', '0')
56 ('0', '8')
57 ('8', '5')
58 ('5', ',')
59 (',', ' ')
60 (' ', 'L')
61 ('L', 'L')
62 ('L', 'C')
63 ('C', "'")
64 ("'", 'S')
65 ('S', ' ')
66 (' ', 'M')
67 ('M', 'O')
68 ('O', 'T')
69 ('T', 'I')
70 ('I', 'O')
71 ('O', 'N')


ValueError: 3 columns passed, passed data had 4 columns