In [50]:
import json
import spacy
from bs4 import BeautifulSoup
# Load the spaCy pipeline stored under output/model-best; later cells call it on
# opinion text and read the resulting `.cats` scores.
# NOTE(review): presumably the best checkpoint written by a `spacy train` run — confirm.
nlp = spacy.load("output/model-best")

In [51]:
def strip_html_tags(text):
    """Return the text content of an HTML string with all markup removed.

    The input is parsed with BeautifulSoup's built-in "html.parser" backend and
    the text of the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [52]:
# Read one sample opinion record and preview its 'html_lawbox' field once the
# HTML markup has been stripped.
with open('./texts_for_testing/1880_90030.json', encoding="utf-8") as f:
    test_text = json.loads(f.read())

opinion_of_text = test_text['html_lawbox']
cleaned_text_opinion = strip_html_tags(opinion_of_text)

# Only the first 500 characters are shown to keep the output short.
print(cleaned_text_opinion[0:500])


100 U.S. 158 (____)
Dow
v.
JOHNSON.
Supreme Court of United States.

The case was argued by The Attorney-General and Mr. E.B. Smith, Assistant Attorney-General, for the plaintiff in error, and by Mr. Thomas J. Durant for the defendant in error.
MR. JUSTICE FIELD delivered the opinion of the court.
The defendant in the court below, the plaintiff in error here, Neal Dow, was a brigadier-general in the army of the United States during the late civil war, and in 1862 and 1863 was *159 stationed in 


In [53]:
from pathlib import Path
# Gather every JSON file below the test folder (subfolders included) and show
# the file names without their extension.
folder_location = './texts_for_testing/'
paths = [*Path(folder_location).rglob('*.json')]
filenames = [json_path.stem for json_path in paths]
filenames

['1764_84587',
 '1783_84599',
 '1944_103915',
 '1880_90030',
 '1850_86508',
 '1764_84586',
 '1985_111301',
 '2022_opinion_2',
 '1865_87621',
 '1902_95542',
 '1764_2381788',
 '1920_99495',
 '1898_94785',
 '1783_84600',
 '1963_106601',
 '2022_opinion_1',
 '1804_84713']

In [54]:
# Text fields of an opinion record in order of preference, each paired with a
# flag saying whether the field holds HTML that must be stripped before scoring.
text_fields = (
    ('plain_text', False),
    ('html_with_citations', True),
    ('html_lawbox', True),
)

# Maps each file stem to the category scores the pipeline assigned to that opinion.
names_and_scores = dict()

for file in paths:
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    stem = file.stem

    # Score the first field that actually has content. Using `.get` with a
    # truthiness check treats a missing key or a null value like an empty
    # string, instead of raising KeyError or handing None to the model.
    for field_name, is_html in text_fields:
        raw_text = data.get(field_name)
        if not raw_text:
            continue
        opinion_text = strip_html_tags(raw_text) if is_html else raw_text
        spacy_doc = nlp(opinion_text)
        names_and_scores[stem] = spacy_doc.cats
        break
    # A record with no usable text field gets no entry in names_and_scores.

In [55]:
# Number of opinions that received scores (one entry per file stem).
print(len(names_and_scores))

17


In [56]:
# Print the raw per-class scores for every scored opinion.
print(names_and_scores)

# Hand-counted accuracy: 13 out of 17 opinions were classified correctly.
# The bare expression makes the notebook display the resulting fraction.
13/17


{'1764_84587': {'1700': 1.0, '1800': 2.8113275263308424e-08, '1900': 1.0198507593983663e-09, '2000': 1.780704965819123e-26}, '1783_84599': {'1700': 0.9365735054016113, '1800': 0.0031821851152926683, '1900': 0.05074755847454071, '2000': 0.009496694430708885}, '1944_103915': {'1700': 0.0, '1800': 0.0, '1900': 1.0, '2000': 0.0}, '1880_90030': {'1700': 0.0, '1800': 1.0, '1900': 0.0, '2000': 0.0}, '1850_86508': {'1700': 9.226048644040673e-37, '1800': 1.0, '1900': 3.8507681799645973e-41, '2000': 0.0}, '1764_84586': {'1700': 0.9997567534446716, '1800': 0.00011151911166962236, '1900': 6.334606587188318e-05, '2000': 6.829507037764415e-05}, '1985_111301': {'1700': 0.0, '1800': 0.0, '1900': 1.0, '2000': 0.0}, '2022_opinion_2': {'1700': 0.0, '1800': 0.0, '1900': 1.0, '2000': 0.0}, '1865_87621': {'1700': 3.907836543248777e-08, '1800': 0.9997335076332092, '1900': 0.000266456016106531, '2000': 4.145987221612772e-18}, '1902_95542': {'1700': 2.4049142682345812e-26, '1800': 2.078580655506812e-05, '1900'

0.7647058823529411

In [57]:
import pandas as pd

# Turn the stem -> scores mapping into a table: the outer keys become columns,
# so the frame is transposed to get one row per opinion and one column per class.
df = pd.DataFrame(names_and_scores).transpose()
df.head()

Unnamed: 0,1700,1800,1900,2000
1764_84587,1.0,0.0,0.0,0.0
1783_84599,0.93657,0.00318,0.05075,0.0095
1944_103915,0.0,0.0,1.0,0.0
1880_90030,0.0,1.0,0.0,0.0
1850_86508,0.0,1.0,0.0,0.0


In [58]:
# Label each opinion with the class column that holds its highest score.
# A 'highest_score' column left over from an earlier run of this cell is
# excluded first, so re-running the cell does not feed the label strings
# back into idxmax alongside the numeric scores.
df['highest_score'] = df.drop(columns='highest_score', errors='ignore').idxmax(axis=1)

In [59]:
# Display the score table together with the highest_score column.
df

Unnamed: 0,1700,1800,1900,2000,highest_score
1764_84587,1.0,0.0,0.0,0.0,1700
1783_84599,0.93657,0.00318,0.05075,0.0095,1700
1944_103915,0.0,0.0,1.0,0.0,1900
1880_90030,0.0,1.0,0.0,0.0,1800
1850_86508,0.0,1.0,0.0,0.0,1800
1764_84586,0.99976,0.00011,6e-05,7e-05,1700
1985_111301,0.0,0.0,1.0,0.0,1900
2022_opinion_2,0.0,0.0,1.0,0.0,1900
1865_87621,0.0,0.99973,0.00027,0.0,1800
1902_95542,0.0,2e-05,0.99998,0.0,1900


In [60]:
import dataframe_image as dfi
# Render floats in DataFrame output with five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)

#dfi.export(df, 'test_table_png_file.png')