In [1]:
import numpy as np
import pandas as pd
import random
import spacy
from tqdm.auto import tqdm
from spacy.tokens import DocBin
from spacy.util import minibatch
from spacy.training.example import Example
nlp = spacy.blank('en')
from sklearn.model_selection import train_test_split



In [2]:
textcat = nlp.add_pipe('textcat')
textcat.add_label("1700")
textcat.add_label("1800")
textcat.add_label("1900")
textcat.add_label("2000")

1

In [23]:
df = pd.read_json('training_json_file.json')
df.head()
df['cleaned_html'].replace('', np.nan, inplace=True)
df.dropna(subset=['cleaned_html'], inplace=True)
print(df.head())
print(df.shape)


                                        cleaned_html  label
1  1 U.S. 20 1 Dall. 20 1 L.Ed. 19 ANONYMOUS. No....   1700
2  1 U.S. 18 1 Dall. 18 1 L.Ed. 18 The Lessee of ...   1700
3  1 U.S. 194 1 Dall. 194 1 L.Ed. 96 Gerardv.La C...   1700
4  1 U.S. 210 1 Dall. 210 1 L.Ed. 104 Pollardv.Sh...   1700
5  2 U.S. 97 2 Dall. 97 1 L.Ed. 305 Pringlev.Blac...   1700
(60055, 2)


In [25]:
df['tuples'] = df.apply(lambda row: (row['cleaned_html'], row['label']), axis=1)
train = df['tuples'].tolist()
train[:1]

[('1 U.S. 20 1 Dall. 20 1 L.Ed. 19 ANONYMOUS. No. ____. Supreme Court of Pennsylvania September Term, 1773.  John Fisher, having two sons and a Daughter, made his will, and devised a plantation to his son Matthias in fee. Matthias dies in his minority, intestate, and without issue. Question: whether his heirs at common law shall take; or it shall divide among his other brothers and sisters, under the supplemental intestate law of this Province? On a trial in ejectment for the plantation, it was agreed by council, that the opinion of the Court should be conclusive to the Jury. Mr. Justice WILLING and Justice LAWRENCE were of opinion, and so delivered it to the Jury, that the estate should be divided: and the plaintiff suffered a nonsuit.*    *  Determined at Bucks, Ni, Pri. 15 Oct. 1773.   ',
  1700)]

In [30]:
def load_data(limit=0, split=0.8):
    train_data=train
    # Shuffle the data
    random.shuffle(train_data)
    texts, labels = zip(*train_data)
    # get the categories for each review
    cats = [{"cats": {"1700": label == 1700,
                        "1800": label==1800,
                        "1900": label==1900,
                        "2000": label==2000}} for label in labels]

    # Splitting the training and evaluation data
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

n_texts=60055

# Calling the load_data() function 
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)

# Processing the final format of training data
train_data = list(zip(train_texts,[{'cats': cats} for cats in train_cats]))
train_data[:1]

  {'cats': {'cats': {'1700': False,
     '1800': False,
     '1900': True,
     '2000': False}}})]

In [31]:
def train(model, train_data, optimizer, batch_size=8):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    
    # train_data is a list of tuples [(text0, label0), (text1, label1), ...]
    for batch in minibatch(train_data, size=batch_size):
        # Split batch into text and labels
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            # TODO: Update model with texts and labels
            nlp.update([example], sgd=optimizer, losses = losses)
        
    return losses

In [32]:
spacy.util.fix_random_seed(1)
random.seed(1)

optimizer = nlp.begin_training()
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])

In [None]:
nlp.to_disk('saved_spacy_model_01')

In [None]:
import json

with open('./texts_for_testing/test_text.json', encoding="utf-8") as f:
    test_text = json.load(f)

In [None]:
opinion_of_text = test_text['plain_text']

In [None]:
spacy_doc = nlp(opinion_of_text)

In [None]:
spacy_doc.cats

{'1700': 0.2703966498374939,
 '1800': 0.2432011365890503,
 '1900': 0.2432011365890503,
 '2000': 0.2432011365890503}

In [None]:
with open('./texts_for_testing/test_text_1.json', encoding='utf-8') as f:
    test_text_1 = json.load(f)

opinion_of_text_01 = test_text_1['plain_text']
spacy_doc_01 = nlp(opinion_of_text_01)
spacy_doc_01.cats

{'1700': 3.817029937636107e-05,
 '1800': 5.630177923876545e-08,
 '1900': 2.2707074549543904e-06,
 '2000': 0.9999594688415527}