In [1]:
import numpy as np
import pandas as pd
import random
import spacy
from tqdm.auto import tqdm
from spacy.tokens import DocBin
nlp = spacy.load('en_core_web_lg', disable=['tagger', 'parser', 'attribute_ruler' 'ner', 'lemmatizer'])
from spacy.util import minibatch



In [2]:
print(nlp.pipe_names)
nlp.add_pipe('textcat')

['tok2vec', 'attribute_ruler', 'ner']


<spacy.pipeline.textcat.TextCategorizer at 0x1f61e6faca0>

In [3]:
textcat = nlp.get_pipe('textcat')
textcat.add_label("1700")
textcat.add_label("1800")
textcat.add_label("1900")
textcat.add_label("2000")

1

In [4]:
textcat.labels

('1700', '1800', '1900', '2000')

In [5]:
df = pd.read_json('training_json_file.json')
df.head()
df['cleaned_html'].replace('', np.nan, inplace=True)
df.dropna(subset=['cleaned_html'], inplace=True)
df.head()

Unnamed: 0,cleaned_html,label
1,1 U.S. 20 1 Dall. 20 1 L.Ed. 19 ANONYMOUS. No....,1700
2,1 U.S. 18 1 Dall. 18 1 L.Ed. 18 The Lessee of ...,1700
3,1 U.S. 194 1 Dall. 194 1 L.Ed. 96 Gerardv.La C...,1700
4,1 U.S. 210 1 Dall. 210 1 L.Ed. 104 Pollardv.Sh...,1700
5,2 U.S. 97 2 Dall. 97 1 L.Ed. 305 Pringlev.Blac...,1700


In [6]:
train_texts = df['cleaned_html'].values
train_labels = [{"cats": {"1700": label == 1700,
                        "1800": label==1800,
                        "1900": label==1900,
                        "2000": label==2000}} for label in df['label']]

In [7]:
train_data = list(zip(train_texts, train_labels))
train_data[0]

('1 U.S. 20 1 Dall. 20 1 L.Ed. 19 ANONYMOUS. No. ____. Supreme Court of Pennsylvania September Term, 1773.  John Fisher, having two sons and a Daughter, made his will, and devised a plantation to his son Matthias in fee. Matthias dies in his minority, intestate, and without issue. Question: whether his heirs at common law shall take; or it shall divide among his other brothers and sisters, under the supplemental intestate law of this Province? On a trial in ejectment for the plantation, it was agreed by council, that the opinion of the Court should be conclusive to the Jury. Mr. Justice WILLING and Justice LAWRENCE were of opinion, and so delivered it to the Jury, that the estate should be divided: and the plaintiff suffered a nonsuit.*    *  Determined at Bucks, Ni, Pri. 15 Oct. 1773.   ',
 {'cats': {'1700': True, '1800': False, '1900': False, '2000': False}})

In [8]:
from spacy.training import Example

def train(model, train_data, optimizer, batch_size=8):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)

    for batch in minibatch(train_data, size=batch_size):
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            # Update model with texts and labels
            model.update([example], sgd=optimizer, losses=losses)

    return losses

In [9]:
optimizer = nlp.create_optimizer()

# This may take a while to run!
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])

ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset

In [None]:
spacy.util.fix_random_seed(1)
from spacy.training import Example

optimizer = nlp.create_optimizer()

for batch in spacy.util.minibatch(train_data, size=8):
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd=optimizer)

ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset

In [None]:
def make_docs(data):

    docs = []
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
        doc.cats['positive]'] = label
        docs.append(doc)
    return(docs)

In [None]:
df = pd.read_json('training_json_file.json')

In [None]:
df.head()

Unnamed: 0,cleaned_html,label
0,,1700
1,1 U.S. 20 1 Dall. 20 1 L.Ed. 19 ANONYMOUS. No....,1700
2,1 U.S. 18 1 Dall. 18 1 L.Ed. 18 The Lessee of ...,1700
3,1 U.S. 194 1 Dall. 194 1 L.Ed. 96 Gerardv.La C...,1700
4,1 U.S. 210 1 Dall. 210 1 L.Ed. 104 Pollardv.Sh...,1700


In [None]:
data = list(zip(df['cleaned_html'], df['label']))
print(len(data))

63374


In [None]:
num_texts = 500
train_data = random.sample(data, num_texts)
train_docs = make_docs(train_data)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk('train.spacy')

valid_data = random.sample(data, num_texts)
valid_docs = make_docs(valid_data)
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk('valid.spacy')

  0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
y = pd.get_dummies(df['label'], prefix='label')
label = list(y.columns)
print(label)

y = y.to_dict('index')

['label_1700', 'label_1800', 'label_1900', 'label_2000']


In [None]:
dataset = list(zip(df['cleaned_html'], [{'cats': cats} for cats in y.values()]))
print(dataset[1])

('1 U.S. 20 1 Dall. 20 1 L.Ed. 19 ANONYMOUS. No. ____. Supreme Court of Pennsylvania September Term, 1773.  John Fisher, having two sons and a Daughter, made his will, and devised a plantation to his son Matthias in fee. Matthias dies in his minority, intestate, and without issue. Question: whether his heirs at common law shall take; or it shall divide among his other brothers and sisters, under the supplemental intestate law of this Province? On a trial in ejectment for the plantation, it was agreed by council, that the opinion of the Court should be conclusive to the Jury. Mr. Justice WILLING and Justice LAWRENCE were of opinion, and so delivered it to the Jury, that the estate should be divided: and the plaintiff suffered a nonsuit.*    *  Determined at Bucks, Ni, Pri. 15 Oct. 1773.   ', {'cats': {'label_1700': 1, 'label_1800': 0, 'label_1900': 0, 'label_2000': 0}})


In [None]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(dataset, train_size=0.8, random_state=13)

textcat = nlp.add_pipe("textcat_multilabel")
for i in label:
    textcat.add_label(i)


In [None]:
textcat.labels

('label_1700', 'label_1800', 'label_1900', 'label_2000')

In [None]:
nlp = spacy.blank('en')
optimizer = nlp.begin_training()
iterations = 2

In [None]:
from spacy.util import minibatch, compounding
from spacy.training import Example
with nlp.select_pipes(enable="textcat_multilabel"):
    for j in range(iterations):
        losses = {}
        k = 0
        batches = minibatch(train_data, size = compounding(4.,32.,1.001))
        for batch in batches:
            text, annotations = zip(*batch)
            example = []
            for i in range(len(text)):
                doc = nlp.make_doc(text[i])
                example.append(Example.from_dict(doc, annotations[i]))
            nlp.update(example, sgd=optimizer, drop=0.2, losses = losses)
            print(f'Batch No: {k} Loss = {losses}')
            k += 1
        print("\n\n Completed Iterations : {} ".format(j))

Batch No: 0 Loss = {}
Batch No: 1 Loss = {}
Batch No: 2 Loss = {}
Batch No: 3 Loss = {}
Batch No: 4 Loss = {}
Batch No: 5 Loss = {}
Batch No: 6 Loss = {}
Batch No: 7 Loss = {}
Batch No: 8 Loss = {}
Batch No: 9 Loss = {}
Batch No: 10 Loss = {}
Batch No: 11 Loss = {}
Batch No: 12 Loss = {}
Batch No: 13 Loss = {}
Batch No: 14 Loss = {}
Batch No: 15 Loss = {}
Batch No: 16 Loss = {}
Batch No: 17 Loss = {}
Batch No: 18 Loss = {}
Batch No: 19 Loss = {}
Batch No: 20 Loss = {}
Batch No: 21 Loss = {}
Batch No: 22 Loss = {}
Batch No: 23 Loss = {}
Batch No: 24 Loss = {}
Batch No: 25 Loss = {}
Batch No: 26 Loss = {}
Batch No: 27 Loss = {}
Batch No: 28 Loss = {}
Batch No: 29 Loss = {}
Batch No: 30 Loss = {}
Batch No: 31 Loss = {}
Batch No: 32 Loss = {}
Batch No: 33 Loss = {}
Batch No: 34 Loss = {}
Batch No: 35 Loss = {}
Batch No: 36 Loss = {}
Batch No: 37 Loss = {}
Batch No: 38 Loss = {}
Batch No: 39 Loss = {}
Batch No: 40 Loss = {}
Batch No: 41 Loss = {}
Batch No: 42 Loss = {}
Batch No: 43 Loss = {

KeyboardInterrupt: 