In [8]:
import numpy as np
import pandas as pd
import random
import spacy
from tqdm.auto import tqdm
from spacy.tokens import DocBin
from spacy.util import minibatch
from spacy.training.example import Example
nlp = spacy.blank('en')

In [9]:
textcat = nlp.add_pipe('textcat')
textcat.add_label("1700")
textcat.add_label("1800")
textcat.add_label("1900")
textcat.add_label("2000")

1

In [10]:
df = pd.read_json('training_json_file.json')
df.head()
df['cleaned_html'].replace('', np.nan, inplace=True)
df.dropna(subset=['cleaned_html'], inplace=True)
df.head()

Unnamed: 0,cleaned_html,label
1,1 U.S. 20 1 Dall. 20 1 L.Ed. 19 ANONYMOUS. No....,1700
2,1 U.S. 18 1 Dall. 18 1 L.Ed. 18 The Lessee of ...,1700
3,1 U.S. 194 1 Dall. 194 1 L.Ed. 96 Gerardv.La C...,1700
4,1 U.S. 210 1 Dall. 210 1 L.Ed. 104 Pollardv.Sh...,1700
5,2 U.S. 97 2 Dall. 97 1 L.Ed. 305 Pringlev.Blac...,1700


In [11]:
train_texts = df['cleaned_html'].values
train_labels = [{"cats": {"1700": label == 1700,
                        "1800": label==1800,
                        "1900": label==1900,
                        "2000": label==2000}} for label in df['label']]

In [12]:
train_data = list(zip(train_texts, train_labels))

In [13]:
def train(model, train_data, optimizer, batch_size=8):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    
    # train_data is a list of tuples [(text0, label0), (text1, label1), ...]
    for batch in minibatch(train_data, size=batch_size):
        # Split batch into text and labels
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            # TODO: Update model with texts and labels
            nlp.update([example], sgd=optimizer, losses = losses)
        
    return losses

In [14]:
spacy.util.fix_random_seed(1)
random.seed(1)

optimizer = nlp.begin_training()
train_data = list(zip(train_texts, train_labels))
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])

141.216501016496


In [15]:
nlp.to_disk('saved_spacy_model')

In [27]:
import json

with open('./texts_for_testing/test_text.json', encoding="utf-8") as f:
    test_text = json.load(f)

In [19]:
opinion_of_text = test_text['plain_text']

In [22]:
spacy_doc = nlp(opinion_of_text)

In [24]:
spacy_doc.cats

{'1700': 2.4846047381288372e-05,
 '1800': 6.585433993677725e-07,
 '1900': 4.132466528972145e-06,
 '2000': 0.9999703168869019}

In [26]:
with open('./texts_for_testing/test_text_1.json', encoding='utf-8') as f:
    test_text_1 = json.load(f)

opinion_of_text_01 = test_text_1['plain_text']
spacy_doc_01 = nlp(opinion_of_text_01)
spacy_doc_01.cats

{'1700': 3.817029937636107e-05,
 '1800': 5.630177923876545e-08,
 '1900': 2.2707074549543904e-06,
 '2000': 0.9999594688415527}