# Lab work № 7

In [1]:
import spacy
from spacy.training import Example

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Add a new entity type "MOVIE"
ner = nlp.get_pipe("ner")  # Get the Named Entity Recognition (NER) component
ner.add_label("MOVIE")  # Add the new label to the NER component

# Prepare training data with examples
train_data = [
    ("I watched The Shawshank Redemption yesterday.", {"entities": [(10, 32, "MOVIE")]}),
    ("Have you seen The Godfather trilogy?", {"entities": [(12, 28, "MOVIE")]}),
    ("Avatar is one of the highest-grossing films of all time.", {"entities": [(0, 6, "MOVIE")]}),
    ("The Hobbit series is based on J.R.R. Tolkien's novels.", {"entities": [(4, 26, "MOVIE")]})
]

# Train the model
epochs = 20
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']  # Exclude other pipes
with nlp.disable_pipes(*other_pipes):  # Disable other components to avoid unintended effects
    optimizer = nlp.create_optimizer()
    for i in range(epochs):
        for text, annotations in train_data:
            example = Example.from_dict(nlp.make_doc(text), annotations)  # Create an example
            nlp.update([example], sgd=optimizer)  # Update the model with the example

# Save the trained model
nlp.to_disk("updated_model")




In [2]:
import spacy

# Load the saved model
nlp_updated = spacy.load("updated_model")

# Example texts for entity recognition
texts = [
    "I watched Hobbit and The Godfather movies yesterday.",
    "Avatar and The Godfather are both great movies.",
    
]

# Process the text and print the results
for text in texts:
    doc = nlp_updated(text)
    for ent in doc.ents:
        print(f"Entity: {ent.text}, Type: {ent.label_}")


Entity: Hobbit and The Godfather movies yesterday., Type: MOVIE
Entity: Avatar, Type: MOVIE


In [3]:
import json
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random

with open("movies.json", "r") as f:
    data = json.load(f)

# Initialize spaCy model
nlp = spacy.blank("en")

# Add TextCategorizer
textcat = nlp.add_pipe("textcat")

# Add labels (intents)
textcat.add_label("FindMovies")
textcat.add_label("AnotherTopic")

# Prepare training data from the data in the movies.json file
train_data = []
for dialogue in data:
    for turn in dialogue.get("turns", []):
        utterance = turn.get("utterance", "")
        try:
            intent = turn["frames"][0]["state"].get("active_intent", "")
        except:
            continue
        if utterance and intent:
            cats = {"FindMovies": 1 if intent == "FindMovies" else 0, "AnotherTopic": 1 if intent != "FindMovies" else 0}
            train_data.append((utterance, {"cats": cats}))

# Convert training data to Example objects
train_examples = []
for text, annotations in train_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    train_examples.append(example)

# Train TextCategorizer
nlp.begin_training()
for epoch in range(20):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, losses=losses)
    print("Epoch:", epoch, "Loss:", losses)


Epoch: 0 Loss: {'textcat': 16.61813126443656}
Epoch: 1 Loss: {'textcat': 1.45699253639835}
Epoch: 2 Loss: {'textcat': 0.42873207534197016}
Epoch: 3 Loss: {'textcat': 0.25800743327148834}
Epoch: 4 Loss: {'textcat': 0.44003462200823823}
Epoch: 5 Loss: {'textcat': 0.22017732132993317}
Epoch: 6 Loss: {'textcat': 0.14170518438258217}
Epoch: 7 Loss: {'textcat': 6.334935298198521e-07}
Epoch: 8 Loss: {'textcat': 5.500647026242295e-08}
Epoch: 9 Loss: {'textcat': 3.1111652963672695e-08}
Epoch: 10 Loss: {'textcat': 2.0023922573652193e-08}
Epoch: 11 Loss: {'textcat': 1.3859434004146231e-08}
Epoch: 12 Loss: {'textcat': 1.1497401733235777e-08}
Epoch: 13 Loss: {'textcat': 7.314900104227051e-09}
Epoch: 14 Loss: {'textcat': 5.476195898313521e-09}
Epoch: 15 Loss: {'textcat': 4.303139062347723e-09}
Epoch: 16 Loss: {'textcat': 3.3619002888679955e-09}
Epoch: 17 Loss: {'textcat': 2.633154477905171e-09}
Epoch: 18 Loss: {'textcat': 2.089925247234445e-09}
Epoch: 19 Loss: {'textcat': 1.6386017549212253e-09}


In [4]:
# Тестування моделі на тестових даних
test_cases = [
    "Can you recommend a good comedy movie?",
    "I'm looking for a movie theater near the city center.",
    "What's playing at the cinema downtown?",
    "I need suggestions for family-friendly movies.",
    "Is there a theater showing the Romeo and Juliet?",
    "Where can I watch a romantic movie tonight?",
    "Let's do the homework",
    "I wanna read this book",
]

for test_case in test_cases:
    doc = nlp(test_case)
    predicted_intents = doc.cats
    print("Test case:", test_case)
    print("Predicted intents:", predicted_intents)

Test case: Can you recommend a good comedy movie?
Predicted intents: {'FindMovies': 0.9999258518218994, 'AnotherTopic': 7.416699372697622e-05}
Test case: I'm looking for a movie theater near the city center.
Predicted intents: {'FindMovies': 0.9999977350234985, 'AnotherTopic': 2.2651427116215928e-06}
Test case: What's playing at the cinema downtown?
Predicted intents: {'FindMovies': 0.012687030248343945, 'AnotherTopic': 0.9873129725456238}
Test case: I need suggestions for family-friendly movies.
Predicted intents: {'FindMovies': 0.9999991655349731, 'AnotherTopic': 8.017313462005404e-07}
Test case: Is there a theater showing the Romeo and Juliet?
Predicted intents: {'FindMovies': 0.0108707956969738, 'AnotherTopic': 0.9891291856765747}
Test case: Where can I watch a romantic movie tonight?
Predicted intents: {'FindMovies': 0.9999994039535522, 'AnotherTopic': 5.513458063433063e-07}
Test case: Let's do the homework
Predicted intents: {'FindMovies': 1.9524395611369982e-05, 'AnotherTopic': 