In [1]:
closed_questions_dev = [
    "Who is the director of 'Inception'?",
    "Who directed 'Pulp Fiction'?",
    "Tell me who directed 'The Shape of Water'?",
    "Who is the filmmaker of 'Get Out'?",
    "Which director took charge of 'The Grand Budapest Hotel'?",
    "Who's the person behind the direction of 'Moonlight'?",
    "Who helmed the direction of 'Mad Max: Fury Road'?",
    "Who is the screenwriter of 'Eternal Sunshine of the Spotless Mind'?",
    "Who wrote the screenplay for 'Her'?",
    "Which writer was responsible for 'The Social Network'?",
    "Who's credited as the screenwriter for 'Juno'?",
    "What is the MPAA film rating of 'Joker'?",
    "What's the censorship rating of 'The Dark Knight' by MPAA?",
    "What is the genre of 'Blade Runner 2049'?",
    "Which genre does 'Parasite' belong to?",
]

recommendation_questions_dev = [
    "Recommend movies similar to 'The Matrix'.",
    "What are some films similar to 'Interstellar'?",
    "Can you suggest movies that are like 'Arrival'?",
    "If I enjoyed 'Blade Runner 2049', what other movies would you suggest?",
    "Which films would you recommend if I loved 'The Lord of the Rings'?",
    "Given that I like 'The Godfather', 'Goodfellas', 'Casino', can you recommend some movies?",
    "Based on my liking for 'Fight Club', 'Se7en', 'Gone Girl', suggest some films.",
    "I enjoy watching 'Amelie', 'La La Land', and 'The Grand Budapest Hotel'. What else would you recommend?",
    "Recommend movies like 'Alien', 'Prometheus', and 'The Martian'",
    "Can you suggest films similar to 'Saving Private Ryan', 'Platoon', and 'Full Metal Jacket'?",
    "Based on my preference for 'Forrest Gump', 'The Shawshank Redemption', and 'The Green Mile', what else should I watch?",
    "I like 'The Silence of the Lambs', so recommend similar movies",
    "If I enjoyed 'The Shining', what other films would you suggest?",
    "Movies similar to 'Psycho' are what I'm looking for. Recommendations?",
]

when_questions_dev = [
    "When was 'Titanic' released?",
    "On which date was 'The Lion King' released?",
    "Can you tell me when 'Schindler's List' was released?",
    "Which day did 'Jurassic Park' hit the theaters?",
    "What is the release date of 'Avatar'?",
    "Tell me the release date of 'The Avengers'.",
    "When exactly was 'Harry Potter and the Philosopher's Stone' made available to the public?",
    "Which year was 'Star Wars: Episode IV - A New Hope' released?",
    "In what year did 'Back to the Future' come out?",
    "When did 'Indiana Jones and the Raiders of the Lost Ark' come out?",
    "On which day was 'The Empire Strikes Back' first shown?",
    "When was the premiere of 'The Godfather'?",
]

multimedia_questions_dev = [
    "Show me a picture of 'Leonardo DiCaprio'.",
    "Can you display a photo of 'Jennifer Lawrence'?",
    "I'd like to see a photograph of 'Brad Pitt'.",
    "Provide me with a picture of 'Meryl Streep'.",
    "What does 'Denzel Washington' look like?",
    "Can you describe 'Angelina Jolie's' appearance?",
    "Tell me about 'Tom Hanks's'physical appearance.",
    "Let me know what 'Charlize Theron' looks like.",
    "Give me an idea of 'Johnny Depp's' looks.",
    "Provide a description or image of 'Natalie Portman'.",
]



In [2]:
def convert_to_spacy_format(questions, entity_type):
    """
    Convert a list of questions to spaCy training data format.
    :param questions: A list of questions.
    :param entity_type: The type of entity to be extracted (e.g., 'MOVIE', 'ACTOR').
    :return: A list of tuples in spaCy training data format.
    """
    training_data = []
    for question in questions:
        # Find the start and end indices of the entity (movie or actor name)
        start_index = question.find("'")
        end_index = question.find("'", start_index + 1)
        # Remove the quotes from the question
        text = question.replace("'", "")
        if start_index != -1 and end_index != -1:
            # Adjust indices after removing quotes
            start_index -= question.count("'", 0, start_index)
            end_index -= question.count("'", 0, end_index)
            # Append to the training data
            training_data.append((text, {'entities': [(start_index, end_index, entity_type)]}))
        else:
            # No entity found
            training_data.append((text, {'entities': []}))

    return training_data



In [3]:
questions_movie = closed_questions_dev + recommendation_questions_dev + when_questions_dev
questions_actor = multimedia_questions_dev
training_data = convert_to_spacy_format(questions_movie, 'MOVIE')
training_data += convert_to_spacy_format(questions_actor, 'ACTOR')


In [4]:
training_data

[('Who is the director of Inception?', {'entities': [(23, 32, 'MOVIE')]}),
 ('Who directed Pulp Fiction?', {'entities': [(13, 25, 'MOVIE')]}),
 ('Tell me who directed The Shape of Water?',
  {'entities': [(21, 39, 'MOVIE')]}),
 ('Who is the filmmaker of Get Out?', {'entities': [(24, 31, 'MOVIE')]}),
 ('Which director took charge of The Grand Budapest Hotel?',
  {'entities': [(30, 54, 'MOVIE')]}),
 ('Whos the person behind the direction of Moonlight?',
  {'entities': [(3, 40, 'MOVIE')]}),
 ('Who helmed the direction of Mad Max: Fury Road?',
  {'entities': [(28, 46, 'MOVIE')]}),
 ('Who is the screenwriter of Eternal Sunshine of the Spotless Mind?',
  {'entities': [(27, 64, 'MOVIE')]}),
 ('Who wrote the screenplay for Her?', {'entities': [(29, 32, 'MOVIE')]}),
 ('Which writer was responsible for The Social Network?',
  {'entities': [(33, 51, 'MOVIE')]}),
 ('Whos credited as the screenwriter for Juno?',
  {'entities': [(3, 38, 'MOVIE')]}),
 ('What is the MPAA film rating of Joker?', {'enti

In [12]:
test_data = [
    ("Who starred in Titanic?", {'entities': [(16, 23, 'MOVIE')]}),
    ("What film did Leonardo DiCaprio act in?", {'entities': [(9, 26, 'ACTOR')]}),
    ("Can you name the director of The Shawshank Redemption?", {'entities': [(30, 53, 'MOVIE')]}),
    ("Which movie featured Brad Pitt?", {'entities': [(24, 33, 'ACTOR')]}),
    ("Who directed Avatar?", {'entities': [(13, 19, 'MOVIE')]}),
    ("Who played the lead in Inception?", {'entities': [(25, 34, 'MOVIE')]}),
    ("What's the latest film by Quentin Tarantino?", {'entities': [(27, 43, 'ACTOR')]}),
    ("Name a movie starring Natalie Portman.", {'entities': [(24, 39, 'ACTOR')]}),
    ("Who was the lead actor in Blade Runner 2049?", {'entities': [(30, 47, 'MOVIE')]}),
    ("Which film did Tom Hanks act in?", {'entities': [(13, 22, 'ACTOR')]}),
    ("What is the genre of Pulp Fiction?", {'entities': [(19, 31, 'MOVIE')]}),
    ("Who wrote the screenplay for The Matrix?", {'entities': [(32, 42, 'MOVIE')]}),
]


In [13]:
test_data

[('Who starred in Titanic?', {'entities': [(16, 23, 'MOVIE')]}),
 ('What film did Leonardo DiCaprio act in?', {'entities': [(9, 26, 'ACTOR')]}),
 ('Can you name the director of The Shawshank Redemption?',
  {'entities': [(30, 53, 'MOVIE')]}),
 ('Which movie featured Brad Pitt?', {'entities': [(24, 33, 'ACTOR')]}),
 ('Who directed Avatar?', {'entities': [(13, 19, 'MOVIE')]}),
 ('Who played the lead in Inception?', {'entities': [(25, 34, 'MOVIE')]}),
 ("What's the latest film by Quentin Tarantino?",
  {'entities': [(27, 43, 'ACTOR')]}),
 ('Name a movie starring Natalie Portman.', {'entities': [(24, 39, 'ACTOR')]}),
 ('Who was the lead actor in Blade Runner 2049?',
  {'entities': [(30, 47, 'MOVIE')]}),
 ('Which film did Tom Hanks act in?', {'entities': [(13, 22, 'ACTOR')]}),
 ('What is the genre of Pulp Fiction?', {'entities': [(19, 31, 'MOVIE')]}),
 ('Who wrote the screenplay for The Matrix?',
  {'entities': [(32, 42, 'MOVIE')]})]

In [24]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# We will use the previously formatted TRAIN_DATA to create the .spacy file
# The nlp object is created as a blank English model
nlp = spacy.load("en_core_web_sm")

# The DocBin is used to store the documents
db = DocBin()


for text, annotations in training_data:
    # Create a spacy document from the text
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations['entities']:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Save the DocBin to a .spacy file
spacy_file_path = "./train.spacy"
db.to_disk(spacy_file_path)

spacy_file_path  # Returning the file path to the user

'./train.spacy'

In [41]:
nlp1 = spacy.load("output/model-last/")  # load the best model
# input sample text
doc = nlp1("Who is the director of Good Will Hunting?")

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [34]:
TEST_DATA = [
    ("Who starred in Titanic?", {'entities': [(16, 23, 'MOVIE')]}),
    ("What film did Leonardo DiCaprio act in?", {'entities': [(9, 26, 'ENTITY')]}),
    ("Can you name the director of The Shawshank Redemption?", {'entities': [(30, 53, 'MOVIE')]}),
    ("Which movie featured Brad Pitt?", {'entities': [(24, 33, 'ACTOR')]}),
    ("Who directed Avatar?", {'entities': [(13, 19, 'MOVIE')]}),
    ("Who played the lead in Inception?", {'entities': [(25, 34, 'MOVIE')]}),
    ("What's the latest film by Quentin Tarantino?", {'entities': [(27, 43, 'ACTOR')]}),
    ("Name a movie starring Natalie Portman.", {'entities': [(24, 39, 'ACTOR')]}),
    ("Who was the lead actor in Blade Runner 2049?", {'entities': [(30, 47, 'MOVIE')]}),
    ("Which film did Tom Hanks act in?", {'entities': [(13, 22, 'ACTOR')]}),
    ("What is the genre of Pulp Fiction?", {'entities': [(19, 31, 'MOVIE')]}),
    ("Who wrote the screenplay for The Matrix?", {'entities': [(32, 42, 'MOVIE')]}),
]

for text, _ in TEST_DATA:
    doc = nlp1(text)
    spacy.displacy.render(doc, style="ent", jupyter=True)