In [1]:
import rdflib
import re
import spacy
import random

In [2]:
graph = rdflib.Graph()
graph.parse("../../../data/14_graph.nt", format="turtle")

<Graph identifier=Nf7c4638fcb3a40e09e13d8d8da367bb4 (<class 'rdflib.graph.Graph'>)>

In [101]:
query = """
PREFIX ns1: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?filmName
WHERE {
  ?film ns1:P31 <http://www.wikidata.org/entity/Q11424> .
  ?film rdfs:label ?filmName .
  FILTER (LANG(?filmName) = "en")
}
LIMIT 10000
"""
  
movie_names = [str(s) for s, in graph.query(query)]


In [102]:
query = """
PREFIX ns1: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?actorName
WHERE {
  ?actor ns1:P106 <http://www.wikidata.org/entity/Q33999> .
  ?actor rdfs:label ?actorName .
}
LIMIT 10000
"""

actor_names = [str(s) for s, in graph.query(query)]


In [111]:
train_amount = int(0.8 * len(actor_names))
movie_names_train, movie_names_dev = movie_names[:train_amount], movie_names[train_amount:]
actor_names_train, actor_names_dev = actor_names[:train_amount], actor_names[train_amount:]

len(movie_names_dev)

2000

In [114]:
print(actor_names_train[:10])
print(movie_names_train[:10])

['Viktor Krištof', 'Oleg Kapanets', 'Alexander Geringas', 'Giovanni Korporaal', 'Jürgen Knieper', 'Mel Jarnson', 'Carmen Beato', 'Martina Cariddi', 'Atli Oskar Fjalarsson', 'Cyril Morin']
['Jan Dara', 'Moondram Pirai', "Buffalo Bill and the Indians, or Sitting Bull's History Lesson", 'What We Wanted', 'Wanted: Dead or Alive', 'Linger', 'Eastern Condors', 'Amerika', 'Bukowski: Born into This', 'Fatal Move']


In [87]:
closed_questions_dev = [
    "Who is the director of <Inception>?",
    "Who directed <Pulp Fiction>?",
    "Tell me who directed <The Shape of Water>?",
    "Who is the filmmaker of <Get Out>?",
    "Which director took charge of <The Grand Budapest Hotel>?",
    "Whos the person behind the direction of <Moonlight>?",
    "Who helmed the direction of <Mad Max: Fury Road>?",
    "Who is the screenwriter of <Eternal Sunshine of the Spotless Mind>?",
    "Who wrote the screenplay for <Her>?",
    "Which writer was responsible for <The Social Network>?",
    "Whos credited as the screenwriter for <Juno>?",
    "What is the MPAA film rating of <Joker>?",
    "Whats the censorship rating of <The Dark Knight> by MPAA?",
    "What is the genre of <Blade Runner 2049>?",
    "Which genre does <Parasite> belong to?",
]

recommendation_questions_dev = [
    "Recommend movies similar to <The Matrix>.",
    "What are some films similar to <Interstellar>?",
    "Can you suggest movies that are like <Arrival>?",
    "If I enjoyed <Blade Runner 2049>, what other movies would you suggest?",
    "Which films would you recommend if I loved <The Lord of the Rings>?",
    "Given that I like <The Godfather>, <Goodfellas>, <Casino>, can you recommend some movies?",
    "Based on my liking for <Fight Club>, <Se7en>, <Gone Girl>, suggest some films.",
    "I enjoy watching <Amelie>, <La La Land>, and <The Grand Budapest Hotel>. What else would you recommend?",
    "Recommend movies like <Alien>, <Prometheus>, and <The Martian>",
    "Can you suggest films similar to <Saving Private Ryan>, <Platoon>, and <Full Metal Jacket>?",
    "Based on my preference for <Forrest Gump>, <The Shawshank Redemption>, and <The Green Mile>, what else should I watch?",
    "I like <The Silence of the Lambs>, so recommend similar movies",
    "If I enjoyed <The Shining>, what other films would you suggest?",
    "Movies similar to <Psycho> are what I'm looking for. Recommendations?",
]

when_questions_dev = [
    "When was <Titanic> released?",
    "On which date was <The Lion King> released?",
    "Can you tell me when <Schindler's List> was released?",
    "Which day did <Jurassic Park> hit the theaters?",
    "What is the release date of <Avatar>?",
    "Tell me the release date of <The Avengers>.",
    "When exactly was <Harry Potter and the Philosopher's Stone> made available to the public?",
    "Which year was <Star Wars: Episode IV - A New Hope> released?",
    "In what year did <Back to the Future> come out?",
    "When did <Indiana Jones and the Raiders of the Lost Ark> come out?",
    "On which day was <The Empire Strikes Back> first shown?",
    "When was the premiere of <The Godfather>?",
]

multimedia_questions_dev = [
    "Show me a picture of <Leonardo DiCaprio>.",
    "Can you display a photo of <Jennifer Lawrence>?",
    "Id like to see a photograph of <Brad Pitt>.",
    "Provide me with a picture of <Meryl Streep>.",
    "What does <Denzel Washington> look like?",
    "Can you describe <Angelina Jolie's> appearance?",
    "Tell me about <Tom Hanks's> physical appearance.",
    "Let me know what <Charlize Theron> looks like.",
    "Give me an idea of <Johnny Depps> looks.",
    "Provide a description or image of <Natalie Portman>.",
]

In [100]:
movie_questions = closed_questions_dev + recommendation_questions_dev + when_questions_dev 

In [119]:
def replace_quotes(type_question: list, type_entity: list):
    questions = []
    for item in type_entity:
        random_question = type_question[random.randint(0, len(type_question) - 1)]
        movies_to_replace = re.findall(r"<[^<]*>", random_question)
        if len(movies_to_replace) > 1:
            for i, movie in enumerate(movies_to_replace):
                if i == 0:
                    q = random_question.replace(movie, f"<{item}>")
                else:
                    q = q.replace(movie, f"<{type_entity[random.randint(0, len(type_entity) - 1)]}>")
            questions.append(q)
        else:
            questions.append(re.sub(r"<[^<]*<", f"<{item}>", random_question))
    return questions

questions_multimedia_train = replace_quotes(multimedia_questions_dev, movie_names_train)
questions_movies_train = replace_quotes(movie_questions, movie_names_train)
questions_multimedia_dev = replace_quotes(multimedia_questions_dev, movie_names_dev)
questions_movies_dev = replace_quotes(movie_questions, movie_names_dev)
print(len(questions_multimedia_train))
print(len(questions_multimedia_dev))

8000
2000


In [120]:
def find(s, ch1="<", ch2=">"):
    start_indices = [i for i, ltr in enumerate(s) if ltr == ch1]
    end_indices = [i for i, ltr in enumerate(s) if ltr == ch2]
    return start_indices, end_indices


def to_spacy(questions, entity):
    training_data = []
    for q in questions:
        entities = []
        start_indices, end_indices = find(q)
        if len(start_indices) > 1:
            for i in range(len(start_indices)):
                # Calculate the adjustment for each entity
                start_adjustment = start_indices[i] - (2 * i)  # Each entity has 2 symbols < and >
                end_adjustment = end_indices[i] - (2 * (i + 1)) + 1  # Additional -1 for the closing '>'
                entities.append((start_adjustment, end_adjustment, entity))
        
        else:
            start_adjustment = start_indices[0]
            end_adjustment = end_indices[0] - 1
            entities.append((start_adjustment, end_adjustment, entity))

        q = q.replace("<", "").replace(">", "")
        training_data.append((q, {"entities": entities}))
    
    return training_data

training_data = to_spacy(questions_movies_train, "MOVIE")
training_data += to_spacy(questions_multimedia_train, "ACTOR")

dev_data = to_spacy(questions_movies_dev, "MOVIE")
dev_data += to_spacy(questions_multimedia_dev, "ACTOR")




In [125]:
print(training_data[:10])
print(len(training_data))
print(dev_data[:10])
print(len(dev_data))

[('Whats the censorship rating of The Dark Knight by MPAA?', {'entities': [(31, 46, 'MOVIE')]}), ('Given that I like Moondram Pirai, The Young Offenders, Zero Tolerance, can you recommend some movies?', {'entities': [(18, 32, 'MOVIE'), (34, 53, 'MOVIE'), (55, 69, 'MOVIE')]}), ('When was Titanic released?', {'entities': [(9, 16, 'MOVIE')]}), ('Based on my preference for What We Wanted, Vengo, and Exodus, what else should I watch?', {'entities': [(27, 41, 'MOVIE'), (43, 48, 'MOVIE'), (54, 60, 'MOVIE')]}), ('What is the release date of Avatar?', {'entities': [(28, 34, 'MOVIE')]}), ('When did Indiana Jones and the Raiders of the Lost Ark come out?', {'entities': [(9, 54, 'MOVIE')]}), ("Movies similar to Psycho are what I'm looking for. Recommendations?", {'entities': [(18, 24, 'MOVIE')]}), ('Who directed Pulp Fiction?', {'entities': [(13, 25, 'MOVIE')]}), ('Whos the person behind the direction of Moonlight?', {'entities': [(40, 49, 'MOVIE')]}), ('If I enjoyed Blade Runner 2049, what other 

In [128]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# We will use the previously formatted TRAIN_DATA to create the .spacy file
# The nlp object is created as a blank English model
nlp = spacy.load("en_core_web_sm")

# The DocBin is used to store the documents
db = DocBin()

def to_spacy(data, name):
    x = 0
    for text, annotations in data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations["entities"]:
            span = doc.char_span(start, end, label=label)
            print(span)
            if span is not None:
                ents.append(span)
            else:
                print(text)
                x += 1

        doc.ents = ents
        db.add(doc)

    print(x)
    spacy_file_path = f"./{name}.spacy"
    db.to_disk(spacy_file_path)
    return spacy_file_path

to_spacy(training_data, "train_small")
to_spacy(dev_data, "dev_small")

OSError: [E050] Can't find model 'en_core_web_trf'. It doesn't seem to be a Python package or a valid path to a data directory.

In [2]:
import spacy

nlp1 = spacy.load("../../../models/NER_model-best/")  # load the best model
# input sample text
doc = nlp1("Who is the director of Good Will Hunting?")



Who is the director of Good Will Hunting?


In [15]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Good Will Hunting MOVIE


In [132]:
TEST_DATA = [
    ("Who starred in Titanic?"),
    ("What film did Leonardo DiCaprio act in?"),
    ("Can you name the director of The Shawshank Redemption?"),
    ("Which movie featured Brad Pitt?"),
    ("Who directed Avatar?"),
    ("Who played the lead in Inception?"),
    ("What<s the latest film by Quentin Tarantino?"),
    ("Name a movie starring Natalie Portman."),
    ("Who was the lead actor in Blade Runner 2049?"),
    ("Which film did Tom Hanks act in?"),
    ("What is the genre of Pulp Fiction?"),
    ("Who wrote the screenplay for The Matrix?"),
]

for text in TEST_DATA:
    doc = nlp1(text)
    spacy.displacy.render(doc, style="ent", jupyter=True)

: 