In [2]:
from langchain.vectorstores import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.schema import Document
import faiss
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import requests

# Endpointy API
base_url = "https://services.clarin-pl.eu/api/v1/oapi"
models_endpoint = f"{base_url}/models"
completion_endpoint = f"{base_url}/chat/completions"

api_key = ""

# Pobranie listy modeli
headers = {"Authorization": f"Bearer {api_key}"}
response = requests.get(models_endpoint, headers=headers)
if response.status_code == 200:
    models = response.json()
    print("Dostępne modele:", models)
else:
    print("Nie udało się pobrać modeli:", response.text)


Dostępne modele: {'data': [{'id': 'bielik', 'full_name': 'speakleash/Bielik-11B-v2.2-Instruct', 'name': 'speakleash/Bielik-11B-v2.2-Instruct'}, {'id': 'cohere', 'full_name': 'CohereForAI/c4ai-command-r-plus', 'name': 'CohereForAI/c4ai-command-r-plus'}, {'id': 'claude-haiku', 'full_name': 'claude-3-haiku-20240307', 'name': 'claude-3-haiku-20240307'}, {'id': 'gpt-3.5', 'full_name': 'gpt-3.5-turbo-0125', 'name': 'gpt-3.5-turbo-0125'}, {'id': 'gpt-4o', 'full_name': 'gpt-4o-2024-08-06', 'name': 'gpt-4o-2024-08-06'}, {'id': 'claude-sonnet', 'full_name': 'claude-3-5-sonnet-20240620', 'name': 'claude-3-5-sonnet-20240620'}, {'id': 'gpt-4o-mini', 'full_name': 'gpt-4o-mini-2024-07-18', 'name': 'gpt-4o-mini-2024-07-18'}, {'id': 'claude-opus', 'full_name': 'claude-3-opus-20240229', 'name': 'claude-3-opus-20240229'}, {'id': 'llama3.1-8b', 'full_name': 'meta-llama/Llama-3.1-8B-Instruct', 'name': 'meta-llama/Llama-3.1-8B-Instruct'}, {'id': 'llama', 'full_name': 'meta-llama/Meta-Llama-3.1-70B-Instruct'

In [5]:
# Załaduj dane IMDB
imdb_data = load_dataset("stanfordnlp/imdb", split="train[:1000]")
# imdb_data['text'][0]
# imdb_data


In [6]:
# Tokenizer i model
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt).to("cuda")


In [7]:
# Funkcja do generowania osadzeń w partiach (batchach)
def get_embeddings_batch(text_list, batch_size=16):
    all_embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        encoded = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to("cuda")
        with torch.no_grad():
            embeddings = model(**encoded).last_hidden_state[:, 0].cpu().numpy()
        all_embeddings.append(embeddings)
    return np.vstack(all_embeddings)


# Generowanie osadzeń dla tekstów IMDB (przetwarzanie partiami)
texts = imdb_data["text"]
embeddings = get_embeddings_batch(texts)


In [8]:
# Tworzenie FAISS Index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

#dodanie do datasetu
imdb_data = imdb_data.add_column("embeddings", embeddings.tolist())  
# imdb_data.add_faiss_index(column="embeddings")  


In [9]:
# Wyszukiwanie semantyczne z FAISS
query = "Which movie has the best plot twists?"
query_embedding = get_embeddings_batch([query])
distances, indices = index.search(query_embedding, k=5)
print("Najbliższe wyniki:", indices)

Najbliższe wyniki: [[373 268 297 272 285]]


In [10]:
nearest_indices = indices[0]  # Indeksy najbliższych wyników

nearest_reviews = [imdb_data["text"][i] for i in nearest_indices]

print("Najbliższe recenzje filmów:")
for idx, review in enumerate(nearest_reviews, 1):
    print(f"{idx}. {review}\n")


Najbliższe recenzje filmów:
1. You'd better choose Paul Verhoeven's even if you have watched it.

2. The worst movie I have seen in a while. Yeah its fun to fantasize, but if that is what you are looking for, I suggest you see Brewsters Millions. This was just terrible and corny and terrible at being corny. Unless you are five or like terrible movies, don't see this one.

3. This movie is so bad, I knew how it ends right after this little girl killed the first person. Very bad acting very bad plot very bad movie<br /><br />do yourself a favour and DON'T watch it 1/10

4. A young boy comes into a lot of money and promptly begins to live it up. Unfortunately, the man whose money it really is happens to be very bad. He wants his loot back. When he discovers who has the bucks, he begins trying to get it back. He keeps getting foiled by this little kid who is just lucky enough to keep from falling into the evil man's hands. Sounds familiar, I'll bet. Very predictable, not interesting at all

### Langchain

In [11]:
# Funkcja do konwersji tekstów na dokumenty LangChain
def convert_texts_to_documents(texts):
    return [Document(page_content=text) for text in texts]

# Tworzenie FAISS Index dla LangChain
def build_faiss_index(embeddings, texts):
    # Zamiana tekstów na obiekty Document
    documents = convert_texts_to_documents(texts)
    
    # Tworzenie docstore i mapowania ID
    docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})
    index_to_docstore_id = {i: str(i) for i in range(len(documents))}
    
    # Tworzenie FAISS Index
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)  # Tworzenie indeksu wektorowego
    faiss_index.add(np.array(embeddings))      # Dodanie osadzeń
    
    # Zwracanie FAISS index z LangChain
    return FAISS(embedding_function=None, index=faiss_index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)

# Budowa indeksu FAISS
vectorstore = build_faiss_index(embeddings, imdb_data["text"])

# Funkcja do wyszukiwania dokumentów
def search_faiss(query, k=5):
    query_embedding = get_embeddings_batch([query])
    docs = vectorstore.similarity_search_by_vector(query_embedding[0], k=k)
    return docs

# Generacja odpowiedzi z API
def fetch_response_from_api(prompt, model_id="gpt-4o-mini"):
    payload = {
        "model": model_id,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "max_tokens": 300,
    }
    headers = {"Authorization": f"Bearer {api_key}"}
    response = requests.post(completion_endpoint, json=payload, headers=headers)
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    else:
        print("API Error:", response.text)
        return "Error in fetching response from the model."

# Tworzenie promptu
def create_prompt(query, documents):
    prompt_template = """
    Please answer the following question based on the provided document

    Question: {question}
    Retrieved Documents: {documents}

    Answer:"""
    docs_content = "\n".join([doc.page_content for doc in documents])
    return prompt_template.format(question=query, documents=docs_content)

# Główna funkcja RAG
def rag_pipeline(query, k=5):
    retrieved_docs = search_faiss(query, k=k)
    prompt = create_prompt(query, retrieved_docs)
    answer = fetch_response_from_api(prompt)
    return answer


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


# Ogólne pytania

In [62]:
query = "Which are the best horror movies?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The best horror movies mentioned in the document include classics such as "Carrie," "Friday the 13th," "Halloween," "Scream," and "Scary Movie." These films are recommended for their engaging plots, original twists, and ability to provide genuine scares, contrasting significantly with the low-quality films criticized in the text. Other notable classics referenced are "Dracula," "Frankenstein," "Freaks," "The Golem," and "The Ghost Train," which are highlighted for their historical significance and contribution to the horror genre.


In [63]:
query = "What are some classic films everyone should watch?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: Based on the provided document, some classic films that are mentioned and recommended for viewing include:

1. **"Star 80"** - Though not universally praised, it is noted for its intriguing subject matter.
2. **"Paper Moon"** - Recommended as a film that can be watched repeatedly.
3. **"The Last Picture Show"** - Also suggested as a film deserving multiple viewings.
4. **"Animal House"** - A classic sophomoric comedy that is highly regarded.
5. **"Caddyshack"** - Another classic comedy featuring notable performances by Rodney Dangerfield and Bill Murray.

These films represent a mix of genres and are considered significant in their contributions to cinema.


# Abstrakcyjne pytania

In [None]:
query = "Which movies discuss the complexity of human emotions?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The document does not explicitly mention specific movies that discuss the complexity of human emotions. However, it does critique "The Color Purple," suggesting that it attempts to explore themes related to human emotions, particularly through the experiences of the characters in the context of racism, sexism, and relationships. The document criticizes the film for its portrayal of these themes and the way it handles emotional narratives. Therefore, while the document may not provide a clear list of movies, "The Color Purple" can be inferred as a film that engages with complex human emotions, albeit in a controversial manner.


In [None]:
query = "Which movies explore the struggles of family relationships?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The movies that explore the struggles of family relationships based on the provided document include:

1. **The film featuring Rosanna Arquette** - This movie depicts the volatile relationship between a mother and her daughter, who has a history of violence and an obsessive bond with her mother, leading to tragic outcomes.

2. **The film with Dennis Quaid and Arliss Howard** - This film focuses on the estrangement between two brothers, highlighting their childhood differences and the strain in their relationship, with a romantic interest caught in the middle.

3. **The dysfunctional family holiday film** - This movie portrays a dysfunctional family that comes together for the holidays, resulting in chaos and violence, reflecting the underlying tensions in family dynamics.

These films all delve into complex family relationships and the struggles that arise within them.


In [None]:
query = "What are some inspiring movies for students?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The provided document does not contain any inspiring movies for students. Instead, it focuses on critiques of poorly made films, highlighting their shortcomings and lack of originality. If you're looking for inspiring movies for students, consider classics such as "Dead Poets Society," "The Pursuit of Happyness," "Freedom Writers," or "A Beautiful Mind," which explore themes of perseverance, creativity, and personal growth.


# Pytania konkretne/szczegółowe

In [65]:
query = "Which movies are set in space?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The document does not explicitly mention any specific movies set in space. It mainly discusses the poor quality of a movie called "Mysterious Planet" and includes some general commentary about other films without identifying those that are set in space. Therefore, based on the provided content, it is not possible to provide a list of movies set in space.


In [66]:
query = "What movies take place in New York City?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The movie mentioned in the document that takes place in New York City is "New York, I Love You." The text discusses various segments of the film and critiques its portrayal of the city, indicating that it is set primarily in Manhattan.


In [None]:
query = "Which movies have strong female protagonists?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The provided document does not explicitly mention any movies with strong female protagonists. Instead, it critiques various films, highlighting their shortcomings and the portrayal of female characters. If you're looking for films that feature strong female protagonists, you might consider titles like "Wonder Woman," "Mad Max: Fury Road," "The Hunger Games," "Frozen," or "Hidden Figures," which are known for having powerful female leads.


In [None]:
query = "Which movies are set in the 19th century?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The only movie mentioned that is set in the 19th century is "A River Runs Through It."


In [None]:
query = "Which movies are set in the 20th century?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The document does not explicitly mention any specific movies set in the 20th century. However, it does reference the movie "The Shining," which was released in 1980 and is set in the 20th century. Other classic horror films mentioned, such as "Dracula" and "Frankenstein," are also set in the 20th century.


In [None]:
query = "What is the main theme of 'Friday the 13th'?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The main theme of 'Friday the 13th', as inferred from the provided document, revolves around the critique of poor filmmaking in the horror genre. Reviewers express their disappointment with the film's clichéd plot, lackluster acting, and overall execution, suggesting that it fails to deliver the suspense and thrills typically expected from horror movies. The consensus indicates that the film is lacking in originality, creativity, and effective scares, leading to a negative viewing experience.


In [None]:
query = "What is the main plot in 'Scream'?"
answer = rag_pipeline(query)
print("Final Answer:", answer)

Final Answer: The main plot of 'Scream' is not explicitly detailed in the retrieved documents, but based on general knowledge, 'Scream' revolves around a masked killer who targets high school students in a small town, using horror movie tropes to commit murders. The story follows a group of friends, particularly the protagonist, as they navigate the terror while trying to uncover the identity of the killer. The film is known for its self-referential humor and commentary on the horror genre.


**Odpowiedzi na pytania ogólne** - zawierają ogólne rekomendacje filmów, ale mogą być subiektywne i nie zawsze pełne, w zależności od kontekstu i źródła.

**Odpowiedzi na pytania abstrakcyjne** - wiele odpowiedzi wskazuje na brak danych w dokumencie, sugerując filmy powszechnie uznawane za inspirujące, ale wymagające dodatkowego kontekstu.

**Odpowiedzi na pytania szczegółowe** - często brakuje wystarczających danych w źródle, co utrudnia udzielenie dokładnej odpowiedzi. Odpowiedzi mogą być subiektywne lub ogólne.

**Czyli ogolnie** - brak pełnych odpowiedzi wynika z ograniczonego źródła danych. RAG może działać dobrze w przypadku ogólnych pytań, ale wymaga bardziej szczegółowych danych do precyzyjnych odpowiedzi.