In [2]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# API key never has to be hardcoded in the notebook.
load_dotenv()

# os.getenv returns None when the variable is unset; nothing here checks for that.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# This is the YouTube video we're going to use.
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=F1N1TpnR5mI&ab_channel=Three-EyedRaven"

In [3]:
from langchain_openai.chat_models import ChatOpenAI

# Chat model used by the question-answering chains built further down.
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

In [7]:
from langchain_core.output_parsers import StrOutputParser

# Parser placed at the end of each chain so the result is a plain string.
parser = StrOutputParser()

# Minimal chain: the model's output is piped straight into the string parser.
chain = model | parser


In [None]:
import tempfile
import whisper
from pytube import YouTube


# Transcribe the video with Whisper, but only when no transcription file exists
# yet: downloading and transcribing are both slow.
if not os.path.exists("got_transcription.txt"):
    youtube = YouTube(YOUTUBE_VIDEO)
    audio = youtube.streams.filter(only_audio=True).first()
    if audio is None:
        # .first() yields None when no stream matches the filter; fail with a
        # clear message instead of an AttributeError on .download().
        raise RuntimeError(f"No audio-only stream found for {YOUTUBE_VIDEO}")

    # Let's load the base model. This is not the most accurate
    # model but it's fast.
    whisper_model = whisper.load_model("base")

    with tempfile.TemporaryDirectory() as tmpdir:
        # The audio file only needs to live until it has been transcribed.
        audio_path = audio.download(output_path=tmpdir)
        transcription = whisper_model.transcribe(audio_path, fp16=False)["text"].strip()

    # Written as UTF-8 explicitly so the file matches the UTF-8 readers
    # (load_transcript) used later in the notebook, whatever the platform default.
    with open("got_transcription.txt", "w", encoding="utf-8") as file:
        file.write(transcription)

In [10]:
import re

# Read the raw, timestamped transcript.
# NOTE(review): assumes timestamp.txt is UTF-8 encoded — confirm.
with open('timestamp.txt', 'r', encoding='utf-8') as file:
    transcript = file.read()

# Matches timestamps such as 0:05, 00:05 or 00:00:05, optionally wrapped in
# square brackets (e.g., [00:00:05]).
timestamp_pattern = r'\[?\d{1,2}:\d{2}(:\d{2})?\]?'

# Remove all timestamps from the transcript
cleaned_transcript = re.sub(timestamp_pattern, '', transcript)

# Collapse the whitespace runs and newlines left behind by the removal
cleaned_transcript = re.sub(r'\s+', ' ', cleaned_transcript).strip()

# Save the cleaned transcript, replacing any existing file. UTF-8 is explicit so
# the file matches the encoding the load_transcript() helpers read it with.
with open('got_transcription.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_transcript)




In [12]:
import re
import os

# Read the raw, timestamped transcript.
# NOTE(review): assumes timestamp.txt is UTF-8 encoded — confirm.
with open('timestamp.txt', 'r', encoding='utf-8') as file:
    transcript = file.read()

# Matches timestamps such as 0:05, 00:05 or 00:00:05, optionally wrapped in
# square brackets (e.g., [00:00:05]).
timestamp_pattern = r'\[?\d{1,2}:\d{2}(:\d{2})?\]?'

# Remove all timestamps from the transcript
cleaned_transcript = re.sub(timestamp_pattern, '', transcript)

# Collapse the whitespace runs and newlines left behind by the removal
cleaned_transcript = re.sub(r'\s+', ' ', cleaned_transcript).strip()

# Save the cleaned transcript unless the output file already contains it.
output_file = 'got_transcription.txt'
if os.path.exists(output_file):
    with open(output_file, 'r', encoding='utf-8') as file:
        existing_transcript = file.read()
else:
    existing_transcript = ''

if not existing_transcript.strip():
    # Missing or blank file: write the transcript on its own. Appending to an
    # empty file would leave a stray leading newline in the saved text.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(cleaned_transcript)
elif cleaned_transcript not in existing_transcript:
    # The file holds other content: add the transcript on a new line.
    with open(output_file, 'a', encoding='utf-8') as file:
        file.write('\n' + cleaned_transcript)


In [14]:
# Load the saved transcription back into memory.
with open("got_transcription.txt") as file:
    transcription = file.read()

# Preview the first 100 characters as a sanity check.
transcription[:100]

"\nRobert's Rebellion also known as The War of the usurper marked a significant revolt against House T"

In [15]:
# Demonstration of a failure: the chain is invoked with a dict. The exception is
# caught and printed rather than raised, so the notebook keeps running; the
# printed message below shows that this chain rejects dict input.
try:
    chain.invoke({
        "context": transcription,
        "question": "Is reading papers a good idea?"
    })
except Exception as e:
    print(e)

Invalid input type <class 'dict'>. Must be a PromptValue, str, or list of BaseMessages.


In [17]:
from langchain_community.document_loaders import TextLoader

# Load the transcription file as LangChain documents.
loader = TextLoader("got_transcription.txt")
text_documents = loader.load()
# Display what was loaded.
text_documents



In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the transcript into overlapping chunks (chunk_size=1000, chunk_overlap=20).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
# Only the first five chunks are kept, so everything indexed from `documents`
# covers just the beginning of the transcript.
documents = text_splitter.split_documents(text_documents)[:5]

In [23]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model used to index the chunks.
# NOTE(review): no key is passed here, unlike the ChatOpenAI call; presumably it is
# read from the OPENAI_API_KEY environment variable — confirm.
embeddings = OpenAIEmbeddings()

In [26]:
from langchain_community.vectorstores import DocArrayInMemorySearch
# Build an in-memory vector store from the transcript chunks and their embeddings.
vectorstore2 = DocArrayInMemorySearch.from_documents(documents, embeddings)

In [40]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.prompts import ChatPromptTemplate

# Prompt text with two placeholders, {context} and {question}; it tells the model
# to answer "I don't know" when the context does not contain the answer.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# Retrieval chain: the input question is passed through unchanged as "question"
# and also given to the in-memory retriever to fill "context"; the filled prompt
# then goes through the model and the string parser.
chain = (
    {"context": vectorstore2.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)
chain.invoke("is the mad king dead or alive at the end")

'The Mad King is dead at the end.'

In [32]:
from langchain_pinecone import PineconeVectorStore

# Name of the Pinecone index to write to.
# NOTE(review): presumably the index must already exist and the Pinecone
# credentials come from the environment — confirm.
index_name = "rag"

# Embed the transcript chunks and store them in the Pinecone index.
pinecone = PineconeVectorStore.from_documents(
    documents, embeddings, index_name=index_name
)

  from tqdm.autonotebook import tqdm


In [33]:
# Query the Pinecone-backed store directly and show at most the first three hits.
pinecone.similarity_search("What is the rebellion about")[:3]

[Document(page_content="Robert's Rebellion also known as The War of the usurper marked a significant revolt against House Targaryen that took place about 17 years before the war of the five Kings for almost a year the Seven Kingdoms bore witness to this Monumental battle culminating in the eclipse of the targaryan Dynasty and the rise of Robert baran's reign in this video we will be covering the complete story of Robert's Rebellion from the birth of rhaegar targar to the Mad King's death and if you like everything related to the Game of Thrones universe and The Song of Ice and Fire don't forget to subscribe to this channel welcome to the three-eyed Raven before we begin this video we would like to thank our loyal audience who supports with their views your continued support means a lot to us if you would like to support our Channel even more as we continue to create the content we all love there are two additional ways to do so by placing an order in our merchandise store or by purchas

In [42]:
# Rebuild the retrieval chain, this time with the Pinecone store as the retriever.
# `prompt`, `model` and `parser` are reused from earlier cells, so `model` must
# still be the chat model when this cell runs.
chain = (
    {"context": pinecone.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)

chain.invoke("who is regar")

'Rhaegar is a character mentioned in the context provided. He is described as an emblem of hope and tragedy for House Targaryen, born amidst tragedy at Summer Hall, and believed by some to be the prince that was promised.'

In [37]:
# import openai
# import requests
# import numpy as np
# from sentence_transformers import SentenceTransformer, util

# # # Load pre-existing prompts from the danielmiessler/fabric repository
# # def load_prompts():
# #     response = requests.get("https://github.com/danielmiessler/fabric/blob/main/patterns/extract_wisdom/system.md")
# #     if response.status_code == 200:
# #         return response.text
# #     else:
# #         raise Exception("Failed to load prompts from the repository")
    
# def load_prompts():
#     response = requests.get("https://github.com/danielmiessler/fabric/blob/main/patterns/extract_wisdom/system.md")
#     if response.status_code == 200:
#     # Access the content of the response using the .text attribute (without parentheses)
#         content = response.text
#     # Now you can work with the content, for example, you can parse the Markdown content
#     # Or you can simply return the content
#         return content
#     else:
#             raise Exception("Failed to load prompts from the repository")


# # # Use semantic search to find relevant prompts
# # def semantic_search(prompts, query, model):
# #     prompt_texts = [prompt['text'] for prompt in prompts]
# #     query_embedding = model.encode(query)
# #     prompt_embeddings = model.encode(prompt_texts)
# #     scores = util.pytorch_cos_sim(query_embedding, prompt_embeddings)[0]
# #     top_k = np.argsort(scores, axis=0)[-5:][::-1]
# #     return [prompts[idx] for idx in top_k]

# # def semantic_search(prompts, query, model):
# #     # Split the string into individual prompts
# #     prompt_texts = prompts.split('\n')
# #     query_embedding = model.encode(query)
# #     prompt_embeddings = model.encode(prompt_texts)
# #     # Perform semantic search and return relevant prompts
# #     # (implementation of semantic search goes here)

# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# def semantic_search(prompts, query, model):
#     # Encode the query
#     query_embedding = model.encode(query).reshape(1, -1)
    
#     # Encode the prompts
#     prompt_embeddings = np.array([model.encode(prompt).flatten() for prompt in prompts])
    
#     # Compute cosine similarity between query and prompts
#     similarities = cosine_similarity(query_embedding, prompt_embeddings)
    
#     # Sort prompts by similarity score (descending order)
#     sorted_indices = np.argsort(similarities[0])[::-1]
    
#     # Return relevant prompts sorted by similarity score
#     relevant_prompts = [prompts[i] for i in sorted_indices]
    
#     return relevant_prompts


# # Generate multiple prompt options
# def generate_prompts(objective, scenarios, model):
#     prompts = load_prompts()
#     generated_prompts = []
    
#     for scenario in scenarios:
#         query = f"{objective}. Scenario: {scenario}"
#         relevant_prompts = semantic_search(prompts, query, model)
#         generated_prompts.append(relevant_prompts)
    
#     return generated_prompts


In [39]:
import requests
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import openai

def load_prompts():
    """Download the ``extract_wisdom`` system prompt from the fabric repository.

    Returns:
        str: The Markdown text of the prompt file.

    Raises:
        Exception: If the server answers with any status other than 200.
    """
    # Fetch from the raw-content host: the github.com/.../blob/... address serves
    # the HTML file-viewer page, not the Markdown file itself.
    url = "https://raw.githubusercontent.com/danielmiessler/fabric/main/patterns/extract_wisdom/system.md"
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise Exception("Failed to load prompts from the repository")
    return response.text

def load_transcript(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8', mode='r') as source:
        return source.read()

# Load the transcript from got_transcription.txt
transcript_file_path = 'got_transcription.txt'  
transcript = load_transcript(transcript_file_path)

def semantic_search(prompts, query, model):
    """Rank prompts by cosine similarity to a query, most similar first.

    Args:
        prompts: A sequence of prompt strings, or one string, in which case each
            non-blank line is treated as a separate prompt.
        query: The text the prompts are compared against.
        model: Any object with an ``encode(text)`` method returning a numeric
            vector for that text.

    Returns:
        list: The prompts ordered from most to least similar to ``query``;
        an empty list when there are no prompts.
    """
    if isinstance(prompts, str):
        # Iterating over a str yields single characters, which would embed and
        # rank letters instead of prompts; split it into lines instead.
        prompts = [line for line in prompts.splitlines() if line.strip()]
    else:
        prompts = list(prompts)
    if not prompts:
        return []

    # Encode the query and every prompt as flat float vectors
    query_embedding = np.asarray(model.encode(query), dtype=float).reshape(-1)
    prompt_embeddings = np.array(
        [np.asarray(model.encode(prompt), dtype=float).reshape(-1) for prompt in prompts]
    )

    # Cosine similarity = dot product divided by the product of the norms.
    # A zero-length vector has a zero dot product, so dividing by 1 scores it 0.
    dots = prompt_embeddings @ query_embedding
    norms = np.linalg.norm(prompt_embeddings, axis=1) * np.linalg.norm(query_embedding)
    similarities = dots / np.where(norms > 0, norms, 1.0)

    # Sort prompts by similarity score (descending order)
    sorted_indices = np.argsort(similarities)[::-1]
    return [prompts[i] for i in sorted_indices]

# Generate multiple prompt options
def generate_prompts(transcript, objective, scenarios, model):
    """Rank the downloaded prompt lines once per scenario.

    Args:
        transcript: Accepted for interface compatibility; not used.
        objective: Goal text placed at the start of every search query.
        scenarios: Iterable of scenario descriptions, one query per scenario.
        model: Embedding model handed through to ``semantic_search``.

    Returns:
        list: One ranked list of prompts per scenario, in scenario order.
    """
    loaded = load_prompts()
    if isinstance(loaded, str):
        # load_prompts() returns one block of text. Passing it on as-is would
        # make the search iterate over single characters, so split it into
        # non-blank lines and rank those.
        candidates = [line for line in loaded.splitlines() if line.strip()]
    else:
        candidates = list(loaded)

    generated_prompts = []
    for scenario in scenarios:
        query = f"{objective}. Scenario: {scenario}"
        generated_prompts.append(semantic_search(candidates, query, model))

    return generated_prompts

In [40]:
# Load prompts from the GitHub repository.
# load_prompts() returns the response body as a single string, not a list.
prompts = load_prompts()

In [41]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence-embedding model.
# NOTE(review): this rebinds `model`, which an earlier cell bound to the ChatOpenAI
# chat model. Re-running a chain-building cell after this one would put the
# SentenceTransformer into the chain instead — consider a distinct name.
model = SentenceTransformer('all-MiniLM-L6-v2')




In [42]:
# Goal text that generate_prompts() places at the start of every search query.
objective = "To learn as much as possible about the rebellion"
# One search query is built per scenario.
# NOTE(review): "explaning" below is misspelled; it is part of the query text.
scenarios = [
    "providing answers about the different houses",
    "explaning why the rebellion took place",
    "telling who did what during the rebellion"
]


In [None]:
# Rank prompts for every scenario. generate_prompts() calls load_prompts(), so
# this cell performs a network request.
generated_prompts = generate_prompts(transcript, objective, scenarios, model)


In [64]:
import os
import glob

def read_prompts_from_folder(folder_path):
    """Read every ``.md`` file under ``folder_path``, including subfolders.

    Args:
        folder_path: Directory to search.

    Returns:
        list: The stripped text of each Markdown file, ordered by file path.
        Empty when the folder does not exist or contains no ``.md`` files.
    """
    # Use glob to find all .md files in the folder and subfolders
    file_pattern = os.path.join(folder_path, '**', '*.md')
    # glob returns paths in filesystem order, which varies between machines;
    # sorting makes the result reproducible from run to run.
    prompt_files = sorted(glob.glob(file_pattern, recursive=True))

    prompts = []
    for file_path in prompt_files:
        with open(file_path, 'r', encoding='utf-8') as file:
            prompts.append(file.read().strip())

    return prompts

def display_prompts(prompts):
    """Print a heading, then each prompt followed by a 50-dash rule."""
    rule = "-" * 50
    print("Displaying all prompts:")
    for text in prompts:
        # One call emits the prompt and the rule on separate lines.
        print(text, rule, sep="\n")

def search_prompts(prompts, keyword):
    """Print every prompt containing ``keyword`` (case-insensitive), each followed by a rule."""
    print(f"Searching for '{keyword}' in prompts:")
    # Lazy filter: prompts are tested one at a time, in their original order.
    matches = (text for text in prompts if keyword.lower() in text.lower())
    for hit in matches:
        print(hit)
        print("-" * 50)

def transform_prompts(prompts, transformation):
    """Apply ``transformation`` to each prompt and return the results as a new list."""
    return list(map(transformation, prompts))

def save_prompts(prompts, output_file):
    """Write the prompts to ``output_file``, each followed by a blank line, and report the path."""
    with open(output_file, mode='w', encoding='utf-8') as sink:
        sink.writelines(text + "\n\n" for text in prompts)
    print(f"Prompts saved to {output_file}")

# Define a sample transformation function
def to_upper_case(prompt):
    """Return ``prompt`` converted to upper case."""
    shouted = prompt.upper()
    return shouted

# Define the path to the folder containing prompt files
folder_path = 'pattern'  # Replace with the actual path to your 'pattern' folder

# Read the prompts from the folder
prompts = read_prompts_from_folder(folder_path)

# Interact with the prompts
display_prompts(prompts)
search_prompts(prompts, keyword='Robert')

# Transform the prompts
transformed_prompts = transform_prompts(prompts, to_upper_case)
display_prompts(transformed_prompts)

# Save the transformed prompts to a new file
save_prompts(transformed_prompts, 'transformed_prompts.md')

print(transformed_prompts)

# Print each prompt with its 1-based position. The loop variable is printed,
# not the whole `prompts` list.
for idx, prompt in enumerate(prompts):
    print(f"Prompt {idx + 1}: {prompt}")


Displaying all prompts:
Searching for 'Robert' in prompts:
Displaying all prompts:
Prompts saved to transformed_prompts.md
[]


In [46]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk import pos_tag, ne_chunk
from collections import defaultdict

# Fetch the NLTK data packages this cell relies on: tokenizer, stopword list,
# POS tagger, named-entity chunker and its word list. The log output below
# shows that already-installed packages are reported as up-to-date.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def load_transcript(file_path):
    """Read the UTF-8 text file at ``file_path`` and return everything in it."""
    with open(file_path, encoding='utf-8', mode='r') as source:
        contents = source.read()
    return contents

def preprocess_text(text):
    """Drop every character that is not an ASCII letter or whitespace, then lowercase the rest."""
    letters_only = re.sub(r'[^A-Za-z\s]', '', text)
    return letters_only.lower()

def _most_common_words(text, limit=20):
    """Return the ``limit`` most frequent alphabetic, non-stopword tokens of ``text`` with their counts."""
    stop_words = set(stopwords.words('english'))
    words = [
        word for word in word_tokenize(text)
        if word not in stop_words and word.isalpha()
    ]
    return FreqDist(words).most_common(limit)


def _named_entities_nltk(text):
    """Return the set of named-entity strings NLTK's chunker finds in ``text``."""
    named_entities = set()
    for sentence in sent_tokenize(text):
        for chunk in ne_chunk(pos_tag(word_tokenize(sentence))):
            # Only nodes that expose a label are treated as entities.
            if hasattr(chunk, 'label'):
                named_entities.add(' '.join(c[0] for c in chunk))
    return named_entities


def extract_key_phrases(transcript):
    """Extract frequent words and named entities from ``transcript``.

    Args:
        transcript: Text to analyse.

    Returns:
        tuple: ``(common_words, named_entities)``: a list of up to 20
        ``(word, count)`` pairs and a set of entity strings.
    """
    return _most_common_words(transcript), _named_entities_nltk(transcript)


def generate_prompts_from_transcript(transcript):
    """Build question prompts about Robert's Rebellion from a transcript.

    Args:
        transcript: The raw transcript text.

    Returns:
        list: Two prompts per common word, followed by two prompts per named
        entity (entities in sorted order).
    """
    prompts = []

    # Word frequencies are counted on the cleaned (lowercased, letters-only) text.
    common_words = _most_common_words(preprocess_text(transcript))

    # Entities are taken from the ORIGINAL text: preprocess_text() lowercases
    # everything and strips punctuation, which removes the capitalisation and
    # sentence boundaries that entity detection works from.
    named_entities = _named_entities_nltk(transcript)

    # Generate prompts using common words and named entities
    for word, _ in common_words:
        prompts.append(f"What is the significance of {word} in Robert's Rebellion?")
        prompts.append(f"How does {word} affect the outcome of Robert's Rebellion?")

    # Sorted so the prompt order is the same on every run (set order is not).
    for entity in sorted(named_entities):
        prompts.append(f"Who is {entity} and what role did they play in Robert's Rebellion?")
        prompts.append(f"Describe the actions of {entity} during Robert's Rebellion.")

    return prompts

# Load the transcript from a file
transcript_file_path = 'got_transcription.txt'  
transcript = load_transcript(transcript_file_path)

# Generate prompts from the transcript.
# This rebinds the notebook-level name `prompts`.
prompts = generate_prompts_from_transcript(transcript)

# Print the generated prompts, numbered from 1
for idx, prompt in enumerate(prompts):
    print(f"Prompt {idx + 1}: {prompt}")


[nltk_data] Downloading package punkt to /home/henock/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/henock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/henock/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/henock/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/henock/nltk_data...
[nltk_data]   Package words is already up-to-date!


Prompt 1: What is the significance of rhaegar in Robert's Rebellion?
Prompt 2: How does rhaegar affect the outcome of Robert's Rebellion?
Prompt 3: What is the significance of king in Robert's Rebellion?
Prompt 4: How does king affect the outcome of Robert's Rebellion?
Prompt 5: What is the significance of robert in Robert's Rebellion?
Prompt 6: How does robert affect the outcome of Robert's Rebellion?
Prompt 7: What is the significance of aries in Robert's Rebellion?
Prompt 8: How does aries affect the outcome of Robert's Rebellion?
Prompt 9: What is the significance of kings in Robert's Rebellion?
Prompt 10: How does kings affect the outcome of Robert's Rebellion?
Prompt 11: What is the significance of targaryen in Robert's Rebellion?
Prompt 12: How does targaryen affect the outcome of Robert's Rebellion?
Prompt 13: What is the significance of fire in Robert's Rebellion?
Prompt 14: How does fire affect the outcome of Robert's Rebellion?
Prompt 15: What is the significance of mad in R

In [47]:
import re
import nltk
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Download necessary NLTK data files (tokenizer and stopword list)
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy model for named entity recognition.
# NOTE(review): presumably en_core_web_sm has to be installed separately
# beforehand — confirm.
nlp = spacy.load("en_core_web_sm")

def load_transcript(file_path):
    """Open ``file_path`` as UTF-8 text and hand back its entire contents."""
    with open(file_path, encoding='utf-8', mode='r') as source:
        return source.read()

def preprocess_text(text):
    """Keep only ASCII letters and whitespace from ``text`` and return the result lowercased."""
    return re.sub(r'[^A-Za-z\s]', '', text).lower()

def _most_common_words(text, limit=20):
    """Return the ``limit`` most frequent alphabetic, non-stopword tokens of ``text`` with their counts."""
    stop_words = set(stopwords.words('english'))
    words = [
        word for word in word_tokenize(text)
        if word not in stop_words and word.isalpha()
    ]
    return FreqDist(words).most_common(limit)


def _named_entities_spacy(text):
    """Return the set of entity strings the loaded spaCy pipeline finds in ``text``."""
    return {ent.text for ent in nlp(text).ents}


def extract_key_phrases(transcript):
    """Extract frequent words and named entities from ``transcript``.

    Args:
        transcript: Text to analyse.

    Returns:
        tuple: ``(common_words, named_entities)``: a list of up to 20
        ``(word, count)`` pairs and a set of entity strings.
    """
    return _most_common_words(transcript), _named_entities_spacy(transcript)


def generate_prompts_from_transcript(transcript):
    """Build question prompts about Robert's Rebellion from a transcript.

    Args:
        transcript: The raw transcript text.

    Returns:
        list: Two prompts per common word, four per named entity (entities in
        sorted order), then four fixed general prompts.
    """
    prompts = []

    # Word frequencies are counted on the cleaned (lowercased, letters-only) text.
    common_words = _most_common_words(preprocess_text(transcript))

    # Entities are taken from the ORIGINAL text: preprocess_text() lowercases
    # everything and strips punctuation, which removes the capitalisation and
    # punctuation cues that entity recognition works from.
    named_entities = _named_entities_spacy(transcript)

    # Generate prompts using common words and named entities
    for word, _ in common_words:
        prompts.append(f"What is the significance of {word} in Robert's Rebellion?")
        prompts.append(f"How does {word} affect the outcome of Robert's Rebellion?")

    # Sorted so the prompt order is the same on every run (set order is not).
    for entity in sorted(named_entities):
        prompts.append(f"Who is {entity} and what role did they play in Robert's Rebellion?")
        prompts.append(f"Describe the actions of {entity} during Robert's Rebellion.")
        prompts.append(f"What were the motivations of {entity} in the context of Robert's Rebellion?")
        prompts.append(f"How did {entity}'s actions influence the events of Robert's Rebellion?")

    # Additional context-aware prompts (plain strings: nothing to interpolate)
    prompts.append("How did the key events unfold during Robert's Rebellion?")
    prompts.append("What were the major turning points in Robert's Rebellion?")
    prompts.append("How did the outcome of Robert's Rebellion impact the realm?")
    prompts.append("What were the consequences of Robert's Rebellion for the major houses involved?")

    return prompts

# Load the transcript from a file
transcript_file_path = 'got_transcription.txt'    
transcript = load_transcript(transcript_file_path)

# Generate prompts from the transcript.
# This rebinds the notebook-level name `prompts`.
prompts = generate_prompts_from_transcript(transcript)

# Print the generated prompts, numbered from 1
for idx, prompt in enumerate(prompts):
    print(f"Prompt {idx + 1}: {prompt}")


[nltk_data] Downloading package punkt to /home/henock/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/henock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Prompt 1: What is the significance of rhaegar in Robert's Rebellion?
Prompt 2: How does rhaegar affect the outcome of Robert's Rebellion?
Prompt 3: What is the significance of king in Robert's Rebellion?
Prompt 4: How does king affect the outcome of Robert's Rebellion?
Prompt 5: What is the significance of robert in Robert's Rebellion?
Prompt 6: How does robert affect the outcome of Robert's Rebellion?
Prompt 7: What is the significance of aries in Robert's Rebellion?
Prompt 8: How does aries affect the outcome of Robert's Rebellion?
Prompt 9: What is the significance of kings in Robert's Rebellion?
Prompt 10: How does kings affect the outcome of Robert's Rebellion?
Prompt 11: What is the significance of targaryen in Robert's Rebellion?
Prompt 12: How does targaryen affect the outcome of Robert's Rebellion?
Prompt 13: What is the significance of fire in Robert's Rebellion?
Prompt 14: How does fire affect the outcome of Robert's Rebellion?
Prompt 15: What is the significance of mad in R