In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.chat_models import ChatOllama
from langchain import PromptTemplate
from langchain_community.vectorstores import FAISS
import json
import os



In [2]:
# Embedding model used further down to vectorise the transcript chunks; loaded on the CPU.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs={"device": "cpu"},
    model_name="hkunlp/instructor-large",
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [3]:
# Read the transcript segments from disk; the trailing expression displays how many were loaded.
with open('segments_updated.json', 'r') as file:
    filtered_segments = json.loads(file.read())
len(filtered_segments)

6020

In [4]:
# Concatenate the text of every segment into one transcript string.
# str.join builds the result in a single pass instead of re-allocating the
# growing string on every `+=`.
result = "".join(item["text"] for item in filtered_segments)

# Show only the beginning of the transcript: printing all of it floods the
# cell output and bloats the saved notebook.
print(result[:1000])

 OK, good morning, everybody. So last week, we stopped at the question, how do we calculate the gradients for more complex loss functions and classifiers? And we saw that the thing that we need is the backpropagation algorithm. So we want to make use of the chain rule of calculus to determine the gradient of some complex compute graph. So we take one compute graph where we know the gradient for each of the nodes, and we use the chain rule to multiply those individual gradients, and thereby getting the gradient for the more complex function. And if, for example, we use a the compute graph for logistic regression, where we say, OK, we have two inputs, so two parameters for each of the input dimensions, and one bias term, our compute graph is this dot product over here, adding the bias term, getting this linear unit over here, taking the sigma. OK. And then we can use the sigma function of all of this and doing, performing, calculating our loss function. And for each of those steps, we kn

In [5]:
# Splitter configuration: chunk_size=1024 with chunk_overlap=64 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=64, chunk_size=1024)

# Cut the full transcript into chunks; the trailing expression displays the chunk count.
chunks = text_splitter.split_text(result)
len(chunks)

200

In [6]:
def save_json_to_file(data, filename):
    """Append ``data`` to the JSON list stored in ``filename``.

    If ``filename`` already exists, the list it contains is loaded and
    ``data`` is appended to it; otherwise a new list is started. The combined
    list is written back with 4-space indentation.

    Note: calling this twice with the same ``data`` stores the items twice.

    Args:
        data: Iterable of JSON-serialisable items to append.
        filename: Path of the JSON file to create or update.
    """
    # Check if the file already exists
    if os.path.exists(filename):
        print("file already exists")
        # Load existing JSON data from the file
        with open(filename, "r") as json_file:
            existing_data = json.load(json_file)
    else:
        # If the file doesn't exist, start from an empty list
        existing_data = []

    # Append the new items to the existing data
    existing_data.extend(data)

    # Write the combined data back to the file
    with open(filename, "w") as json_file:
        json.dump(existing_data, json_file, indent=4)

    print(f"JSON file has been updated at {filename}")

# Persist the chunks produced by the splitter to chunks_updated.json.
filename = "chunks_updated.json"
data = chunks
save_json_to_file(data, filename)

JSON file has been updated at chunks_updated.json


In [8]:
# Reload the stored chunks from disk (this rebinds `chunks`); the trailing expression displays the count.
with open('chunks_updated.json', 'r') as file:
    chunks = json.loads(file.read())
len(chunks)

200

In [9]:
# Build the FAISS vector store from the chunk texts using the embedding model defined above.
db2 = FAISS.from_texts(texts=chunks, embedding=embeddings)

In [10]:
# Persona and grounding instructions used as the system section of the prompt
# (surrounding whitespace is stripped).
DEFAULT_SYSTEM_PROMPT = """
You are Karan, a student at TH Bingen University. Act as Karan and reply all questions on his behalf. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
""".strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Wrap a system prompt and a user prompt in chat-role markers.

    Args:
        prompt: Text placed in the ``<|user|>`` section.
        system_prompt: Text placed in the ``<|system|>`` section; defaults to
            ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        ``<|system|>``, the system prompt, ``<|user|>``, the prompt and
        ``<|assistant|>``, each separated by a blank line, with every part
        after the first indented by six spaces.
    """
    # A blank line followed by the six-space indent sits between consecutive parts.
    gap = "\n\n" + " " * 6
    parts = (
        "<|system|>",
        f"{system_prompt}",
        "<|user|>",
        f"{prompt}",
        "<|assistant|>",
    )
    return gap.join(parts)

In [11]:
# DEFAULT_SYSTEM_PROMPT = """
# You are Karan, a student at TH Bingen University. Act as Karan and reply all questions on his behalf. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 
# """.strip()


# def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
#     return f"""
# [INST] <<SYS>>
# {system_prompt}
# <</SYS>>

# {prompt} [/INST]
# """.strip()

In [12]:
# Prompt skeleton: the {context} placeholder comes first, then the user's {question}.
template = generate_prompt("\n{context}\n\nQuestion: {question}\n")

In [13]:
# Prompt object with the two placeholders declared in `template`.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

In [14]:
# Question-answering chain: the two best-matching chunks from the FAISS store
# are put into the custom prompt ("stuff" chain type) and sent to the local
# llama2 model; the retrieved documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    llm=ChatOllama(model="llama2:latest"),
    retriever=db2.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
)

In [15]:
def getStartTimeFromSegments(sourceString, segments=None):
    """Find the transcript segment in which ``sourceString`` begins.

    ``sourceString`` is split into sentences on ``"."``. For each sentence, in
    order, a search phrase is grown one word at a time (starting with the
    sentence's first word) until exactly one segment's ``'text'`` contains the
    phrase; that segment is returned. Matching is a case-sensitive substring
    test. If a sentence never narrows down to a single segment, the next
    sentence is tried.

    Args:
        sourceString: Text to locate in the segments.
        segments: Optional list of dicts that each have a ``'text'`` key.
            Defaults to the module-level ``filtered_segments``.

    Returns:
        The uniquely matching segment dict. If no sentence yields a unique
        match, the first segment is returned as a fallback; ``None`` is
        returned when there are no segments at all.
    """
    if segments is None:
        segments = filtered_segments
    if not segments:
        # Nothing to search and nothing to fall back to.
        return None

    for sentence in sourceString.split("."):
        words = sentence.split()

        # Start with 1 word and add one more each time the match is not unique.
        candidates = segments
        for num_words in range(1, len(words) + 1):
            search_phrase = ' '.join(words[:num_words])

            # A longer phrase starts with the shorter one, so it can only match
            # segments that already matched; keep narrowing the same list
            # instead of rescanning every segment.
            candidates = [obj for obj in candidates if search_phrase in obj['text']]

            if len(candidates) == 1:
                return candidates[0]
            if not candidates:
                # Adding more words can never bring a match back.
                break

    # No sentence identified a single segment: fall back to the first one.
    return segments[0]


In [20]:
# WITHOUT BUFFER MEMORY
from flask import (
    Flask,
    request,
    render_template_string,
    send_file,
)

# Flask application that the routes below are registered on.
app = Flask(__name__)

# Jinja2 template for the single chat page.
# Expects `chat_history`: a list of dicts with 'sender' ('user' or 'bot') and
# 'content'; bot entries additionally carry 'source' and 'seekTime'.
# Each bot answer gets its own <video> element whose `data-start` attribute
# holds that answer's start time, so every video seeks to its own position.
# The video URL is relative so the page works on whatever host/port serves it.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Chatbot</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            background-color: #f0f0f0;
        }
        .chat-container {
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #ffffff;
            border-radius: 10px 10px 0px 0px;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
        }
        .chat-form{
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #ffffff;
            border-radius: 0px 0px 10px 10px;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
        }
        .chat-message {
            background-color: #f9f9f9;
            border-radius: 10px;
            padding: 10px;
            margin: 10px 0;
        }
        .user-message {
            text-align: right;
        }
        input[type="text"] {
            width: calc(100% - 100px); /* Adjust width of input */
            padding: 8px;
            border-radius: 5px;
            border: 1px solid #ccc;
            margin-right: 10px;
        }
        .additional-content {
            display: none;
        }
        .show-button{
            background-color: #555555;
            border: none;
            color: white;
            padding: 8px 10px;
            text-align: center;
            text-decoration: none;
            display: inline-block;
            font-size: 16px;
            margin: 4px 2px;
            cursor: pointer;
            border-radius: 5px;
        }
        #input_submit{
            background-color: #555555;
            border: none;
            color: white;
            padding: 8px 10px;
            text-align: center;
            text-decoration: none;
            display: inline-block;
            font-size: 16px;
            margin: 4px 2px;
            cursor: pointer;
            border-radius: 5px;
        }
    </style>
</head>
<body>
    <div class="chat-container" id="chat-container">
    <h3>Welcome to the Artificial Intelligence QnA service.  You may ask me anything 🙃</h3>
        {% for message in chat_history %}
            <div class="chat-message {% if message['sender'] == 'user' %}user-message{% endif %}">
                {{ message['content'] }}
                {% if message['sender'] == 'bot' %}
                    <br><br>
                    <button class="show-button">Show source</button>
                    <div class="additional-content">
                        {{ message['source'] }}
                        <br><br>
                        {{ message['seekTime'] }}
                        <video class="source-video" data-start="{{ message['seekTime']['start'] }}" width="780" height="640" controls>
                            <source src="/Kint2" type="video/mp4">
                            Your browser does not support the video tag.
                        </video>
                    </div>
                    <br><br>
                {% endif %}
            </div>
        {% endfor %}
    </div>
    <div class="chat-form">
    <form action="/" method="POST" id="chat-form">
        <input type="text" name="input_text" id="input_text" placeholder="Message Chatbot...">
        <input type="submit" id="input_submit" value="Send">
    </form>
    </div>
    <script>
        document.getElementById('chat-form').addEventListener('submit', function(event) {
            event.preventDefault(); // Prevent default form submission
            var inputText = document.getElementById('input_text').value;
            if (inputText.trim() !== '') {
                // Show the user's message immediately, then submit the form
                var chatContainer = document.getElementById('chat-container');
                var userMessage = document.createElement('div');
                userMessage.className = 'chat-message user-message';
                userMessage.textContent = inputText;
                chatContainer.appendChild(userMessage);
                document.getElementById('chat-form').submit(); // Submit form
            }
        });

        //show/hide the source button
        document.querySelectorAll('.show-button').forEach(button => {
            button.addEventListener('click', function() {
                const additionalContent = this.nextElementSibling;
                additionalContent.style.display = additionalContent.style.display === 'block' ? 'none' : 'block';
                this.textContent = additionalContent.style.display === 'block' ? 'Hide Source' : 'Show source';
            });
        });

        //to seek every source video to the start time of its own answer
        document.addEventListener('DOMContentLoaded', function () {
            document.querySelectorAll('video.source-video').forEach(function (video) {
                var startTime = parseFloat(video.dataset.start);
                if (isNaN(startTime)) {
                    return; // no start time known for this answer
                }
                if (video.readyState >= 1) {
                    // Metadata is already available, so the seek can happen right away
                    video.currentTime = startTime;
                } else {
                    // When the video metadata has loaded, set the start time
                    video.addEventListener('loadedmetadata', function () {
                        video.currentTime = startTime;
                    }, { once: true });
                }
            });
        });
    </script>
</body>
</html>
"""

# Conversation log rendered by the page template; the index view appends one
# dict per user message and one per bot answer.
chat_history = list()

@app.route('/Kint2')
def video():
    """Serve the file ``Kint2.mp4`` with the ``video/mp4`` MIME type."""
    return send_file('Kint2.mp4', mimetype='video/mp4')

@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the chat page; on POST, answer the submitted question first.

    On POST the ``input_text`` form field is sent to ``qa_chain``. The user's
    message and the bot's answer are appended to ``chat_history``; the bot
    entry also stores the text of the top retrieved document as ``'source'``
    and the transcript segment located for it as ``'seekTime'``. A missing or
    blank ``input_text`` is ignored.

    Returns:
        The rendered HTML page showing the whole ``chat_history``.
    """
    # `chat_history` is only mutated (append), never rebound, so no `global` is needed.
    if request.method == 'POST':
        # .get() + strip(): a missing or blank field is skipped instead of
        # producing a 400 error or sending an empty query to the model.
        input_text = request.form.get('input_text', '').strip()

        if input_text:
            # The chain's only input is "query"; it performs the retrieval
            # itself and (return_source_documents=True) hands the documents back.
            output_text = qa_chain({"query": input_text})

            # Reuse the documents the chain actually fed to the model rather
            # than running a second, separate similarity search.
            source_documents = output_text.get('source_documents') or []
            if source_documents:
                source_text = source_documents[0].page_content
                seekTime = getStartTimeFromSegments(source_text)
            else:
                # Nothing was retrieved: no source text and no position to seek to.
                source_text = ''
                seekTime = None

            chat_history.append({'sender': 'user', 'content': input_text})
            chat_history.append({
                'sender': 'bot',
                'content': output_text['result'],
                'source': source_text,
                'seekTime': seekTime,
            })

    return render_template_string(html_template, chat_history=chat_history)

# Start Flask's built-in server on every network interface, port 81, when this
# code runs as the main module.
if __name__ == '__main__':
    app.run(port=81, host="0.0.0.0")

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:91
 * Running on http://172.20.10.4:91
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [27/Mar/2024 09:03:52] "GET / HTTP/1.1" 200 -
[2024-03-27 09:03:58,317] ERROR in app: Exception on / [POST]
Traceback (most recent call last):
  File "/Users/karanghai/Library/Python/3.9/lib/python/site-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
  File "/Users/karanghai/Library/Python/3.9/lib/python/site-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/Users/karanghai/Library/Python/3.9/lib/python/site-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 61] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/karanghai/Library/Python/3.9/lib/python/site-packages/urllib3/connectionpool.py", 

CHECK SPLITTED words: ['neural', 'networks']
search_phrase: neural
found_objects:  [{'id': 75, 'seek': 26640, 'start': 271.17999999999995, 'end': 279.4, 'text': ' So, and nothing changes if we make our neural network bigger.', 'video_name': 'Kint3'}, {'id': 77, 'seek': 26640, 'start': 281.26, 'end': 285.64, 'text': ' is a one-layer neural network, works like this.', 'video_name': 'Kint3'}, {'id': 157, 'seek': 55946, 'start': 576.7800000000001, 'end': 580.9000000000001, 'text': ' when we get to like doing proper neural networks.', 'video_name': 'Kint3'}, {'id': 185, 'seek': 64810, 'start': 661.62, 'end': 667.16, 'text': ' So if you think of a larger neural network', 'video_name': 'Kint3'}, {'id': 250, 'seek': 79786, 'start': 823.4, 'end': 825.7, 'text': ' Logistic regression is a one-layer neural network.', 'video_name': 'Kint3'}, {'id': 281, 'seek': 88728, 'start': 907.24, 'end': 909.9399999999999, 'text': ' of another neural network layer,', 'video_name': 'Kint3'}, {'id': 282, 'seek':

127.0.0.1 - - [27/Mar/2024 09:05:55] "POST / HTTP/1.1" 200 -


CHECK SPLITTED words: ['one', 'of', 'the', 'color', 'channels']
search_phrase: one
found_objects:  [{'id': 8, 'seek': 2740, 'start': 31.7, 'end': 34.28, 'text': ' So we take one compute graph where', 'video_name': 'Kint3'}, {'id': 16, 'seek': 5738, 'start': 67.94, 'end': 74.18, 'text': ' of the input dimensions, and one bias term,', 'video_name': 'Kint3'}, {'id': 77, 'seek': 26640, 'start': 281.26, 'end': 285.64, 'text': ' is a one-layer neural network, works like this.', 'video_name': 'Kint3'}, {'id': 102, 'seek': 35636, 'start': 366.18, 'end': 376.58000000000004, 'text': ' So like this times this one over here will in the end just be y hat minus y.', 'video_name': 'Kint3'}, {'id': 110, 'seek': 41004, 'start': 420.68, 'end': 428.20000000000005, 'text': " So the gradient of going towards... of addition, multiplication, it's always just taking one of the numbers.", 'video_name': 'Kint3'}, {'id': 128, 'seek': 46966, 'start': 477.94, 'end': 481.16, 'text': ' everything here drops and this

In [None]:
# from flask import Flask, request, jsonify

# app = Flask(__name__)

# response_text = ''

# @app.route('/', methods=['GET', 'POST'])
# def index():
#     global response_text

#     if request.method == 'POST':
#         input_text = request.form['input_text']
#         inputAfterSimilaritySearch = db2.similarity_search(query=input_text, k=2)
#         output_text = qa_chain({"input_documents": inputAfterSimilaritySearch[0].page_content, "query": input_text})
#         seekTime = getStartTimeFromSegments(inputAfterSimilaritySearch[0].page_content)
#         response_data = {
#             'result': output_text['result'],
#             'source': inputAfterSimilaritySearch[0].page_content,
#             'seekTime': seekTime
#         }
#         return jsonify(response_data)

# if __name__ == '__main__':
#     app.run(host="0.0.0.0", port=81)