# Downloading and Importing relevant libraries

In [1]:
# @title
!pip install semantic-split
!pip install nltk beautifulsoup4
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-dbyvtv30/unsloth_f6446ef364a44b8f8ca0556782791b65
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-dbyvtv30/unsloth_f6446ef364a44b8f8ca0556782791b65
  Resolved https://github.com/unslothai/unsloth.git to commit 12b437e12204532f82542c12ac1ab00d19e3ebbf
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
# @title
import requests
import json
from semantic_split import SimilarSentenceSplitter, SentenceTransformersSimilarity, SpacySentenceSplitter
import shutil, os, subprocess
from unsloth import FastLanguageModel
import torch
import re
import nltk
from bs4 import BeautifulSoup
import gc
import torch
import transformers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt')

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# LOADING THE DATA

In [3]:
# URL of the raw JSON file
url = "https://raw.githubusercontent.com/this-is-mjk/OOSC/main/dataSet/data.json"

# Website url to scape from
scrapped_url = "https://medium.com/@bijit211987/chunking-strategies-for-fine-tuning-llms-30d2988c3b7a"

response = requests.get(url)
data = response.json()
text = data['data']

# Semantic Chunking to split the text data into multiple parts

In [4]:
model = SentenceTransformersSimilarity()
sentence_splitter = SpacySentenceSplitter()
splitter = SimilarSentenceSplitter(model, sentence_splitter)
res = splitter.split(text)

filtered_res=[]
filtered_res = [" ".join(chunk) for chunk in res if sum(len(sentence) for sentence in chunk) >= 30]

# Splitting the text into roughly 50 semantically similar chunks

parts= int((len(filtered_res))/50)+1

double_filtered = []
curr = ""

for i, chunk in enumerate(filtered_res):
    curr += chunk + " "
    if (i + 1) % parts == 0:
        double_filtered.append(curr.strip())
        curr = ""

if curr:
    double_filtered.append(curr.strip())

# Splitting chunks with over 15k characters into multiple chunks

final_filtered = []
for chunk in double_filtered:
    if len(chunk) > 15000:
        sub_chunks = splitter.split(chunk)
        temp_curr = ""
        for sub_chunk in sub_chunks:
            temp_curr += " ".join(sub_chunk) + " "
            if len(temp_curr) >= 15000:
                final_filtered.append(temp_curr.strip())
                temp_curr = ""
        if temp_curr:
            final_filtered.append(temp_curr.strip())
    else:
        final_filtered.append(chunk)

print(f"Total number of chunks: {len(final_filtered)}")



Total number of chunks: 49


# Model definition

## Using Llama 3.1 8B Instruct

In [5]:
max_seq_length = 2048
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,),

# Prompt for question generation

In [6]:
prompt = """You are a chatbot trained on contents of a website.
    Based on the provided document, give me a question that focuses on most important part of the code i provide you. The document includes various types of information such as text, data, or multimedia descriptions. Focus on generating question that address the key themes, details, and implications of the content. Ensure that the questions cover different aspects, including:
    General Understanding: Questions that test the basic understanding of the content.
    Specific Details: Questions that probe specific facts or figures mentioned in the content.
    Implications: Questions that explore the potential implications or consequences of the information presented.
    Comparison: Questions that compare and contrast different pieces of information or viewpoints within the content.
    Critical Thinking: Questions that encourage deeper analysis or critique of the information.
    Make sure the questions are clear, relevant, and suitable for the intended audience.

    DO NOT USE ANY PRE-EXISTING KNOWLEDGE!!! Only use the knowledge I have provided you with.

    Provide small questions only.


    DO NOT HALLUCINATE !!!


Paragraph:{}
Question:{}"""

EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

# Pre-processing of input text data

In [7]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = ' '.join(text.split())
    tokens = nltk.word_tokenize(text)
    text = BeautifulSoup(' '.join(tokens), "html.parser").text
    return text

# Question generation

In [8]:
gc.collect()
torch.cuda.empty_cache()

questions_list = []
i = 0

for chunk in final_filtered:
    i += 1

    # Skipping the first and last two chunks are they contain header and footer details
    if i <= 2:
        continue
    if i >= (len(final_filtered) - 2):
        continue
    chunk_new = preprocess_text(chunk)
    inputs = tokenizer([prompt.format(chunk_new,"",)], return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=15,
        use_cache=True,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    question_prefix = "Question:"
    question_start = outputs[0].find(question_prefix) + len(question_prefix)
    question = outputs[0][question_start:].strip()
    question_mark_index = question.find('?')
    if question_mark_index != -1:
        question = question[:question_mark_index + 1]

    print(f"({i}) {question}")
    questions_list.append(question)

(3) What is one of main reasons why chunk strategies are particularly beneficial in handling very
(4) What is input chucking in llmlong short term memory lstm gated rec
(5) What is a key benefit of using chunk-based processing in language modeling?
(6) What is one of key challenges in working with large language Models?
(7) What is output chucking, according to the text?
(8) What is a key benefit of using chunk-based processing in large transformer models?
(9) What is one of main applications of input and output chunkin in llMs
(10) What is output chucking in llmam and how does it help with memory
(11) What is a key strategy employed by retrieval augmented generation to optimize the retrieval and
(12) What is input chucking in chunked strategies, as per the given text
(13) What is output chucking in this context and what is its main benefit?
(14) What is a key benefit of using chunk-based approaches in memory-intensive transformer models
(15) What is the main challenge that chunked str

# Grouping similar questions using K-means clustering

In [9]:
statements = list(questions_list)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(statements)
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_
unique, counts = np.unique(labels, return_counts=True)
frequency = dict(zip(unique, counts))
top_clusters = sorted(frequency.items(), key=lambda x: x[1], reverse=True)[:10]
cluster_statements = {i: [] for i in range(num_clusters)}
for i, label in enumerate(labels):
    cluster_statements[label].append(statements[i])
final_questions = []
for cluster_id, freq in top_clusters:
    if cluster_statements[cluster_id]:
        print(f" - {cluster_statements[cluster_id][0]}")
        final_questions.append(cluster_statements[cluster_id][0])

  super()._check_params_vs_input(X, default_n_init=10)


 - What is a key benefit of using chunk-based processing in language modeling?
 - What is one of key challenges in working with large language Models?
 - What is input chucking in llmlong short term memory lstm gated rec
 - What is a common technique used in attention-based language modeling to optimize attention computations
 - What is output chucking, according to the text?
 - What is a key strategy employed by retrieval augmented generation to optimize the retrieval and
 - What is one of main applications of input and output chunkin in llMs
 - What is a key benefit of using recursive Transformer architectures in tasks like document translation
 - What is one of main reasons why chunk strategies are particularly beneficial in handling very
 - What is one way chunked input can improve memory performance during LLm processing


# Using sentence transformer model to identity relevant links from the generated questions

In [10]:
text = data['relevant_link']
link_titles = []
for i in text:
    link_titles.append(i['title'])

link_titles = []
link_urls = []

for item in text:
    link_titles.append(item['title'])
    link_urls.append(item['url'])

statement10 = final_questions
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings10 = model.encode(statement10, convert_to_tensor=True)
embeddings8 = model.encode(link_titles, convert_to_tensor=True)
similarity_scores = util.pytorch_cos_sim(embeddings8, embeddings10)
ss_numpy = similarity_scores.cpu().numpy()
average_scores = np.mean(ss_numpy, axis=1)
top_5_indices = np.argsort(average_scores)[-5:]

print("Top 5 matching links:")

titles5 = []
urls5 = []

for index in top_5_indices:
    titles5.append(link_titles[index])
    urls5.append(link_urls[index])
    print(f"Title: {link_titles[index]}")
    print(f"URL: {link_urls[index]}")




Top 5 matching links:
Title: Medium Status
URL: https://medium.statuspage.io/?source=post_page-----30d2988c3b7a--------------------------------
Title: The Medium Blog
URL: https://blog.medium.com/?source=post_page-----30d2988c3b7a--------------------------------
Title: Medium Terms of Service | by Medium | Medium Policy
URL: https://policy.medium.com/medium-terms-of-service-9db0094a1e0f?source=post_page-----30d2988c3b7a--------------------------------
Title: Medium Members Can Listen To Any Medium Story With The Speechify Play Button. | Speechify
URL: https://speechify.com/medium?source=post_page-----30d2988c3b7a--------------------------------
Title: Medium Help Center
URL: https://help.medium.com/hc/en-us?source=post_page-----30d2988c3b7a--------------------------------


In [11]:
# Saving the output JSON file

url = scrapped_url
links1 = [{"url": url, "title": title} for url, title in zip(urls5, titles5)]

data_dict = [{
    "url": url,
    "questions" : final_questions,
    "relevant_links": links1
}]
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(data_dict, f, ensure_ascii=False, indent=4)

# Validation using cosine similarity

## Using the same Llama 3.1 8B for validation

In [12]:
max_seq_length = 2048
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,),

In [13]:
question_prompt = """You are an expert quiz master who can answer any question briefly in about. Make sure the answers are clear, relevant, and suitable for the intended audience.
Answer the below given question in 500 words.

Question: {}
Output:{}"""

In [14]:
val_output = []
i = 0
for question in final_questions:
    i += 1
    chunk_new = preprocess_text(chunk)
    inputs = tokenizer([question_prompt.format(question,"",)], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs,max_new_tokens = 64, use_cache = True)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    val_prefix = "Output:"
    val_start = outputs[0].find(val_prefix) + len(val_prefix)
    val = outputs[0][val_start:].strip()
    val_output.append(val)
    print(f"Completed processing {i}th question")

val_data = (" ").join(val_output)

Completed processing 1th question
Completed processing 2th question
Completed processing 3th question
Completed processing 4th question
Completed processing 5th question
Completed processing 6th question
Completed processing 7th question
Completed processing 8th question
Completed processing 9th question
Completed processing 10th question


In [15]:
text = data['data']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([val_data, text])
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(f"Cosine Similarity Score: {cosine_sim[0][0]*100}")

Cosine Similarity Score: 75.57587770083603
