<a href="https://colab.research.google.com/github/imranajec/Synthetic-Data-Generation/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.41.3 trl==0.4.7
!pip install transformers
!pip install --upgrade --quiet langchain langchain-community langchain-openai wikipedia tiktoken
!pip install --upgrade --quiet faiss-cpu
!pip install sentence-transformers
!pip install prettytable




In [None]:
import os
import torch
import random
import pandas as pd
import numpy as np
import faiss
import pickle
import json
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from sentence_transformers import SentenceTransformer, util
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from prettytable import PrettyTable
from pydantic import BaseModel
from typing import Optional, List


In [None]:
# Set Hugging Face API token
# SECURITY: never commit a real token to a notebook. The token is taken from the
# environment when already set; otherwise it is requested interactively so that it
# never ends up in the saved file. Any token that was previously embedded here
# should be treated as leaked and revoked.
from getpass import getpass

if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass('Hugging Face API token: ')


In [None]:
# Checkpoint to load and the knobs that control its quantization
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
use_4bit = True
bnb_4bit_quant_type = "nf4"
compute_dtype = "float16"
use_nested_quant = True

# Collect the bitsandbytes options in one mapping and build the config from it
_bnb_options = {
    "load_in_4bit": use_4bit,
    "bnb_4bit_quant_type": bnb_4bit_quant_type,
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_use_double_quant": use_nested_quant,
}
bnb_config = BitsAndBytesConfig(**_bnb_options)

# Handed to from_pretrained as `device_map`
# NOTE(review): {"": 0} presumably places the whole model on device index 0 - confirm
device_map = {"": 0}

In [None]:
# Load the quantized base model and its tokenizer.
# The access token is read from the environment rather than being written into
# the notebook; `token=` replaces the deprecated `use_auth_token=` argument.
hf_token = os.environ['HUGGINGFACEHUB_API_TOKEN']

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    token=hf_token,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Pull the Wikipedia results for the query, then cut them into overlapping
# token-based chunks that serve as retrieval context
wikipedia_loader = WikipediaLoader(query="India_national_cricket_team")
raw_documents = wikipedia_loader.load()

text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=24,
)
documents = text_splitter.split_documents(raw_documents)

In [None]:
# Sentence-Transformers checkpoint used to embed the document chunks
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model_sentence = SentenceTransformer(EMBEDDING_MODEL_NAME)




In [None]:
# Generate embeddings for the split documents
content = [doc.page_content for doc in documents]
if not content:
    # Fail here with a clear message instead of an IndexError on `.shape[1]` later
    raise ValueError("No document chunks to embed; check the Wikipedia query and splitter settings.")

# encode() accepts the whole list and batches it internally, which avoids one
# model call per chunk
embeddings = model_sentence.encode(content, convert_to_numpy=True)

# One row per chunk, stored as a 2-D float32 array (the dtype Faiss works with)
embeddings_np = np.asarray(embeddings, dtype=np.float32)


In [None]:
# Create a flat L2 Faiss index sized to the embedding width, fill it with the
# chunk embeddings, and persist both the index and the chunk objects to disk
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)
faiss.write_index(index, "document_vectors.index")

with open("documents.pkl", "wb") as documents_file:
    documents_file.write(pickle.dumps(documents))

In [None]:
# Restore the persisted document chunks and Faiss index from disk
# NOTE(review): unpickling can execute arbitrary code - only read files that
# this notebook wrote itself
with open("documents.pkl", "rb") as documents_file:
    documents = pickle.loads(documents_file.read())

index = faiss.read_index("document_vectors.index")

In [None]:

def _extract_input_field(generated_text):
    """Return the `input` value of the first JSON object found in `generated_text`.

    Parsing starts at each '{' in turn and lets the JSON decoder locate the end
    of the object, so trailing chatter (or a second object) after the first one
    does not break decoding. Returns "" when no JSON object carrying a non-null
    `input` key can be decoded. The value is always returned as a string.
    """
    decoder = json.JSONDecoder()
    brace_pos = generated_text.find('{')
    while brace_pos != -1:
        try:
            candidate, _ = decoder.raw_decode(generated_text, brace_pos)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and candidate.get("input") is not None:
            return str(candidate["input"])
        brace_pos = generated_text.find('{', brace_pos + 1)
    return ""


def generate_query(contexts):
    """Ask the language model for a short question/statement answerable from `contexts`.

    Args:
        contexts: list of context strings; it is interpolated into the prompt
            using its Python list representation.

    Returns:
        The `input` string from the JSON object produced by the model, or ""
        when the completion contains no parsable JSON object with that key.
    """
    prompt_content = f"""I want you to act as a copywriter. Based on the given context,
    which is a list of strings, please generate a JSON object
    with an `input` key. The `input` should be a single question or
    statement, no longer than 20 words, that can be addressed by the given context.

    contexts:
    {contexts}

    Please provide the JSON object below:"""
    inputs = tokenizer(prompt_content, return_tensors='pt')
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(
        input_ids=input_ids,
        # Passing the mask and pad id explicitly avoids the "attention mask and
        # the pad token id were not set" warning during generation
        attention_mask=attention_mask,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    # The returned sequence starts with the prompt tokens; decode only the
    # completion so braces inside the contexts cannot be mistaken for the JSON answer
    completion_tokens = outputs[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(completion_tokens, skip_special_tokens=True)
    return _extract_input_field(generated_text)

In [None]:
# Prompt that asks the model to rewrite an input so that it draws on every
# element of the context. The `{context}` and `{original_input}` placeholders
# are filled with str.replace (not str.format) in evolve_query.
multi_context_template = """
I want you to rewrite the given `input` so that it requires readers to use information from all elements in `Context`.
1. `Input` should require information from all `Context` elements.
2. `Rewritten Input` must be concise and fully answerable from `Context`.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""

# Prompt that asks the model to rewrite an input so that it explicitly calls
# for multi-step reasoning. Uses the `{context}` and `{original_input}`
# placeholders, which are substituted with str.replace in evolve_query.
reasoning_template = """
I want you to rewrite the given `input` so that it explicitly requests multi-step reasoning.
1. `Rewritten Input` should require multiple logical connections or inferences.
2. `Rewritten Input` should be concise and understandable.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` must be fully answerable from `Context`.
5. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""

# Prompt that asks the model to rewrite an input as a hypothetical or
# speculative scenario. Uses the `{context}` and `{original_input}`
# placeholders, which are substituted with str.replace in evolve_query.
hypothetical_scenario_template = """
I want you to rewrite the given `input` to incorporate a hypothetical or speculative scenario.
1. `Rewritten Input` should encourage applying knowledge from `Context` to deduce outcomes.
2. `Rewritten Input` should be concise and understandable.
3. Do not use phrases like 'based on the provided context.'
4. `Rewritten Input` must be fully answerable from `Context`.
5. `Rewritten Input` should not exceed 15 words.
Context: {context}
Input: {original_input}
Rewritten Input:
"""

In [None]:
# Pool of rewrite prompts; evolve_query draws one at random per evolution step
evolution_templates = [
    multi_context_template,
    reasoning_template,
    hypothetical_scenario_template,
]

# Number of successive rewrites applied to each generated query
num_evolution_steps = 3

In [None]:
def _parse_rewritten_input(generated_text):
    """Return the first non-empty line of a completion, minus an echoed label.

    If the model repeats the 'Rewritten Input:' label, only the text after it
    is considered. Returns "" when the completion holds no usable line.
    """
    label = 'Rewritten Input:'
    label_pos = generated_text.find(label)
    if label_pos != -1:
        generated_text = generated_text[label_pos + len(label):]
    for line in generated_text.splitlines():
        line = line.strip()
        if line:
            return line
    return ""


def evolve_query(original_input, context, steps, max_context_tokens=2048):
    """Rewrite `original_input` `steps` times using randomly chosen evolution templates.

    Args:
        original_input: the query to start from.
        context: context object (typically a list of strings); its `str()`
            form is substituted into the template.
        steps: number of rewrite rounds; 0 returns `original_input` unchanged.
        max_context_tokens: upper bound on the number of context tokens kept
            in the prompt.

    Returns:
        The query after the last successful rewrite. A round whose completion
        contains no usable text leaves the current query untouched.
    """
    # Trim the context itself rather than the whole prompt: the template puts
    # the context first, so truncating the full prompt would cut off the
    # trailing `Input:` / `Rewritten Input:` cue the model needs.
    context_text = str(context)
    context_ids = tokenizer(context_text, add_special_tokens=False).input_ids
    if len(context_ids) > max_context_tokens:
        context_text = tokenizer.decode(context_ids[:max_context_tokens], skip_special_tokens=True)

    current_input = original_input
    for _ in range(steps):
        chosen_template = random.choice(evolution_templates)
        evolved_prompt = chosen_template.replace("{context}", context_text).replace("{original_input}", current_input)
        inputs = tokenizer(evolved_prompt, return_tensors='pt')
        input_ids = inputs.input_ids.to(model.device)
        attention_mask = inputs.attention_mask.to(model.device)
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        # The output sequence begins with the prompt; decode only the new tokens,
        # otherwise the search for 'Rewritten Input:' matches the prompt's own cue
        # line and yields an empty rewrite
        completion_tokens = outputs[0][input_ids.shape[-1]:]
        generated_text = tokenizer.decode(completion_tokens, skip_special_tokens=True)
        rewritten = _parse_rewritten_input(generated_text)
        if rewritten:
            current_input = rewritten
    return current_input


In [None]:
# Prompt used to produce the reference answer for an evolved query. The
# `{context}` and `{evolved_query}` placeholders are filled with str.replace
# in generate_answer.
expected_output_template = """
I want you to generate an answer for the given `input`. This answer has to be factually aligned to the provided context.
Context: {context}
Input: {evolved_query}
Answer:
"""

In [None]:
def _parse_answer(generated_text):
    """Return the first non-empty line of a completion, minus an echoed label.

    If the model repeats the 'Answer:' label, only the text after it is
    considered. Returns "" when the completion holds no usable line.
    """
    label = 'Answer:'
    label_pos = generated_text.find(label)
    if label_pos != -1:
        generated_text = generated_text[label_pos + len(label):]
    for line in generated_text.splitlines():
        line = line.strip()
        if line:
            return line
    return ""


def generate_answer(context, evolved_query, max_context_tokens=2048):
    """Generate a one-line answer to `evolved_query` grounded in `context`.

    Args:
        context: context object (typically a list of strings); its `str()`
            form is substituted into the prompt.
        evolved_query: the question or statement to answer.
        max_context_tokens: upper bound on the number of context tokens kept
            in the prompt.

    Returns:
        The first non-empty line of the model's completion (without an echoed
        'Answer:' label), or "" if the completion is blank.
    """
    # Trim the context itself rather than the whole prompt: the template puts
    # the context first, so truncating the full prompt would cut off the
    # trailing `Input:` / `Answer:` cue the model needs.
    context_text = str(context)
    context_ids = tokenizer(context_text, add_special_tokens=False).input_ids
    if len(context_ids) > max_context_tokens:
        context_text = tokenizer.decode(context_ids[:max_context_tokens], skip_special_tokens=True)

    prompt = expected_output_template.replace("{context}", context_text).replace("{evolved_query}", evolved_query)
    inputs = tokenizer(prompt, return_tensors='pt')
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    # The output sequence begins with the prompt; decode only the new tokens,
    # otherwise the search for 'Answer:' matches the prompt's own cue line and
    # the extracted answer is empty
    completion_tokens = outputs[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(completion_tokens, skip_special_tokens=True)
    return _parse_answer(generated_text)


In [None]:
def select_random_chunk_and_similar_chunks(documents, embeddings_np, threshold=0.7):
    """Pick a random chunk and return the text of every chunk similar to it.

    Args:
        documents: sequence of chunk objects exposing `page_content`.
        embeddings_np: per-chunk embeddings, indexed in the same order as
            `documents`.
        threshold: minimum cosine similarity (inclusive) for a chunk to be kept.

    Returns:
        List of `page_content` strings, in document order, for all chunks whose
        similarity to the randomly chosen chunk is at least `threshold`.
    """
    # Draw the anchor chunk uniformly over all document positions
    anchor_position = random.randint(0, len(documents) - 1)
    anchor_embedding = embeddings_np[anchor_position]

    # One row of similarities: anchor versus every chunk
    scores = cosine_similarity([anchor_embedding], embeddings_np)[0]

    # Keep the text of each chunk that clears the threshold
    return [
        documents[position].page_content
        for position, score in enumerate(scores)
        if score >= threshold
    ]

In [None]:
class SyntheticData(BaseModel):
    """One synthetic evaluation record: a query, its reference answer, and the context used."""

    # The evolved query text
    query: str
    # Reference answer; the explicit `None` default keeps the field optional on
    # pydantic v2 as well, where `Optional[...]` alone no longer implies a default
    expected_output: Optional[str] = None
    # Context strings the query and answer were generated from
    context: List[str]

# Number of synthetic records to generate
NUM_SAMPLES = 5

synthetic_dataset = []

for _ in range(NUM_SAMPLES):
    # Step 1: Select random chunk and similar chunks
    contexts = select_random_chunk_and_similar_chunks(documents, embeddings_np)

    # Step 2: Generate the initial query
    query = generate_query(contexts)

    # Step 3: Evolve the query
    evolved_query = evolve_query(query, contexts, num_evolution_steps)

    # Step 4: Generate the expected output
    expected_output = generate_answer(contexts, evolved_query)

    # Step 5: Append the synthetic data to the dataset
    synthetic_data = SyntheticData(
        query=evolved_query,
        expected_output=expected_output,
        context=contexts
    )
    synthetic_dataset.append(synthetic_data)

# pydantic v2 exposes `model_dump()` and deprecates `.dict()`; fall back to
# `.dict()` so the cell also runs on pydantic v1
records = [
    data.model_dump() if hasattr(data, "model_dump") else data.dict()
    for data in synthetic_dataset
]
df = pd.DataFrame(records)
df.to_csv('synthetic_dataset.csv', index=False)

print("Dataset created and saved as 'synthetic_dataset.csv'")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end gene

Dataset created and saved as 'synthetic_dataset.csv'


In [None]:
# Render the generated dataset as a plain-text table with PrettyTable:
# DataFrame columns become the header, each DataFrame row becomes a table row
table = PrettyTable()
table.field_names = df.columns

for row_values in df.itertuples(index=False, name=None):
    table.add_row(row_values)

print(table)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------