In [1]:
%%capture
%pip install transformers safetensors langchain chromadb faiss-cpu nltk bitsandbytes pandas sklearn tiktoken sentence-transformers trl peft
%pip install torch --ignore-installed
%pip install accelerate 
%pip install transformers datasets



In [2]:
%%capture
import pandas as pd
import torch
from torch import bfloat16
from transformers import AutoTokenizer, AutoModel,pipeline,TrainingArguments
from langchain.text_splitter import TokenTextSplitter
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import Chroma
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, LLMChain
from transformers import pipeline, set_seed
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from IPython.display import Markdown, display
from transformers import GPT2LMHeadModel, GPT2Tokenizer









In [3]:
# Set the path of the csv file. 
dataset_path = r"C:\Users\48512\Desktop\RAG\medium.csv"     # Path here
offload_path = r"C:\Users\48512\Desktop\RAG\Offloader"      # Path here



In [4]:
df = pd.read_csv(dataset_path)

In [5]:
%%capture

# Load the pre-trained transformer model and its corresponding tokenizer from Hugging Face's Transformers library.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Check if the tokenizer has a padding token set; if not, add '[PAD]' as the padding token.
# This is necessary for models that require input lengths to be uniform.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer)) 
tokenizer.padding_side = "right"
model_ = AutoModel.from_pretrained(model_id)




In [6]:
# Initialize a DataFrameLoader to load data from a DataFrame. 
titles = DataFrameLoader(df, page_content_column="Title")
document = titles.load()

# Create a TokenTextSplitter instance configured to split text into chunks.
# 'chunk_size' specifies the number of tokens each chunk should contain, and
# 'chunk_overlap' specifies the number of tokens that subsequent chunks will overlap.
splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=75)
splitted_texts = splitter.split_documents(document)






In [7]:
# Initialize an embedding model with a specified transformer model from Hugging Face.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a Chroma database from split text documents using the specified embedding model.
chroma_database = Chroma.from_documents(splitted_texts,
                                      embedding_model,
                                      persist_directory = 'chroma_db')

retriever = chroma_database.as_retriever()

In [8]:
def embed_text(texts, batch_size=128):

    all_embeddings = []
    # Iterate over the list of texts in batches of size batch_size.
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        with torch.no_grad():# Disable gradients for efficiency.
            # Tokenize and pad texts, converting them to PyTorch tensors.
            tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
            embeddings = model_(**tokens).last_hidden_state[:, 0, :]
            all_embeddings.append(embeddings.cpu().numpy())
    # Combine all batch embeddings into a single array and return.
    return np.vstack(all_embeddings)

In [9]:
texts = df['Title'].tolist()

embeddings = embed_text(texts)
# Initialize a PCA object from the scikit-learn library with 128 target components.
pca = PCA(n_components=128)
# Fit the PCA model to the embeddings and transform them to the reduced space.
reduced_embeddings = pca.fit_transform(embeddings)

In [10]:
# Get the number of dimensions in the reduced embeddings.
dimension = reduced_embeddings.shape[1]
# Create a quantizer index for L2 distance. This will be used to partition the dataset.
quantizer = faiss.IndexFlatL2(dimension)
# Create an index using an Inverted File system with the quantizer. This index is optimized for L2 distance, contains 20 partitions, and uses the quantizer for the coarse quantization.
index = faiss.IndexIVFFlat(quantizer, dimension, 20, faiss.METRIC_L2)
# Train the index with the reduced embeddings.
index.train(reduced_embeddings)
index.add(reduced_embeddings)

In [11]:



def search_articles(query, k=3):
    # Embed the query text
    query_vec = embed_text([query])
    # Transform the query embedding to the reduced dimensionality space using PCA.
    query_vec = pca.transform(query_vec) 
     # Perform a search on the FAISS index
    distances, indices = index.search(query_vec, k)
     # Loop through each found index and distance and Retrieve the title and text of the corresponding article using the index
    print(f'Query: {query}')
    for idx, dist in zip(indices[0], distances[0]):
        title = df.iloc[idx]['Title']
        text = df.iloc[idx]['Text']
       # Limit the displayed text  
        limited_text = ' '.join(text.split()[:300]) + "..."
        print(f"\nTitle: {title}\nText: {limited_text}\nDistance: {dist}\n")
        return text

In [12]:
query = "What is the impact of climate change on polar bears?"
search_articles(query)

Query: What is the impact of climate change on polar bears?

Title: Time Series Analysis & Climate Change
Text: Wrangling of CO₂ emissions data This section will tackle the wrangling of our Carbon Dioxide emissions data. We will use some of the same techniques used above, as well as looking at some new ones: Slicing and Searching Useful functions Familiar techniques From our DataFrame, we will use only the row representing the CO₂ emissions for the entire world. Like before, we will create a new DataFrame that uses a DateTime index — and then use the raw data to populate it. Creating a DataFrame — and populating it — with world emissions data Resulting emissions DataFrame Slicing and Searching DateTime indexes make for convenient slicing of data, let’s select all of our data after the year 2011: e[e.index.year>2011] Slice of emissions data after the year 2011 (notice the missing data) (CREDIT: Author on Jupyter Notebook) Hmm. There seems to be a few NaN’s towards the end of our data — 

"Wrangling of CO₂ emissions data\n\nThis section will tackle the wrangling of our Carbon Dioxide emissions data. We will use some of the same techniques used above, as well as looking at some new ones:\n\nSlicing and Searching\n\nUseful functions\n\nFamiliar techniques\n\nFrom our DataFrame, we will use only the row representing the CO₂ emissions for the entire world. Like before, we will create a new DataFrame that uses a DateTime index — and then use the raw data to populate it.\n\nCreating a DataFrame — and populating it — with world emissions data\n\nResulting emissions DataFrame\n\nSlicing and Searching\n\nDateTime indexes make for convenient slicing of data, let’s select all of our data after the year 2011:\n\ne[e.index.year>2011]\n\nSlice of emissions data after the year 2011 (notice the missing data) (CREDIT: Author on Jupyter Notebook)\n\nHmm. There seems to be a few NaN’s towards the end of our data — lets use Panda’s fillna method to deal with this.\n\ne.fillna(method='ffill

In [13]:

model_gpt = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer_gpt = GPT2Tokenizer.from_pretrained("gpt2")
model_embedding = SentenceTransformer('all-MiniLM-L6-v2')

documents = texts
doc_embeddings = model_embedding.encode(documents)
pca = PCA(n_components=5)
reduced_embeddings = pca.fit_transform(doc_embeddings)
index = faiss.IndexFlatL2(5)
index.add(reduced_embeddings.astype(np.float32))





In [18]:
def retrieve_context(query, k=1):
    query_embedding = model_embedding.encode([query])
    query_embedding = pca.transform(query_embedding)
    _, indices = index.search(query_embedding.astype(np.float32), k)
    return " ".join(documents[i] for i in indices[0])

def generate_response(query):
    context = retrieve_context(query)
    input_ids = tokenizer_gpt.encode(context + query, return_tensors="pt")
    generated_ids = model_gpt.generate( input_ids,
    do_sample=True,
    max_length=150,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3)
    response = tokenizer_gpt.decode(generated_ids[0], skip_special_tokens=True)
    return response

In [21]:
question = "What is the Data Science?"
answer = generate_response(question)
print( answer)

3 Things Every Aspiring Data Scientist Should Know Before Their First JobWhat is the Data Science? This article examines data science careers and provides the general guidelines and details on data science jobs and careers for data scientists and their organizations.1

Research and analysis: A Data Scientist's Career The first part of this article is about the research and analysis required by the study of data science. 2 Then there is the following points: How will this data science career be different from other career fields and how will this be affected by different disciplines? How is the study about data science related to your field/industry relevant to you? Does the methodology for data science related to this career matter to you? Finally, if you are not familiar with the different data
