### Loading Data from Text and CSV Files

In [4]:
# Import from langchain_community, where TextLoader is actually defined;
# `langchain.document_loaders` is only a deprecated re-export of it.
from langchain_community.document_loaders import TextLoader

# Read the text file into a list of Document objects.
loader = TextLoader("../data/AI_impact.txt")
data = loader.load()

data

[Document(metadata={'source': '../data/AI_impact.txt'}, page_content="Transforming industries with AI: Lessons from China’s journey\n\nArtificial intelligence (AI) is rapidly reshaping industries worldwide, with China emerging as a key player in demonstrating how AI can drive industrial transformation at scale.\n\nWith a growing AI industry valued at over $70 billion and a dynamic ecosystem of over 4,300 companies, China provides insights into how nations can align strategy, innovation, and ecosystem development to harness AI's transformative potential.\n\nThe newly released whitepaper Industries in the Intelligent Age - Blueprint to Action: China's Path to AI-powered Industry Transformation from the World Economic Forum's AI Governance Alliance highlights how the country uses AI to address industry-specific challenges while illustrating the complexities of responsibly scaling AI innovations.\n\nWhile China’s model may not be universally applicable, it offers valuable lessons on foster

In [5]:
# Show how many Documents were loaded, then the concrete class of the loader.
print(len(data), type(loader), sep="\n")

1
<class 'langchain_community.document_loaders.text.TextLoader'>


In [6]:
# Import from langchain_community; the `langchain.document_loaders` path is a
# deprecated alias for the same loader.
from langchain_community.document_loaders.csv_loader import CSVLoader

# Each CSV row becomes its own Document (the row index is kept in the metadata).
loader = CSVLoader(file_path="../data/sample.csv")
data = loader.load()

data

[Document(metadata={'source': '../data/sample.csv', 'row': 0}, page_content='text: Meditation apps are gaining popularity among students\ncategory: Health'),
 Document(metadata={'source': '../data/sample.csv', 'row': 1}, page_content='text: Regular morning walks can help reduce stress levels\ncategory: Health'),
 Document(metadata={'source': '../data/sample.csv', 'row': 2}, page_content='text: Doctors recommend balanced diets for heart patients\ncategory: Health'),
 Document(metadata={'source': '../data/sample.csv', 'row': 3}, page_content='text: Yoga retreats are attracting working professionals\ncategory: Health'),
 Document(metadata={'source': '../data/sample.csv', 'row': 4}, page_content='text: A new skincare brand launched its eco-friendly range\ncategory: Fashion'),
 Document(metadata={'source': '../data/sample.csv', 'row': 5}, page_content='text: Oversized blazers are trending this winter\ncategory: Fashion'),
 Document(metadata={'source': '../data/sample.csv', 'row': 6}, page_c

In [7]:
data[0]  # first Document: the text of CSV row 0 together with its metadata

Document(metadata={'source': '../data/sample.csv', 'row': 0}, page_content='text: Meditation apps are gaining popularity among students\ncategory: Health')

In [8]:
# The row is rendered as one "column: value" line per CSV column.
data[0].page_content

'text: Meditation apps are gaining popularity among students\ncategory: Health'

In [9]:
# Metadata records the source file path and the row index of this Document.
data[0].metadata

{'source': '../data/sample.csv', 'row': 0}

### UnstructuredURLLoader

In [10]:
# Unstructured URL loader
# Imported from langchain_community; `langchain.document_loaders` is a
# deprecated alias for the same class.
from langchain_community.document_loaders import UnstructuredURLLoader

loader = UnstructuredURLLoader(
    # URL other than wikipedia
    urls=[
        # "https://www.mckinsey.com/industries/life-sciences/our-insights/scaling-gen-ai-in-the-life-sciences-industry",
        "https://www.businessnewsdaily.com/9402-artificial-intelligence-business-trends.html",
    ]
)

# Fetch the page and extract its text into a list of Documents.
data_url = loader.load()
data_url

[Document(metadata={'source': 'https://www.businessnewsdaily.com/9402-artificial-intelligence-business-trends.html'}, page_content="Menu\n\nStart\n\nOur Recommendations\n\nBest Small Business Loans for 2025\n\nBusinessloans.com Review\n\nBiz2Credit Review\n\nSBG Funding Review\n\nRapid Finance Review\n\nOur Guides\n\n30 Great Business Ideas for Motivated Entrepreneurs\n\nStartup Costs: How Much Cash Will You Need?\n\nHow to Get a Bank Loan for Your Small Business\n\nArticles of Incorporation: What New Business Owners Should Know\n\nHow to Choose the Best Legal Structure for Your Business\n\nSmall Business Resources\n\nBusiness Ideas\n\nBusiness Plans\n\nStartup Basics\n\nStartup Funding\n\nFranchising\n\nSuccess Stories\n\nEntrepreneurs\n\nGrow\n\nOur Recommendations\n\nThe Best Credit Card Processors of 2025\n\nClover Credit Card Processing Review\n\nMerchant One Review\n\nStax Review\n\nOur Guides\n\nHow to Conduct a Market Analysis for Your Business\n\nLocal Marketing Strategies for

In [11]:
# Raw extracted page text; note that it begins with the site's navigation menu.
data_url[0].page_content

"Menu\n\nStart\n\nOur Recommendations\n\nBest Small Business Loans for 2025\n\nBusinessloans.com Review\n\nBiz2Credit Review\n\nSBG Funding Review\n\nRapid Finance Review\n\nOur Guides\n\n30 Great Business Ideas for Motivated Entrepreneurs\n\nStartup Costs: How Much Cash Will You Need?\n\nHow to Get a Bank Loan for Your Small Business\n\nArticles of Incorporation: What New Business Owners Should Know\n\nHow to Choose the Best Legal Structure for Your Business\n\nSmall Business Resources\n\nBusiness Ideas\n\nBusiness Plans\n\nStartup Basics\n\nStartup Funding\n\nFranchising\n\nSuccess Stories\n\nEntrepreneurs\n\nGrow\n\nOur Recommendations\n\nThe Best Credit Card Processors of 2025\n\nClover Credit Card Processing Review\n\nMerchant One Review\n\nStax Review\n\nOur Guides\n\nHow to Conduct a Market Analysis for Your Business\n\nLocal Marketing Strategies for Success\n\nTips for Hiring a Marketing Company\n\nBenefits of CRM Systems\n\n10 Employee Recruitment Strategies for Success\n\nSma

In [12]:
# Metadata keeps the URL the Document was loaded from.
data_url[0].metadata

{'source': 'https://www.businessnewsdaily.com/9402-artificial-intelligence-business-trends.html'}

### Text Splitters
Text splitters break long text into smaller chunks so that each chunk fits within the context window of the LLM.


#### CharacterTextSplitter

In [18]:
# Sample passage used to compare the text splitters below: a title line
# followed by four paragraphs separated by blank lines.
text = """AI’s industry-specific impact
China’s approach to AI emphasizes practical applications tailored to the unique needs of various industries.

By integrating AI technologies such as digital twins, predictive maintenance and generative AI, industries such as manufacturing, healthcare, transportation, retail and energy are witnessing transformative advancements.

These include optimizing production processes, enhancing diagnostics and patient care, enabling autonomous transport systems, personalizing consumer experiences and improving renewable energy management.

This sector-focused innovation exemplifies how AI can be applied at scale to drive efficiency, sustainability, and new business and operational models. The emphasis on tailoring AI solutions to specific needs showcases an approach other regions could adopt to maximize impact and overcome barriers to adoption.
"""

In [19]:
from langchain.text_splitter import CharacterTextSplitter

# Split on single newlines only, targeting chunks of 100 characters
# (measured with len) with no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator="\n", chunk_size=100, chunk_overlap=0, length_function=len
)

#### Splitting with CharacterTextSplitter

In [28]:
# Run the current `text_splitter` over the sample text and count the chunks.
chunks = text_splitter.split_text(text)
len(chunks)

Created a chunk of size 108, which is longer than the specified 100
Created a chunk of size 219, which is longer than the specified 100
Created a chunk of size 203, which is longer than the specified 100


5

In [29]:
# Print the character count of every chunk, one per line.
for chunk_length in map(len, chunks):
    print(chunk_length)

# The text is split on the "\n" separator only, so any single line longer than
# 100 characters still ends up as a chunk that exceeds chunk_size.
# RecursiveCharacterTextSplitter addresses this by working through a list of
# separators to break the text into smaller chunks.


29
108
219
203
310


#### RecursiveCharacterTextSplitter

In [30]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Candidate separators run from coarsest (blank line) to finest (empty string,
# i.e. between characters); chunk_size is 100 with no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_size=100,
    chunk_overlap=0,
    length_function=len,
)



In [31]:
# Re-split the same sample text with the current `text_splitter` and count the chunks.
chunks = text_splitter.split_text(text)
len(chunks)

17

In [32]:
# Print the character count of every chunk, one per line.
for chunk_length in map(len, chunks):
    print(chunk_length)


29
96
10
1
94
62
62
1
85
39
78
1
93
57
100
58
1


In [33]:
# Understanding how RecursiveCharacterTextSplitter works by playing with different separators
# Keep only the part of the text that precedes the first blank line ("\n\n").
first_split = text.partition("\n\n")[0]
first_split

'AI’s industry-specific impact\nChina’s approach to AI emphasizes practical applications tailored to the unique needs of various industries.'

In [34]:
# Length of the piece obtained by splitting on "\n\n"; compare it with chunk_size=100.
len(first_split)

138

In [35]:
# Keep only the part of the text that precedes the first newline.
second_split = text.partition("\n")[0]
second_split

'AI’s industry-specific impact'

In [36]:
# Length of the piece obtained by splitting on "\n"; compare it with chunk_size=100.
len(second_split)

29

### API Keys

In [37]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Either value is None when the corresponding variable is not defined.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Write back only the keys that were found: assigning None to os.environ raises
# TypeError, which made this cell crash whenever one of the keys was missing.
for key_name, key_value in (
    ("OPENAI_API_KEY", OPENAI_API_KEY),
    ("GROQ_API_KEY", GROQ_API_KEY),
):
    if key_value:
        os.environ[key_name] = key_value
    else:
        print(f"Warning: {key_name} is not set; calls that need it will fail.")


In [38]:
# embedding
from langchain_huggingface import HuggingFaceEmbeddings 

import torch

# is_available must be *called*: the bare function object is always truthy, so
# without the parentheses "cuda" was selected even on machines with no GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"loading device... {device}")

def download_hf_embeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2", device=None
):
    """Build a HuggingFaceEmbeddings instance for a sentence-transformers model.

    Args:
        model_name: Name of the embedding model to load. Defaults to the
            MiniLM model this notebook has always used.
        device: Optional device (a string such as "cpu"/"cuda" or a
            torch.device) forwarded through ``model_kwargs``. When None, no
            device is passed and the library chooses one itself.

    Returns:
        The configured HuggingFaceEmbeddings object.
    """
    # model_kwargs stays empty unless the caller explicitly asks for a device.
    model_kwargs = {} if device is None else {"device": str(device)}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
    )
    return embeddings

# Create the embedding model and display its configuration.
embeddings = download_hf_embeddings()
embeddings

loading device... {device(type='cuda')}


HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

### Retrieval

In [75]:
# Splitter for retrieval: chunk_size=500 with a chunk_overlap of 50; separators
# and length function are left at the library defaults.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Split the Documents loaded from the URL and count the resulting chunks.
docs = text_splitter.split_documents(data_url)
len(docs)

55

In [78]:
# First chunk: it keeps the source URL in its metadata and consists of the page's menu text.
docs[0]

Document(metadata={'source': 'https://www.businessnewsdaily.com/9402-artificial-intelligence-business-trends.html'}, page_content='Menu\n\nStart\n\nOur Recommendations\n\nBest Small Business Loans for 2025\n\nBusinessloans.com Review\n\nBiz2Credit Review\n\nSBG Funding Review\n\nRapid Finance Review\n\nOur Guides\n\n30 Great Business Ideas for Motivated Entrepreneurs\n\nStartup Costs: How Much Cash Will You Need?\n\nHow to Get a Bank Loan for Your Small Business\n\nArticles of Incorporation: What New Business Owners Should Know\n\nHow to Choose the Best Legal Structure for Your Business\n\nSmall Business Resources\n\nBusiness Ideas\n\nBusiness Plans')

In [79]:
# Create embeddings for these chunks of text and store them in a vector store
# FAISS is imported from langchain_community, where the class is defined;
# `langchain.vectorstores` is only a deprecated alias.
from langchain_community.vectorstores import FAISS

# create the vector store: embeds every chunk in `docs` with `embeddings`
vector_idx = FAISS.from_documents(docs, embeddings)

vector_idx

<langchain_community.vectorstores.faiss.FAISS at 0x263c31b71f0>

In [80]:
# Save the FAISS index to disk using pickle
# NOTE(review): this pickles the whole vector-store object. The FAISS wrapper
# also appears to offer save_local()/load_local(), which may be the more
# robust way to persist it; confirm against the installed version before switching.
import pickle
import os 

filepath = "faiss_index.pkl"
with open(filepath, "wb") as f:
    pickle.dump(vector_idx, f)


# Round trip: read the file that was just written back into `vector_idx`.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that this notebook produced itself.
if os.path.exists(filepath):
    with open(filepath, "rb") as f:
        vector_idx = pickle.load(f)
    print("FAISS index loaded from disk.")

FAISS index loaded from disk.


In [83]:
# Now, we can use this vector store to perform similarity search and retrieve relevant documents based on a query.

import langchain
from langchain.globals import set_debug
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQAWithSourcesChain

# chat_model = ChatOpenAI(model="gpt-3.5-turbo")
# temperature=0.0 and max_tokens=500 are the generation settings passed to ChatGroq.
llama_chat_model = ChatGroq(
    model="llama-3.1-8b-instant", temperature=0.0, max_tokens=500, api_key=GROQ_API_KEY
)

# The chain uses the vector store as its retriever and the LLM above to
# produce an answer together with its sources.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llama_chat_model,
    retriever=vector_idx.as_retriever(),
)

# query = "How AI is transforming business"
query = "Who said that Artificial intelligence is kind of the second coming of software"

# Enable verbose chain tracing through the supported setter rather than by
# assigning to the deprecated `langchain.debug` attribute.
set_debug(True)

# Calling the chain object directly is deprecated; invoke() is the supported
# entry point and still honours return_only_outputs.
chain.invoke({"question": query}, return_only_outputs=True)


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Who said that Artificial intelligence is kind of the second coming of software"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who said that Artificial intelligence is kind of the second coming of software",
  "summaries": "Content: “Artificial intelligence is kind of the second coming of software,” explained Amir Husain, founder of ML company SparkCognition. “It’s a form of software that makes decisions on its own, that’s able to act even in situations not foreseen by the programmers. Artificial intelligence has a wider latitude of decision-making ability [than] traditional software.”\n\nAI’s abilities 

{'answer': 'Amir Husain, the founder of ML company SparkCognition, said that Artificial intelligence is kind of the second coming of software.\n',
 'sources': 'https://www.businessnewsdaily.com/9402-artificial-intelligence-business-trends.html'}