In [None]:
# Install everything (quietly)
!pip install -q langchain langchain-community faiss-cpu transformers sentence-transformers


# Imports
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline


# Step 1: Load a text file
file_path = "/content/smm.txt"  # upload your .txt file manually
# Read with an explicit encoding so the preview does not depend on the
# platform's default, and only pull the first 500 characters instead of
# reading the whole file just to slice it.
with open(file_path, "r", encoding="utf-8") as f:
    print(f.read(500))  # preview the first 500 characters

# Step 2: Load and split
# Use the same encoding as the preview above so both reads agree.
loader = TextLoader(file_path, encoding="utf-8")
documents = loader.load()
# Overlapping chunks keep some shared context across chunk boundaries.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = splitter.split_documents(documents)

# Step 3: Create embeddings + FAISS DB
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(docs, embedding_model)
retriever = db.as_retriever()

# Step 4: Summarization model
# The BART summarization pipeline is wrapped so LangChain can call it as an LLM.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
llm = HuggingFacePipeline(pipeline=summarizer)

# Step 5: RAG chain
# NOTE: the chain only feeds the retrieved chunks to the model, so the result
# summarizes the chunks most similar to the query rather than the full file.
rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Step 6: Ask it to summarize
query = "Summarize the document"
# `Chain.run` is deprecated; `invoke` takes the input under the chain's
# "query" key and returns a dict whose "result" entry holds the answer text.
summary = rag_chain.invoke({"query": query})["result"]
print("\n📄 Summary:\n", summary)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/2.5 MB[0m [31m20.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m43.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.3/423.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hClimate change refers to significant, long-term changes in the global climate. The Earth is currently experiencing rapid climate change, largely due 

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=summarizer)
  summary = rag_chain.run(query)



📄 Summary:
 Climate change refers to significant, long-term changes in the global climate. The Earth is currently experiencing rapid climate change, largely due to human activities such as the burning of fossil fuels, deforestation, and industrial processes. Immediate and collective action is essential to mitigate the impacts and ensure a sustainable future.


**Private Document Summarizer (RAG + LangChain):**

Developed a Retrieval-Augmented Generation (RAG)-based system that summarizes private text documents using open-source language models.

Tools: Python, LangChain, HuggingFace Transformers, FAISS, Gradio.

Project Highlights:

Implemented a RAG architecture using LangChain with a HuggingFace summarization model (BART).

Used SentenceTransformers for semantic document embedding and FAISS for efficient vector search.

Enabled document chunking and summarization without relying on paid APIs like OpenAI.

Built an interactive Gradio web interface for users to upload .txt files and receive concise summaries.

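Ensured privacy by running all model inference locally, with no calls to hosted LLM APIs (model weights are downloaded once from the Hugging Face Hub; when run in Colab, the Gradio demo is served through a temporary public share link).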

In [None]:
!pip install -q langchain langchain-community faiss-cpu transformers sentence-transformers gradio


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.5/46.5 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gradio as gr

import functools


@functools.lru_cache(maxsize=1)
def _load_models():
    """Build the summarization LLM and the embedding model once and reuse them.

    Loading these models is by far the slowest part of a request, so the
    result is cached for the lifetime of the session. If loading raises,
    nothing is cached and the next call retries.

    Returns:
        A ``(llm, embeddings)`` tuple: the BART summarization pipeline wrapped
        for LangChain, and the sentence-transformers embedding model.
    """
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain.llms import HuggingFacePipeline
    from transformers import pipeline

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    llm = HuggingFacePipeline(pipeline=summarizer)
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return llm, embeddings


def process_txt(file):
    """Summarize an uploaded text file with a retrieval-augmented chain.

    Args:
        file: The value Gradio passes for the upload: either a path string or
            a file-like object exposing the path as ``.name``. ``None`` when
            nothing was uploaded.

    Returns:
        The summary text, or a string starting with ``"❌ Error:"`` describing
        what went wrong. Errors are returned rather than raised so they show
        up in the output textbox.
    """
    # Submitting without an upload passes None; report that directly instead
    # of surfacing an AttributeError message to the user.
    if file is None:
        return "❌ Error: No file uploaded. Please upload a .txt file."

    try:
        from langchain_community.document_loaders import TextLoader
        from langchain.text_splitter import CharacterTextSplitter
        from langchain_community.vectorstores import FAISS
        from langchain.chains import RetrievalQA

        # Load summarizer and embeddings (cached after the first request)
        llm, embeddings = _load_models()

        # Load and process file; accept both a plain path and an object
        # carrying the path in `.name`.
        file_path = getattr(file, "name", file)
        loader = TextLoader(file_path)
        documents = loader.load()

        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        chunks = splitter.split_documents(documents)
        # An empty file yields no chunks, and building a vector index from
        # nothing fails with an unhelpful error.
        if not chunks:
            return "❌ Error: The uploaded file contains no text to summarize."

        db = FAISS.from_documents(chunks, embeddings)
        retriever = db.as_retriever()

        chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

        # `Chain.run` is deprecated; `invoke` returns a dict with the answer
        # under the "result" key.
        return chain.invoke({"query": "Summarize the document"})["result"]
    except Exception as e:
        # UI boundary: convert any failure into a message for the textbox.
        return f"❌ Error: {str(e)}"

# Gradio Interface
# Build the input/output widgets first, then wire them to the summarizer.
upload_box = gr.File(label="📄 Upload your .txt file")
summary_box = gr.Textbox(label="🧠 Summary")

demo = gr.Interface(
    fn=process_txt,
    inputs=upload_box,
    outputs=summary_box,
    title="LangChain RAG Summarizer (Free & Local)",
    description="Uses sentence-transformers + BART to summarize uploaded documents. No OpenAI key needed.",
)
# Start the web app with default launch settings.
demo.launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b7696407bf5eac2747.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


