In [19]:
!pip install langchain

!pip install transformers
!pip install sentence-transformers
!pip install pandas
!pip install plotly
!pip install gradio
!pip install langchain_community





In [20]:
import pandas as pd
import plotly.express as px
import gradio as gr
import io

from sentence_transformers import SentenceTransformer
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from transformers import pipeline



In [21]:
# Load embeddings model
embed_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def embed_columns(df):
    embeddings = {}
    for col in df.columns:
        text = col + " " + " ".join(df[col].astype(str).head(5))
        embeddings[col] = embed_model.encode(text)
    return embeddings





In [22]:
schemas = [
    ResponseSchema(name="chart_type", description="Type of chart: line, bar, scatter"),
    ResponseSchema(name="x_column", description="Column for X-axis"),
    ResponseSchema(name="y_column", description="Column for Y-axis"),
    ResponseSchema(name="insight", description="Textual insight about the data")
]

output_parser = StructuredOutputParser.from_response_schemas(schemas)

prompt = PromptTemplate(
    template="""
You are a data assistant. Analyze the dataset columns: {columns}.
User query: {query}.
Return **strict JSON only** with keys: chart_type, x_column, y_column, insight.
Example:
{{
  "chart_type": "line",
  "x_column": "Month",
  "y_column": "Revenue",
  "insight": "Revenue peaked in May."
}}
""",
    input_variables=["columns", "query"],
    output_parser=output_parser
)




In [23]:
# Hugging Face text-generation model
hf_pipeline = pipeline("text-generation", model="google/flan-t5-small", max_length=200)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

llm_chain = LLMChain(prompt=prompt, llm=llm)




Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'DogeForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'Exaone4ForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForC

In [24]:
def generate_chart(df, query):
    columns_text = ", ".join(df.columns)

    # Use invoke instead of run (new LangChain)
    chain_output = llm_chain.invoke({"columns": columns_text, "query": query})

    # Safely parse output
    try:
        parsed = output_parser.parse(chain_output)
    except:
        parsed = {
            "chart_type": "line",
            "x_column": df.columns[0],
            "y_column": df.columns[1],
            "insight": "Insight could not be generated, showing default chart."
        }

    chart_type = parsed['chart_type']
    x_col = parsed['x_column']
    y_col = parsed['y_column']
    insight = parsed['insight']

    # Generate Plotly chart
    if chart_type == "line":
        fig = px.line(df, x=x_col, y=y_col, title=insight)
    elif chart_type == "bar":
        fig = px.bar(df, x=x_col, y=y_col, title=insight)
    else:
        fig = px.scatter(df, x=x_col, y=y_col, title=insight)

    return fig, insight




In [26]:
def chatbot_interface(file, query):
    import io
    import pandas as pd

    # Handle different file types
    try:
        # If file has .read(), read as bytes
        file_bytes = file.read()
        df = pd.read_csv(io.BytesIO(file_bytes))
    except AttributeError:
        # If file is a NamedString or str
        try:
            df = pd.read_csv(file.name)  # temp file path
        except:
            # fallback: read as string
            df = pd.read_csv(io.StringIO(file))

    fig, insight = generate_chart(df, query)
    return fig, insight


In [27]:


gr.Interface(
    fn=chatbot_interface,
    inputs=[
        gr.File(file_types=[".csv"]),
        gr.Textbox(lines=2, placeholder="Ask something about your dataset")
    ],
    outputs=[gr.Plot(), gr.Textbox(label="AI Insight")],
    title="AI Data Visualization Chatbot (LangChain + HF Embeddings)"
).launch(share=True, debug=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://168cc65c271583ac80.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7862 <> https://168cc65c271583ac80.gradio.live


