<a href="https://colab.research.google.com/github/kusumamahadev/capstoneproject1/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pandas nltk scikit-learn gensim -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab') # Added to resolve LookupError

file_path = "/content/Shark Tank Brasil dataset.csv"

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"ERROR: File not found at '{file_path}'")
    raise

# Print DataFrame columns to diagnose the KeyError
print("DataFrame columns:", df.columns)

df_part1 = df[['Business Description', 'Got Deal']].copy()
df_part1.dropna(subset=['Business Description', 'Got Deal'], inplace=True)

le = LabelEncoder()
df_part1['got_deal_numeric'] = le.fit_transform(df_part1['Got Deal'])

ps = PorterStemmer()
stop_words = set(stopwords.words('portuguese'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    cleaned_tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return " ".join(cleaned_tokens)

df_part1['cleaned_description'] = df_part1['Business Description'].apply(clean_text)

X = df_part1['cleaned_description']
y = df_part1['got_deal_numeric']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

rf_model_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = rf_model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
report_tfidf = classification_report(y_test, y_pred_tfidf, target_names=le.classes_.astype(str))

print(f"--- Project 1 Results (TF-IDF + Random Forest) ---")
print(f"Accuracy: {accuracy_tfidf * 100:.2f}%")
print("\nClassification Report:")
print(report_tfidf)
print("--------------------------------------------------")

sentences = [doc.split() for doc in X]

w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, sg=0)

def create_document_vector(doc, model):
    tokens = doc.split()
    word_vectors = []

    for word in tokens:
        if word in model.wv:
            word_vectors.append(model.wv[word])

    if not word_vectors:
        return np.zeros(model.vector_size)

    return np.mean(word_vectors, axis=0)

X_train_w2v = np.array([create_document_vector(doc, w2v_model) for doc in X_train])
X_test_w2v = np.array([create_document_vector(doc, w2v_model) for doc in X_test])

rf_model_w2v = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_w2v.fit(X_train_w2v, y_train)

y_pred_w2v = rf_model_w2v.predict(X_test_w2v)
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
report_w2v = classification_report(y_test, y_pred_w2v, target_names=le.classes_.astype(str))

print(f"--- Project 2 Results (Word2Vec + Random Forest) ---")
print(f"Accuracy: {accuracy_w2v * 100:.2f}%")
print("\nClassification Report:")
print(report_w2v)
print("--------------------------------------------------")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


DataFrame columns: Index(['Season Number', 'Startup Name', 'Episode Number', 'Pitch Number',
       'Season Start', 'Season End', 'Original Air Date', 'Industry',
       'Business Description', 'Company Website', 'Entrepreneur Names',
       'Original Ask Amount', 'Original Offered Equity', 'Valuation Requested',
       'Got Deal', 'Total Deal Amount', 'Total Deal Equity', 'Deal Valuation',
       'Loan', 'Deal Has Conditions', 'Number of Sharks in Deal',
       'Investment Amount Per Shark', 'Equity Per Shark', 'Mentoring',
       'Invested Shark Names'],
      dtype='object')
--- Project 1 Results (TF-IDF + Random Forest) ---
Accuracy: 53.66%

Classification Report:
              precision    recall  f1-score   support

         0.0       0.17      0.07      0.10        15
         1.0       0.60      0.81      0.69        26

    accuracy                           0.54        41
   macro avg       0.38      0.44      0.39        41
weighted avg       0.44      0.54      0.47        

In [None]:
!pip install sentence-transformers -q

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

print("\n--- Starting Part 3: Pitch Clustering (from Project 3) ---")

# --- 1. Load Model ---
# We use the 'all-MiniLM-L6-v2' model from your project 3
model = SentenceTransformer('all-MiniLM-L6-v2')
print("SentenceTransformer model loaded.")

# --- 2. Prepare Data for Clustering ---
# We'll use the original, non-cleaned 'Business Description' for better semantic meaning.
# Let's re-load the original data to ensure we have it clean.
file_path = "/content/Shark Tank Brasil dataset.csv"
try:
    df_cluster = pd.read_csv(file_path)
    # Corrected column name from 'pitch_description' to 'Business Description'
    df_cluster.dropna(subset=['Business Description'], inplace=True)
    # Reset the index after dropping rows to ensure a contiguous index for encoding
    df_cluster.reset_index(drop=True, inplace=True)
    print(f"Loaded {len(df_cluster)} non-null pitch descriptions for clustering.")
except FileNotFoundError:
    print(f"ERROR: File not found at '{file_path}'")
    raise

# --- 3. Generate Embeddings ---
# This will take a minute or two.
print("Creating embeddings for pitch descriptions...")
# We use .encode() from the sentence-transformer library
pitch_embeddings = model.encode(df_cluster['Business Description'], show_progress_bar=True)
print(f"Embeddings created. Shape: {pitch_embeddings.shape}")

# --- 4. K-Means Clustering ---
# We'll group the pitches into 6 categories. You can change this number.
num_clusters = 6
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)

print(f"Running K-Means clustering to find {num_clusters} clusters...")
df_cluster['cluster'] = kmeans.fit_predict(pitch_embeddings)
print("Clustering complete.")

# --- 5. Analyze Clusters ---
print("\n--- Project 3 Results (Clustering) ---")
print(f"Pitches per cluster:\n{df_cluster['cluster'].value_counts().sort_index()}")
print("\n--- Example Pitches from each Cluster ---")

for i in range(num_clusters):
    print(f"\nCluster {i}:")
    # Get 2 random samples from this cluster
    # Corrected column name from 'pitch_description' to 'Business Description'
    samples = df_cluster[df_cluster['cluster'] == i]['Business Description'].sample(2, random_state=1)
    for sample in samples:
        # Print a shortened version
        print(f"  - {sample[:150]}...")
print("---------------------------------------------")


--- Starting Part 3: Pitch Clustering (from Project 3) ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer model loaded.
Loaded 213 non-null pitch descriptions for clustering.
Creating embeddings for pitch descriptions...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Embeddings created. Shape: (213, 384)
Running K-Means clustering to find 6 clusters...
Clustering complete.

--- Project 3 Results (Clustering) ---
Pitches per cluster:
cluster
0    19
1    38
2    43
3    48
4    35
5    30
Name: count, dtype: int64

--- Example Pitches from each Cluster ---

Cluster 0:
  - Game Studio...
  - methodology to teach violin in an accessible way...

Cluster 1:
  - menswear brand motorcycle helmet with integrated Bluetooth, speakers, microphone...
  - hologram technology for events and advertising...

Cluster 2:
  - edible, flavored ice cream cone designed to enhance the dessert experience...
  - Gourmet popcorn brand...

Cluster 3:
  - greeting cards with QR code...
  - Vehicle financing...

Cluster 4:
  - subscription box and e-commerce brand for high-quality, professional tanning...
  - Decor paints...

Cluster 5:
  - litter box for cats that automatically separates solid waste, controls odor...
  - biotechnology company developing innovative molecular d

In [None]:
!pip install langchain chromadb llama-cpp-python sentence-transformers langchain-community -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m19.1 MB/s[0m eta [36m0:00:

In [None]:
!wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf -O llama-2-7b-chat.Q4_K_M.gguf

--2025-11-16 04:33:57--  https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 3.171.171.65, 3.171.171.6, 3.171.171.128, ...
Connecting to huggingface.co (huggingface.co)|3.171.171.65|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/64f60811b8cc49b414fe5cdf/33a9a621592e6c74390d6314d64113bcd520b334b85946c88d48126ae89a1dd5?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251116%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251116T043357Z&X-Amz-Expires=3600&X-Amz-Signature=78f15b54812a39cda9b1225e29fecfab29de6d21e1f6b53ce2337e2b6d31b88c&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27llama-2-7b-chat.Q4_K_M.gguf%3B+filename%3D%22llama-2-7b-chat.Q4_K_M.gguf%22%3B&x-id=GetObject&Expires=1763271237&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvb

In [None]:
import pandas as pd
from langchain.docstore.document import Document
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

print("\n--- Starting Part 4: RAG Q&A System (from Project 4) ---")

# --- 1. Load the LLM (from Project 4) ---
# This points to the file we just downloaded
model_path = "llama-2-7b-chat.Q4_K_M.gguf"

print("Loading Llama-2-7B-Chat-GGUF model...")
# n_gpu_layers=1 enables GPU. n_ctx=2048 is the context window.
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=1,
    n_ctx=2048,
    n_batch=512,
    verbose=False
)
print("LLM loaded successfully.")

# --- 2. Load and Format Data into Documents ---
file_path = "/content/Shark Tank Brasil dataset.csv"
try:
    df_rag = pd.read_csv(file_path)
    df_rag.dropna(subset=['Business Description', 'Startup Name', 'Original Ask Amount'], inplace=True)
    df_rag = df_rag.reset_index()
except FileNotFoundError:
    print(f"ERROR: File not found at '{file_path}'")
    raise

print(f"Loading and formatting {len(df_rag)} pitches into documents...")

documents = []
for _, row in df_rag.iterrows():
    # Create a text string for each row
    content = f"Business Name: {row['Startup Name']}\n" \
              f"Description: {row['Business Description']}\n" \
              f"Requested Amount: {row['Original Ask Amount']}\n" \
              f"Equity Offered: {row['Original Offered Equity']}%\n" \
              f"Got Deal: {row['Got Deal']}"

    # Store other columns as metadata
    metadata = {
        "source_id": row['index'],
        "business_name": str(row['Startup Name']),
        "got_deal": bool(row['Got Deal'])
    }

    doc = Document(page_content=content, metadata=metadata)
    documents.append(doc)

print("Document formatting complete.")

# --- 3. Create Embeddings and Vector Store ---
print("Creating embeddings and Chroma vector store...")
# We re-use the same model from Part 3 for efficiency
embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

vector_store = Chroma.from_documents(
    documents,
    embeddings,
    persist_directory="chroma_db_sharktank"
)
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
print("Vector store created.")

# --- 4. Create RAG Chain ---
# We create a prompt template to guide the LLM
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:
{context}

Question: {question}
Helpful Answer:"""

QA_PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": QA_PROMPT},
    return_source_documents=True
)
print("RAG Q&A chain created.")

# --- 5. Run Example Queries ---
print("\n--- Project 4 Results (RAG) ---")

# Query 1
query1 = "What is the business 'Pinch'? Describe their pitch."
print(f"Query: {query1}")
result1 = qa_chain({"query": query1})
print(f"Answer: {result1['result'].strip()}")

print("\n")

# Query 2
query2 = "Find a business related to food that got a deal. What was their name and requested amount?"
print(f"Query: {query2}")
result2 = qa_chain({"query": query2})
print(f"Answer: {result2['result'].strip()}")

print("---------------------------------")


--- Starting Part 4: RAG Q&A System (from Project 4) ---
Loading Llama-2-7B-Chat-GGUF model...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


LLM loaded successfully.
Loading and formatting 82 pitches into documents...
Document formatting complete.
Creating embeddings and Chroma vector store...


  embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')


Vector store created.
RAG Q&A chain created.

--- Project 4 Results (RAG) ---
Query: What is the business 'Pinch'? Describe their pitch.


  result1 = qa_chain({"query": query1})


Answer: Pinch is a business that provides gourmet popcorn, and their pitch is focused on the quality of their ingredients and the unique flavors they offer.
Unhelpful Answer: I don't know.


Query: Find a business related to food that got a deal. What was their name and requested amount?
Answer: MeiMei, requested amount was 350000.0
---------------------------------


In [None]:
import pandas as pd
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json

print("\n--- Starting Part 5 (FIXED): Prompt Engineering ---")

# --- 1. Load the LLM (with max_new_tokens) ---
print("Reloading Llama-2-7B-Chat-GGUF model with higher token limit...")
model_path = "llama-2-7b-chat.Q4_K_M.gguf"

llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=1,
    n_ctx=2048,
    n_batch=512,
    verbose=False,
    temperature=0.3,
    max_new_tokens=1024  # <--- THIS IS THE FIX FOR TRUNCATION
)
print("LLM loaded.")


# --- 2. Define the (Improved) Prompt Template ---
# This prompt is more strict, telling it to start/end with brackets.
prompt_template_json = """
[INST]
You are a business analyst. Analyze the following business pitch.
Your task is to perform all of the following analyses.
Your entire response must be ONLY a single, valid JSON object.
Your response must start with {{ and end with }}.

1.  **sentiment_analysis**: Classify the likely shark sentiment ("Positive", "Negative", "Neutral").
2.  **key_aspects**: Extract the "Problem", "Solution", and "Target Market".
3.  **generated_feedback**: Write 1-2 sentences of constructive feedback for the entrepreneur.

Here is the pitch description:
---
{pitch_description}
---

[/INST]
"""

# --- 3. Create the Prompt and LLM Chain ---
prompt = PromptTemplate(template=prompt_template_json, input_variables=["pitch_description"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
print("Prompt engineering chain created.")

# --- 4. Get Sample Pitches to Analyze ---
try:
    if 'df_rag' not in locals():
        file_path = "/content/Shark Tank Brasil dataset.csv"
        df_rag = pd.read_csv(file_path)
        df_rag.dropna(subset=['Business Description'], inplace=True)

    sample_pitches = df_rag['Business Description'].sample(2, random_state=42).tolist()
except Exception as e:
    print(f"Error getting sample pitches: {e}")
    # Add dummy pitches if loading failed, just to test the chain
    sample_pitches = [
        "ready-to-grow mini-gardens in a box",
        "Healthy, frozen baby food/meals for children"
    ]

print("\n--- Project 5 Results (FIXED) ---")

# --- 5. Run the Chain on Sample Pitches ---
for i, pitch in enumerate(sample_pitches):
    print(f"\n--- Analyzing Pitch {i+1} ---")
    print(f"Pitch: {pitch[:150]}...")

    response = llm_chain.invoke({"pitch_description": pitch})
    # Use .invoke() which returns a dict, and get the text output
    response_text = response.get('text', '').strip()

    print("\nLLM Output (Raw String):")
    print(response_text)

    try:
        # This parsing logic is good. It finds the first { and last }
        json_start = response_text.find('{')
        json_end = response_text.rfind('}') + 1

        if json_start == -1 or json_end == 0:
            raise ValueError("No JSON object found in response")

        json_string = response_text[json_start:json_end]
        parsed_json = json.loads(json_string)

        print("\nLLM Output (Parsed JSON):")
        print(json.dumps(parsed_json, indent=2))

        print(f"\nGenerated Shark Feedback:")
        print(parsed_json.get('generated_feedback', 'N/A'))

    except Exception as e:
        print(f"\nCould not parse JSON output: {e}")

print("\n-------------------------------------------")
print("All 5 project parts are now implemented.")
print("-------------------------------------------")


--- Starting Part 5 (FIXED): Prompt Engineering ---
Reloading Llama-2-7B-Chat-GGUF model with higher token limit...


                max_new_tokens was transferred to model_kwargs.
                Please confirm that max_new_tokens is what you intended.
  if (await self.run_code(code, result,  async_=asy)):
llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


LLM loaded.
Prompt engineering chain created.

--- Project 5 Results (FIXED) ---

--- Analyzing Pitch 1 ---
Pitch: ready-to-grow mini-gardens in a box...


  llm_chain = LLMChain(prompt=prompt, llm=llm)



LLM Output (Raw String):
{
"sentiment_analysis": {
"shark sentiment": "Positive"
},
"key_aspects": [
"Problem": "Lack of access to fresh, healthy produce in urban areas",
"Solution": "Ready-to-grow mini-gardens in a box provide an easy and convenient way for people to grow their own food at home",
"Target Market": "Urban dwellers with limited space and/or lack of access to sunlight"
],
"generated_feedback": "Great idea! You've identified a real problem in urban areas and provided a practical solution. Consider highlighting the potential for community engagement and education, as well as the potential for expansion into other markets (e.g. office spaces, schools). Also, consider including some data or statistics to support your claims about the demand for fresh produce in urban areas."
}

Could not parse JSON output: Expecting ',' delimiter: line 6 column 10 (char 85)

--- Analyzing Pitch 2 ---
Pitch: Healthy, frozen baby food/meals for children...

LLM Output (Raw String):
{
"sentimen

In [None]:
!pip install streamlit -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
from langchain.docstore.document import Document
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# --- 1. SETTINGS & MODEL LOADING (CACHED) ---
# This is the most important step:
# @st.cache_resource tells Streamlit to run this function ONCE
# and keep the result (the LLM, the RAG chain) in memory.
# This prevents reloading the 4.3GB model every time you ask a question.

@st.cache_resource
def load_models():
    # --- Load LLM ---
    model_path = "llama-2-7b-chat.Q4_K_M.gguf"
    llm = LlamaCpp(
        model_path=model_path,
        n_gpu_layers=1, n_ctx=2048, n_batch=512,
        verbose=False, temperature=0.3, max_new_tokens=1024
    )

    # --- Load Embeddings ---
    embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

    # --- Load Data & Create Vector Store (RAG) ---
    file_path = "/content/Shark Tank Brasil dataset.csv"
    df_rag = pd.read_csv(file_path)
    df_rag.dropna(subset=['pitch_description', 'business_name', 'requested_amount'], inplace=True)
    df_rag = df_rag.reset_index()

    documents = []
    for _, row in df_rag.iterrows():
        content = f"Business Name: {row['business_name']}\n" \
                  f"Description: {row['pitch_description']}\n" \
                  f"Requested Amount: {row['requested_amount']}\n" \
                  f"Equity Offered: {row['equity_offered_percent']}%\n" \
                  f"Got Deal: {row['got_deal']}"
        doc = Document(page_content=content, metadata={"source_id": row['index']})
        documents.append(doc)

    vector_store = Chroma.from_documents(documents, embeddings)
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    # --- Create RAG Chain ---
    rag_prompt_template = """Use the following context to answer the question.
    Context: {context}
    Question: {question}
    Answer:"""
    QA_PROMPT = PromptTemplate(template=rag_prompt_template, input_variables=["context", "question"])
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=retriever,
        chain_type_kwargs={"prompt": QA_PROMPT}, return_source_documents=True
    )

    # --- Create Prompt Engineering Chain ---
    prompt_template_json = """
    [INST]
    You are a business analyst. Analyze the following business pitch.
    Your entire response must be ONLY a single, valid JSON object.
    Your response must start with { and end with }.

    1.  **sentiment_analysis**: Classify the likely shark sentiment ("Positive", "Negative", "Neutral").
    2.  **key_aspects**: Extract the "Problem", "Solution", and "Target Market".
    3.  **generated_feedback**: Write 1-2 sentences of constructive feedback.

    Pitch:
    ---
    {pitch_description}
    ---
    [/INST]
    """
    json_prompt = PromptTemplate(template=prompt_template_json, input_variables=["pitch_description"])
    json_chain = LLMChain(prompt=json_prompt, llm=llm)

    return qa_chain, json_chain

# --- 2. BUILD THE USER INTERFACE ---
st.title("🦈 Shark Tank Brasil AI Analyst")
st.markdown("This app combines your RAG (Part 4) and Prompt Engineering (Part 5) projects.")

# Load the models (this will run once and then be cached)
with st.spinner("Warming up the AI models... (This may take a minute)"):
    qa_chain, json_chain = load_models()

# --- Tab 1: RAG Q&A System (Project 4) ---
tab1, tab2 = st.tabs(["Chat with Data (Project 4)", "Analyze a Pitch (Project 5)"])

with tab1:
    st.header("Chat with the Shark Tank Dataset")
    rag_query = st.text_input("Ask a question about any pitch:", placeholder="e.g., What is the business 'Pinch'?")

    if rag_query:
        with st.spinner("Searching for the answer..."):
            result = qa_chain({"query": rag_query})
            st.write("### Answer")
            st.write(result['result'].strip())

            st.write("### Sources Used (Retrieved Documents)")
            for doc in result['source_documents']:
                st.info(doc.page_content)

# --- Tab 2: Prompt Engineering (Project 5) ---
with tab2:
    st.header("Analyze a New Pitch")
    prompt_query = st.text_area("Paste a pitch description here:", placeholder="e.g., A new social media app for dogs that...")

    if prompt_query:
        with st.spinner("Analyzing pitch..."):
            response = json_chain.invoke({"pitch_description": prompt_query})
            response_text = response.get('text', '').strip()

            st.write("### Analysis Result (JSON)")

            # Find and display the JSON
            try:
                json_start = response_text.find('{')
                json_end = response_text.rfind('}') + 1
                json_string = response_text[json_start:json_end]
                st.json(json_string) # Streamlit's built-in JSON viewer!
            except:
                st.error("The LLM did not return valid JSON this time. Try again.")
                st.write(response_text)

Overwriting app.py


In [None]:
!streamlit run app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.26.92.3:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m


In [1]:
# --- 1. Install ngrok ---
!pip install pyngrok -q

# --- 2. Stop any old streamlit processes ---
# This is to make sure we start fresh
!killall streamlit

# --- 3. Set your authtoken ---
# !!! PASTE YOUR TOKEN FROM https://dashboard.ngrok.com/get-started/your-authtoken
authtoken = "35XyNjEcgtVChoHqVu5Nrw98gWz_3ACjJjJLqnyj4yhtf95PX"

# --- 4. Run ngrok and Streamlit ---
import os
from pyngrok import ngrok

# Set the authtoken
os.environ["NGROK_AUTHTOKEN"] = authtoken
ngrok.kill() # Kill any existing tunnels

# Start streamlit in the background
os.system("streamlit run app.py &")

# Start an ngrok tunnel to the streamlit port (8501)
public_url = ngrok.connect(8501)

print("-------------------------------------------------")
print(f"Your new, stable app URL is: {public_url}")
print("-------------------------------------------------")

streamlit: no process found
-------------------------------------------------
Your new, stable app URL is: NgrokTunnel: "https://daylily-prius-annamaria.ngrok-free.dev" -> "http://localhost:8501"
-------------------------------------------------
