In [1]:
!pip install -q transformers sentence-transformers faiss-cpu pdfplumber python-pptx


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
import pdfplumber
from pptx import Presentation
from pathlib import Path
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

# HuggingFace pipelines (no API key needed)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
sentiment = pipeline("sentiment-analysis")


Device set to use cuda:0
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


In [3]:
def extract_text_from_pdf(path: str) -> str:
    text = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    return "\n\n".join(text)

def extract_text_from_pptx(path: str) -> str:
    prs = Presentation(path)
    parts = []
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        if slide_text:
            parts.append("\n".join(slide_text))
    return "\n\n".join(parts)

def clean_text(s: str) -> str:
    s = re.sub(r"\s+", " ", s)
    return s.strip()


In [4]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

class VectorStore:
    def __init__(self, dim: int):
        self.index = faiss.IndexFlatIP(dim)
        self.chunks = []

    def add(self, texts):
        embeddings = embed_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)
        self.chunks.extend(texts)

    def search(self, query, k=3):
        qvec = embed_model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(qvec)
        D, I = self.index.search(qvec, k)
        return [(self.chunks[i], float(D[0][j])) for j,i in enumerate(I[0])]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [30]:
def analyze_startup(text: str):
    text = clean_text(text)

    # Summarize pitch deck
    summary = summarizer(text[:2000], max_length=130, min_length=50, do_sample=False)[0]['summary_text']

    # Sentiment (as proxy for tone)
    sentiment_result = sentiment(text)[0]

    # Heuristic scoring (simple word counts)
    scores = {
        "team": 7 if "team" in text.lower() else 5,
        "product": 7 if "product" in text.lower() else 5,
        "market": 7 if "market" in text.lower() else 5,
        "traction": 7 if "traction" in text.lower() else 5,
        "tech_moat": 7 if "patent" in text.lower() or "ai" in text.lower() else 5,
    }
    scores["overall"] = sum(scores.values())/len(scores)

    return {
        "summary": summary,
        "sentiment": sentiment_result,
        "scores": scores,
        "risks": [
            "Need validation of financial projections",
            "Market adoption risk",
            "Competition analysis required"
        ],
        "next_steps": [
            "Request detailed financial model",
            "Check customer references",
            "Assess defensibility of technology"
        ]
    }


In [31]:
def full_text_sentiment(text, chunk_size=512):
    sentiments = []
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i+chunk_size]
        res = sentiment(chunk)[0]
        sentiments.append(res)
    # Compute overall sentiment by majority or average score
    pos_score = sum(s['score'] for s in sentiments if s['label']=="POSITIVE") / len(sentiments)
    neg_score = sum(s['score'] for s in sentiments if s['label']=="NEGATIVE") / len(sentiments)
    overall_label = "POSITIVE" if pos_score >= neg_score else "NEGATIVE"
    overall_score = max(pos_score, neg_score)
    return {"label": overall_label, "score": overall_score}

sentiment_result = full_text_sentiment(text)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Upload your file in Colab first (left side > Files > Upload)
path = "/content/drive/My Drive/sample_pitchdeck.pdf"
text = extract_text_from_pdf(path)

result = analyze_startup(text)
#print(result)


In [32]:
import pandas as pd
from IPython.display import display

# Table 1: Summary & Sentiment
summary_df = pd.DataFrame([
    {
        "Summary": result["summary"],
        "Sentiment": result["sentiment"]["label"],
        "Sentiment Score": result["sentiment"]["score"]
    }
])

# Table 2: Scores
scores_df = pd.DataFrame([result["scores"]])

# Table 3: Risks
risks_df = pd.DataFrame({"Risks": result["risks"]})

# Table 4: Next Steps
steps_df = pd.DataFrame({"Next Steps": result["next_steps"]})

# Display all tables
display(summary_df)
display(scores_df)
display(risks_df)
display(steps_df)

# 1. Extract text from PDF
path = "/content/drive/My Drive/sample_pitchdeck.pdf"
text = extract_text_from_pdf(path)

# 2. Analyze startup
result = analyze_startup(text)

# 3. Display results in tabular form
import pandas as pd
from IPython.display import display

summary_df = pd.DataFrame([{
    "Summary": result["summary"],
    "Sentiment": result["sentiment"]["label"],
    "Sentiment Score": result["sentiment"]["score"]
}])
scores_df = pd.DataFrame([result["scores"]])
risks_df = pd.DataFrame({"Risks": result["risks"]})
steps_df = pd.DataFrame({"Next Steps": result["next_steps"]})

display(summary_df)
display(scores_df)
display(risks_df)
display(steps_df)

Unnamed: 0,Summary,Sentiment,Sentiment Score
0,StartupX provides a mobile-first banking solut...,NEGATIVE,0.994242


Unnamed: 0,team,product,market,traction,tech_moat,overall
0,7,7,7,7,7,7.0


Unnamed: 0,Risks
0,Need validation of financial projections
1,Market adoption risk
2,Competition analysis required


Unnamed: 0,Next Steps
0,Request detailed financial model
1,Check customer references
2,Assess defensibility of technology


Unnamed: 0,Summary,Sentiment,Sentiment Score
0,StartupX provides a mobile-first banking solut...,NEGATIVE,0.99063


Unnamed: 0,team,product,market,traction,tech_moat,overall
0,7,7,7,7,7,7.0


Unnamed: 0,Risks
0,Need validation of financial projections
1,Market adoption risk
2,Competition analysis required


Unnamed: 0,Next Steps
0,Request detailed financial model
1,Check customer references
2,Assess defensibility of technology


In [23]:
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

# Path to save the PDF
file_path_positive = "sample_pitchdeck_positive.pdf"

doc = SimpleDocTemplate(file_path_positive, pagesize=A4)
styles = getSampleStyleSheet()
story = []

content_positive = {
    "Company Overview": "StartupY is a fintech company with the tagline 'Empowering Everyone'. "
                        "Our mission is to make financial services easy and accessible for everyone.",
    "Problem Statement": "Many people face difficulty accessing banking services. "
                         "We are excited to solve this by providing user-friendly solutions.",
    "Solution / Product": "StartupY provides a mobile-first banking solution with zero-fee savings accounts, "
                          "instant microloans, and AI-powered budgeting tools. Users love our intuitive interface.",
    "Market Opportunity": "The fintech market is growing rapidly and offers enormous opportunities. "
                          "Our target market is enthusiastic about adopting innovative solutions.",
    "Business Model": "We generate revenue through small transaction fees and partnerships, "
                      "ensuring our service remains affordable and widely accessible.",
    "Traction": "In 12 months, StartupY gained 120,000 happy users, achieved $1.5M revenue, "
                "and maintained a high retention rate. Feedback has been overwhelmingly positive.",
    "Competition": "While other fintech startups exist, StartupY stands out due to our customer satisfaction "
                   "and innovative AI-driven features.",
    "Team": "Founded by experienced professionals with strong backgrounds in finance and technology. "
            "The team is motivated, cohesive, and visionary.",
    "Technology / IP (optional)": "Our AI-driven credit scoring tool and budgeting assistant have been highly effective. "
                                  "We continue to innovate and protect our unique technology.",
    "Financial Projections": "Projected Year 1 revenue: $1.5M, Year 3: $15M. The business model is sustainable with low burn rate.",
    "Funding Ask": "We are raising $5M to expand into additional markets and further enhance our product. "
                   "Investors can expect strong growth and high engagement.",
    "Vision / Roadmap": "Our vision is to create a world where everyone can access financial freedom. "
                        "Next milestones include launching new features, reaching 1.5M users, and expanding internationally."
}

for section, text in content_positive.items():
    story.append(Paragraph(f"<b>{section}</b>", styles["Heading2"]))
    story.append(Paragraph(text, styles["Normal"]))
    story.append(Spacer(1, 12))

doc.build(story)

print(f"PDF generated: {file_path_positive}")


PDF generated: sample_pitchdeck_positive.pdf


In [34]:
# Install required packages
!pip install -q transformers sentence-transformers faiss-cpu pdfplumber python-pptx reportlab

# -----------------------------
# Imports
# -----------------------------
import pdfplumber
from pptx import Presentation
import re
import pandas as pd
from IPython.display import display
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline
from google.colab import drive
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

# -----------------------------
# Mount Google Drive
# -----------------------------
drive.mount('/content/drive')

# -----------------------------
# Initialize NLP models
# -----------------------------
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
sentiment = pipeline("sentiment-analysis")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# -----------------------------
# Helper functions
# -----------------------------
def extract_text_from_pdf(path: str) -> str:
    text = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    return "\n\n".join(text)

def extract_text_from_pptx(path: str) -> str:
    prs = Presentation(path)
    parts = []
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        if slide_text:
            parts.append("\n".join(slide_text))
    return "\n\n".join(parts)

def clean_text(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

# -----------------------------
# Full-text sentiment analysis
# -----------------------------
def full_text_sentiment(text, chunk_size=512):
    sentiments = []
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i+chunk_size]
        res = sentiment(chunk)[0]
        sentiments.append(res)
    pos_score = sum(s['score'] for s in sentiments if s['label']=="POSITIVE") / len(sentiments)
    neg_score = sum(s['score'] for s in sentiments if s['label']=="NEGATIVE") / len(sentiments)
    overall_label = "POSITIVE" if pos_score >= neg_score else "NEGATIVE"
    overall_score = max(pos_score, neg_score)
    return {"label": overall_label, "score": overall_score}

# -----------------------------
# Startup analysis
# -----------------------------
def analyze_startup(text: str):
    text = clean_text(text)

    # Summarize first 2000 chars
    summary = summarizer(text[:2000], max_length=130, min_length=50, do_sample=False)[0]['summary_text']

    # Full-text sentiment
    sentiment_result = full_text_sentiment(text)

    # Heuristic scoring
    scores = {
        "team": 7 if "team" in text.lower() else 5,
        "product": 7 if "product" in text.lower() else 5,
        "market": 7 if "market" in text.lower() else 5,
        "traction": 7 if "traction" in text.lower() else 5,
        "tech_moat": 7 if "patent" in text.lower() or "ai" in text.lower() else 5,
    }
    scores["overall"] = sum(scores.values()) / len(scores)

    return {
        "summary": summary,
        "sentiment": sentiment_result,
        "scores": scores,
        "risks": [
            "Need validation of financial projections",
            "Market adoption risk",
            "Competition analysis required"
        ],
        "next_steps": [
            "Request detailed financial model",
            "Check customer references",
            "Assess defensibility of technology"
        ]
    }

# -----------------------------
# List of PDFs to analyze
# -----------------------------
files = [
    "/content/drive/My Drive/sample_pitchdeck.pdf",          # Negative
    "/content/drive/My Drive/sample_pitchdeck_positive.pdf"  # Positive
]

# -----------------------------
# Process each file and display tables
# -----------------------------
for f in files:
    print(f"\n--- Analyzing: {f} ---")
    text = extract_text_from_pdf(f)
    result = analyze_startup(text)

    summary_df = pd.DataFrame([{
        "Summary": result["summary"],
        "Sentiment": result["sentiment"]["label"],
        "Sentiment Score": result["sentiment"]["score"]
    }])
    scores_df = pd.DataFrame([result["scores"]])
    risks_df = pd.DataFrame({"Risks": result["risks"]})
    steps_df = pd.DataFrame({"Next Steps": result["next_steps"]})

    display(summary_df)
    display(scores_df)
    display(risks_df)
    display(steps_df)

# -----------------------------
# Optional: Generate a sample positive PDF
# -----------------------------
'''file_path_positive = "sample_pitchdeck_positive_generated.pdf"
doc = SimpleDocTemplate(file_path_positive, pagesize=A4)
styles = getSampleStyleSheet()
story = []

content_positive = {
    "Company Overview": "StartupY is a fintech company with the tagline 'Empowering Everyone'. "
                        "Our mission is to make financial services easy and accessible for everyone.",
    "Problem Statement": "Many people face difficulty accessing banking services. "
                         "We are excited to solve this by providing user-friendly solutions.",
    "Solution / Product": "StartupY provides a mobile-first banking solution with zero-fee savings accounts, "
                          "instant microloans, and AI-powered budgeting tools. Users love our intuitive interface.",
    "Market Opportunity": "The fintech market is growing rapidly and offers enormous opportunities. "
                          "Our target market is enthusiastic about adopting innovative solutions.",
    "Business Model": "We generate revenue through small transaction fees and partnerships, "
                      "ensuring our service remains affordable and widely accessible.",
    "Traction": "In 12 months, StartupY gained 120,000 happy users, achieved $1.5M revenue, "
                "and maintained a high retention rate. Feedback has been overwhelmingly positive.",
    "Competition": "While other fintech startups exist, StartupY stands out due to our customer satisfaction "
                   "and innovative AI-driven features.",
    "Team": "Founded by experienced professionals with strong backgrounds in finance and technology. "
            "The team is motivated, cohesive, and visionary.",
    "Technology / IP (optional)": "Our AI-driven credit scoring tool and budgeting assistant have been highly effective. "
                                  "We continue to innovate and protect our unique technology.",
    "Financial Projections": "Projected Year 1 revenue: $1.5M, Year 3: $15M. The business model is sustainable with low burn rate.",
    "Funding Ask": "We are raising $5M to expand into additional markets and further enhance our product. "
                   "Investors can expect strong growth and high engagement.",
    "Vision / Roadmap": "Our vision is to create a world where everyone can access financial freedom. "
                        "Next milestones include launching new features, reaching 1.5M users, and expanding internationally."
}'''

for section, text_block in content_positive.items():
    story.append(Paragraph(f"<b>{section}</b>", styles["Heading2"]))
    story.append(Paragraph(text_block, styles["Normal"]))
    story.append(Spacer(1, 12))

doc.build(story)
print(f"\nSample positive PDF generated: {file_path_positive}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Device set to use cuda:0
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0



--- Analyzing: /content/drive/My Drive/sample_pitchdeck.pdf ---


Unnamed: 0,Summary,Sentiment,Sentiment Score
0,StartupX provides a mobile-first banking solut...,NEGATIVE,0.493111


Unnamed: 0,team,product,market,traction,tech_moat,overall
0,7,7,7,7,7,7.0


Unnamed: 0,Risks
0,Need validation of financial projections
1,Market adoption risk
2,Competition analysis required


Unnamed: 0,Next Steps
0,Request detailed financial model
1,Check customer references
2,Assess defensibility of technology



--- Analyzing: /content/drive/My Drive/sample_pitchdeck_positive.pdf ---


Unnamed: 0,Summary,Sentiment,Sentiment Score
0,StartupY is a fintech company with the tagline...,POSITIVE,0.985873


Unnamed: 0,team,product,market,traction,tech_moat,overall
0,7,7,7,7,7,7.0


Unnamed: 0,Risks
0,Need validation of financial projections
1,Market adoption risk
2,Competition analysis required


Unnamed: 0,Next Steps
0,Request detailed financial model
1,Check customer references
2,Assess defensibility of technology



Sample positive PDF generated: sample_pitchdeck_positive_generated.pdf


In [20]:
!pip install reportlab


Collecting reportlab
  Downloading reportlab-4.4.4-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.4-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m1.8/2.0 MB[0m [31m53.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.4
