In [1]:
import numpy as np
import pandas as pd
from transformers import pipeline

# Define scoring weights
weights = {
    'team_size': 0.2,
    'market_opportunity': 0.2,
    'innovation': 0.15,
    'business_model': 0.15,
    'scalability': 0.1,
    'traction': 0.2
}

# Define the scoring criteria
criteria = {
    'team_size': {
        1: "Single founder, no team, no experience",
        2: "Team of 2+, little to no experience",
        3: "Complementary team with some founders having significant work experience",
        4: "Serial Entrepreneur(s) with no exits",
        5: "Serial Entrepreneur(s) with multiple exits"
    },
    'market_opportunity': {
        1: "No market need, unclear problem that they are solving. Poor customer identification.",
        2: "Product solves a problem with a midsize market, well served by competitors",
        3: "Product solves a problem with a large market, well served by some competitors",
        4: "Product solves a problem and has an attractive niche in a large market. Good value proposition to customers. Clear customer identification with unique positioning.",
        5: "Product solves a problem and has an attractive niche in a large market. Very strong value proposition to customers. Clear customer identification with unique positioning in mostly untapped market (more than or equals to USD 1 billion)."
    },
    'innovation': {
        1: "No to Low innovation - localization of proven business models without change",
        2: "Low innovation - localization of proven business model adapted to some markets",
        3: "Some innovation - significant improvement of existing solution",
        4: "Some unique IP, patents or data (pending patent)",
        5: "Very strong innovation (IP / data)"
    },
    'business_model': {
        1: "Business model is impossible to realize",
        2: "Hints at possible business model, financial projections need to be worked on",
        3: "Business model explained, but not validated. Financial projections may or may not be available",
        4: "Business model explained and first validation / tests are successful with real customers.",
        5: "Good revenue model / business model is defined and has been validated with large number of customers"
    },
    'scalability': {
        1: "Solution is very manual / manpower heavy - no chance at scalability",
        2: "Solution has the potential to scale beyond 1 city / small country, but has some issues to scale",
        3: "Solution has no issues to scale globally or within home country but scaling has not started",
        4: "Great potential to scale globally and has started to establish good networks in these countries but scaling has not started yet.",
        5: "Has become well established in 1 City / small country. Has not started to scale anywhere else. No issue to scale otherwise"
    },
    'traction': {
        1: "Very little traction (social media engagement, landing page collecting potential customer info)",
        2: "Prototype testing with initial customers (Beta testing)",
        3: "Generating first revenues with paying customers. Has clear milestones and KPIs",
        4: "Generating moderate revenue with paying customers. Has developed a lot of interest / relationships with key market participants or have significant traffic and engagement with customers.",
        5: "Sustainable business (significant profit)"
    }
}

# Define sentiment analysis pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

# Function to calculate weighted score
def calculate_weighted_score(row):
    score = (
        row['team_size'] * weights['team_size'] +
        row['market_opportunity'] * weights['market_opportunity'] +
        row['innovation'] * weights['innovation'] +
        row['business_model'] * weights['business_model'] +
        row['scalability'] * weights['scalability'] +
        row['traction'] * weights['traction']
    )
    return score

# Function to generate paragraph for each sample
def generate_paragraph(row):
    paragraph = (
        f"Team Size: {criteria['team_size'][row['team_size']]}\n"
        f"Market Opportunity / Problem to be solved: {criteria['market_opportunity'][row['market_opportunity']]}\n"
        f"Innovation: {criteria['innovation'][row['innovation']]}\n"
        f"Business Model: {criteria['business_model'][row['business_model']]}\n"
        f"Scalability: {criteria['scalability'][row['scalability']]}\n"
        f"Traction: {criteria['traction'][row['traction']]}\n"
    )
    return paragraph

# Example data (replace with your actual data)
num_samples = 2
startup_data = {
    'team_size': np.random.randint(1, 6, size=num_samples),
    'market_opportunity': np.random.randint(1, 6, size=num_samples),
    'innovation': np.random.randint(1, 6, size=num_samples),
    'business_model': np.random.randint(1, 6, size=num_samples),
    'scalability': np.random.randint(1, 6, size=num_samples),
    'traction': np.random.randint(1, 6, size=num_samples),
}

# Convert the data to a DataFrame
startup_df = pd.DataFrame(startup_data)

# Calculate scores and sentiment analysis for each startup
results = []
for i in range(num_samples):
    row = startup_df.iloc[i]
    weighted_score = calculate_weighted_score(row)
    eligibility = 'ELIGIBLE' if weighted_score >= 3.0 else 'NOT ELIGIBLE'
    paragraph = generate_paragraph(row)
    sentiment = sentiment_analysis(paragraph)[0]  # Analyze sentiment of the paragraph
    sentiment_label = sentiment['label']
    sentiment_score = sentiment['score']
    results.append({
        'text': paragraph,
        'label': eligibility,
        'sentiment_label': sentiment_label,
        'sentiment_score': sentiment_score,
        'weighted_score': weighted_score
    })

# Create a DataFrame with the results
results_df = pd.DataFrame(results)

# Save the results to a CSV file
results_df[['text', 'label']].to_csv('synthetic_data_paragraphs.csv', index=False)

# Display the first few rows with paragraphs, eligibility, and sentiment analysis
print(results_df[['text', 'label', 'sentiment_label', 'sentiment_score']].head())

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

                                                text         label  \
0  Team Size: Single founder, no team, no experie...  NOT ELIGIBLE   
1  Team Size: Single founder, no team, no experie...      ELIGIBLE   

  sentiment_label  sentiment_score  
0        POSITIVE         0.998593  
1        POSITIVE         0.998511  


In [2]:
import pandas as pd
from transformers import pipeline

In [3]:
startup_df = pd.read_csv('/content/synthetic_data_paragraphs.csv')

In [4]:
def transform_paragraph(row):
    paragraph = row['text']
    lines = paragraph.split('\n')
    team_size = lines[0].split(': ')[1]
    market_opportunity = lines[1].split(': ')[1]
    innovation = lines[2].split(': ')[1]
    business_model = lines[3].split(': ')[1]
    scalability = lines[4].split(': ')[1]
    traction = lines[5].split(': ')[1]

    transformed_paragraph = f"Company Name: Startup {row.name + 1}\n"
    transformed_paragraph += "Gender: MALE\n"  # Assuming all samples are MALE for now
    transformed_paragraph += "Company Description: [Company description placeholder]\n"
    transformed_paragraph += "Company Website: [Company website placeholder]\n"
    transformed_paragraph += "Job Titles: Chief Executive Officer (CEO)\n"
    transformed_paragraph += "Business Model: [Business model placeholder]\n"
    transformed_paragraph += f"Revenue: {row['label']}\n"
    transformed_paragraph += "Profit: Not generating profit yet\n"
    transformed_paragraph += "Total External Funding: 0\n"
    transformed_paragraph += "Notable Investors: [Investor names placeholder]\n"
    transformed_paragraph += "Competition Region: North America\n"
    transformed_paragraph += "\n"
    transformed_paragraph += f"Team Size: {team_size}\n"
    transformed_paragraph += f"Market Opportunity / Problem to be solved: {market_opportunity}\n"
    transformed_paragraph += f"Innovation: {innovation}\n"
    transformed_paragraph += f"Business Model: {business_model}\n"
    transformed_paragraph += f"Scalability: {scalability}\n"
    transformed_paragraph += f"Traction: {traction}\n"

    return transformed_paragraph

In [5]:
transformed_paragraphs = startup_df.apply(transform_paragraph, axis=1)

In [6]:
with open('transformed_paragraphs.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(transformed_paragraphs))

In [8]:
import re
import pandas as pd
from transformers import pipeline

# Define scoring weights
weights = {
    'team_size': 0.2,
    'market_opportunity': 0.2,
    'innovation': 0.15,
    'business_model': 0.15,
    'scalability': 0.1,
    'traction': 0.2
}

# Define the scoring criteria
criteria = {
    'team_size': {
        1: "Single founder, no team, no experience",
        2: "Team of 2+, little to no experience",
        3: "Complementary team with some founders having significant work experience",
        4: "Serial Entrepreneur(s) with no exits",
        5: "Serial Entrepreneur(s) with multiple exits"
    },
    'market_opportunity': {
        1: "No market need, unclear problem that they are solving. Poor customer identification.",
        2: "Product solves a problem with a midsize market, well served by competitors",
        3: "Product solves a problem with a large market, well served by some competitors",
        4: "Product solves a problem and has an attractive niche in a large market. Good value proposition to customers. Clear customer identification with unique positioning.",
        5: "Product solves a problem and has an attractive niche in a large market. Very strong value proposition to customers. Clear customer identification with unique positioning in mostly untapped market (more than or equals to USD 1 billion)."
    },
    'innovation': {
        1: "No to Low innovation - localization of proven business models without change",
        2: "Low innovation - localization of proven business model adapted to some markets",
        3: "Some innovation - significant improvement of existing solution",
        4: "Some unique IP, patents or data (pending patent)",
        5: "Very strong innovation (IP / data)"
    },
    'business_model': {
        1: "Business model is impossible to realize",
        2: "Hints at possible business model, financial projections need to be worked on",
        3: "Business model explained, but not validated. Financial projections may or may not be available",
        4: "Business model explained and first validation / tests are successful with real customers.",
        5: "Good revenue model / business model is defined and has been validated with large number of customers"
    },
    'scalability': {
        1: "Solution is very manual / manpower heavy - no chance at scalability",
        2: "Solution has the potential to scale beyond 1 city / small country, but has some issues to scale",
        3: "Solution has no issues to scale globally or within home country but scaling has not started",
        4: "Great potential to scale globally and has started to establish good networks in these countries but scaling has not started yet.",
        5: "Has become well established in 1 City / small country. Has not started to scale anywhere else. No issue to scale otherwise"
    },
    'traction': {
        1: "Very little traction (social media engagement, landing page collecting potential customer info)",
        2: "Prototype testing with initial customers (Beta testing)",
        3: "Generating first revenues with paying customers. Has clear milestones and KPIs",
        4: "Generating moderate revenue with paying customers. Has developed a lot of interest / relationships with key market participants or have significant traffic and engagement with customers.",
        5: "Sustainable business (significant profit)"
    }
}

# Define sentiment analysis pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

# Function to calculate weighted score
def calculate_weighted_score(row):
    score = (
        row['team_size'] * weights['team_size'] +
        row['market_opportunity'] * weights['market_opportunity'] +
        row['innovation'] * weights['innovation'] +
        row['business_model'] * weights['business_model'] +
        row['scalability'] * weights['scalability'] +
        row['traction'] * weights['traction']
    )
    return score

# Function to extract information from the input text
def extract_information(text):
    team_size = re.search(r'Team Size: (.*)', text).group(1)
    market_opportunity = re.search(r'Market Opportunity / Problem to be solved: (.*)', text).group(1)
    innovation = re.search(r'Innovation: (.*)', text).group(1)
    business_model = re.search(r'Business Model: (.*)', text).group(1)
    scalability = re.search(r'Scalability: (.*)', text).group(1)
    traction = re.search(r'Traction: (.*)', text).group(1)

    return {
        'team_size': criteria['team_size'][team_size],
        'market_opportunity': criteria['market_opportunity'][market_opportunity],
        'innovation': criteria['innovation'][innovation],
        'business_model': criteria['business_model'][business_model],
        'scalability': criteria['scalability'][scalability],
        'traction': criteria['traction'][traction]
    }

# Function to determine eligibility
def determine_eligibility(text):
    info = extract_information(text)
    weighted_score = calculate_weighted_score(info)
    eligibility = 'ELIGIBLE' if weighted_score >= 3.0 else 'NOT ELIGIBLE'
    paragraph = '\n'.join([f"{k}: {v}" for k, v in info.items()])
    sentiment = sentiment_analysis(paragraph)[0]  # Analyze sentiment of the paragraph
    sentiment_label = sentiment['label']
    sentiment_score = sentiment['score']

    return {
        'text': text,
        'label': eligibility,
        'sentiment_label': sentiment_label,
        'sentiment_score': sentiment_score,
        'weighted_score': weighted_score
    }

# Example input text
input_text = """Company Name: Startup 1
Gender: FEMALE
Company Description: Dropbox lets you save and access all your files and photos in one place for easy sharing. Easily share files & access team content from your computer, mobile or any web browser.
Company Website: https://www.dropbox.com/
Job Titles: Chief Operating Officer (COO)/ Head of Operations
Business Model: nan
Revenue: $50,001 - $250,000 (USD)
Profit: Not generating profit yet
Total External Funding: 4000000
Notable Investors: Y Combinator, Sequioa Capital
Competition Region: North America

Team Size: Complementary team with some founders having significant work experience
Market Opportunity / Problem to be solved: Product solves a problem and has an attractive niche in a large market. Very strong value proposition to customers. Clear customer identification with unique positioning in mostly untapped market (more than or equals to USD 1 billion).
Innovation: Some unique IP, patents or data (pending patent)
Business Model: Good revenue model / business model is defined and has been validated with large number of customers
Scalability: Solution has no issues to scale globally or within home country but scaling has not started
Traction: Prototype testing with initial customers (Beta testing)
"""

# Determine eligibility and print the result
result = determine_eligibility(input_text)
print(result)

KeyError: 'Complementary team with some founders having significant work experience'

In [16]:
import re
import pandas as pd
from transformers import pipeline

# Define scoring weights
weights = {
    'team_size': 0.2,
    'market_opportunity': 0.2,
    'innovation': 0.15,
    'business_model': 0.15,
    'scalability': 0.1,
    'traction': 0.2
}

# Define the scoring criteria
criteria = {
    'team_size': {
        "Single founder, no team, no experience": 1,
        "Team of 2+, little to no experience": 2,
        "Complementary team with some founders having significant work experience": 3,
        "Serial Entrepreneur(s) with no exits": 4,
        "Serial Entrepreneur(s) with multiple exits": 5
    },
    'market_opportunity': {
        "No market need, unclear problem that they are solving. Poor customer identification.": 1,
        "Product solves a problem with a midsize market, well served by competitors": 2,
        "Product solves a problem with a large market, well served by some competitors": 3,
        "Product solves a problem and has an attractive niche in a large market. Good value proposition to customers. Clear customer identification with unique positioning.": 4,
        "Product solves a problem and has an attractive niche in a large market. Very strong value proposition to customers. Clear customer identification with unique positioning in mostly untapped market (more than or equals to USD 1 billion).": 5
    },
    'innovation': {
        "No to Low innovation - localization of proven business models without change": 1,
        "Low innovation - localization of proven business model adapted to some markets": 2,
        "Some innovation - significant improvement of existing solution": 3,
        "Some unique IP, patents or data (pending patent)": 4,
        "Very strong innovation (IP / data)": 5
    },
    'business_model': {
        "Business model is impossible to realize": 1,
        "Hints at possible business model, financial projections need to be worked on": 2,
        "Business model explained, but not validated. Financial projections may or may not be available": 3,
        "Business model explained and first validation / tests are successful with real customers.": 4,
        "Good revenue model / business model is defined and has been validated with large number of customers": 5
    },
    'scalability': {
        "Solution is very manual / manpower heavy - no chance at scalability": 1,
        "Solution has the potential to scale beyond 1 city / small country, but has some issues to scale": 2,
        "Solution has no issues to scale globally or within home country but scaling has not started": 3,
        "Great potential to scale globally and has started to establish good networks in these countries but scaling has not started yet.": 4,
        "Has become well established in 1 City / small country. Has not started to scale anywhere else. No issue to scale otherwise": 5
    },
    'traction': {
        "Very little traction (social media engagement, landing page collecting potential customer info)": 1,
        "Prototype testing with initial customers (Beta testing)": 2,
        "Generating first revenues with paying customers. Has clear milestones and KPIs": 3,
        "Generating moderate revenue with paying customers. Has developed a lot of interest / relationships with key market participants or have significant traffic and engagement with customers.": 4,
        "Sustainable business (significant profit)": 5
    }
}

# Define sentiment analysis pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

# Function to calculate weighted score
def calculate_weighted_score(row):
    score = (
        row['team_size'] * weights['team_size'] +
        row['market_opportunity'] * weights['market_opportunity'] +
        row['innovation'] * weights['innovation'] +
        row['business_model'] * weights['business_model'] +
        row['scalability'] * weights['scalability'] +
        row['traction'] * weights['traction']
    )
    return score

# Function to extract information from the input text
def extract_information(text):
    def safe_extract(pattern, text):
        match = re.search(pattern, text)
        return match.group(1) if match else 'nan'

    team_size = safe_extract(r'Team Size: (.*)', text)
    market_opportunity = safe_extract(r'Market Opportunity / Problem to be solved: (.*)', text)
    innovation = safe_extract(r'Innovation: (.*)', text)
    business_model = safe_extract(r'Business Model: (.*)', text)
    scalability = safe_extract(r'Scalability: (.*)', text)
    traction = safe_extract(r'Traction: (.*)', text)

    return {
        'team_size': criteria['team_size'].get(team_size, 0),
        'market_opportunity': criteria['market_opportunity'].get(market_opportunity, 0),
        'innovation': criteria['innovation'].get(innovation, 0),
        'business_model': criteria['business_model'].get(business_model, 0),
        'scalability': criteria['scalability'].get(scalability, 0),
        'traction': criteria['traction'].get(traction, 0)
    }

# Function to determine eligibility
def determine_eligibility(text):
    info = extract_information(text)
    weighted_score = calculate_weighted_score(info)
    eligibility = 'ELIGIBLE' if weighted_score >= 2.9 else 'NOT ELIGIBLE'
    paragraph = '\n'.join([f"{k}: {v}" for k, v in info.items()])
    sentiment = sentiment_analysis(paragraph)[0]
    sentiment_label = sentiment['label']
    sentiment_score = sentiment['score']

    return {
        'text': text,
        'label': eligibility,
        'sentiment_label': sentiment_label,
        'sentiment_score': sentiment_score,
        'weighted_score': weighted_score
    }

# Example input text
input_text = """Company Name: Startup 1
Gender: FEMALE
Company Description: Dropbox lets you save and access all your files and photos in one place for easy sharing. Easily share files & access team content from your computer, mobile or any web browser.
Company Website: https://www.dropbox.com/
Job Titles: Chief Operating Officer (COO)/ Head of Operations
Business Model: nan
Revenue: $50,001 - $250,000 (USD)
Profit: Not generating profit yet
Total External Funding: 4000000
Notable Investors: Y Combinator, Sequioa Capital
Competition Region: North America

Team Size: Complementary team with some founders having significant work experience
Market Opportunity / Problem to be solved: Product solves a problem and has an attractive niche in a large market. Very strong value proposition to customers. Clear customer identification with unique positioning in mostly untapped market (more than or equals to USD 1 billion).
Innovation: Some unique IP, patents or data (pending patent)
Business Model: Good revenue model / business model is defined and has been validated with large number of customers
Scalability: Solution has no issues to scale globally or within home country but scaling has not started
Traction: Prototype testing with initial customers (Beta testing)
"""

# Determine eligibility and store the result
result = determine_eligibility(input_text)

# Create a DataFrame from the result
df = pd.DataFrame([result])

# Save the DataFrame to a CSV file
df.to_csv('eligibility_results.csv', index=False)

# Print the result
print(result)

{'text': 'Company Name: Startup 1\nGender: FEMALE\nCompany Description: Dropbox lets you save and access all your files and photos in one place for easy sharing. Easily share files & access team content from your computer, mobile or any web browser. \nCompany Website: https://www.dropbox.com/\nJob Titles: Chief Operating Officer (COO)/ Head of Operations\nBusiness Model: nan\nRevenue: $50,001 - $250,000 (USD)\nProfit: Not generating profit yet\nTotal External Funding: 4000000\nNotable Investors: Y Combinator, Sequioa Capital\nCompetition Region: North America\n\nTeam Size: Complementary team with some founders having significant work experience\nMarket Opportunity / Problem to be solved: Product solves a problem and has an attractive niche in a large market. Very strong value proposition to customers. Clear customer identification with unique positioning in mostly untapped market (more than or equals to USD 1 billion).\nInnovation: Some unique IP, patents or data (pending patent)\nBusi