<a href="https://colab.research.google.com/github/leonakouame/demo-repo/blob/main/Sourcing_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import all necessary packages
import openai
import os
import json
import requests
from langchain import hub
from langchain import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from langchain.agents import AgentExecutor, create_openai_functions_agent, load_tools

import requests
from bs4 import BeautifulSoup
import html2text

import urllib.parse
from lxml import html
from googlesearch import search
from google.colab import files

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import random

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Final: Sourcing Agent

In [None]:
!pip install openai langchain-openai langchain google-search-results langchainhub html2text

In [None]:
# LLM Agent to extract startups: Simplified query

def extract_startup_urls(query, llm, tools):
    prompt = hub.pull("hwchase17/openai-functions-agent")
    agent = create_openai_functions_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
    input = f"Find 3 {query} startup companies backed by Y-Combinator. Output a Python list of each company's ycombinator urls in the format ['url1','url2'] without any extra information."
    response = agent_executor.invoke({"input": input})
    company_urls = response['output']
    return company_urls

In [None]:
# LLM Agent to extract startups: Extended query

def extract_startup_urls(query, llm, tools):

    prompt = hub.pull("hwchase17/openai-functions-agent")
    agent = create_openai_functions_agent(llm, tools, prompt)
    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

    input = f"""Find 10 {query} startups that are not called Pachama, Bend or Pina Earth. Ensure that the startups are listed on Y-Combinator's website (ycombinator.com). Retrieve the URLs of these startups from the search results and output a Python list containing these Y-Combinator URLs in the format without any extra information. For example: ['https://www.ycombinator.com/companies/company-name1'] where you will replace company-name1 with the real company name, using - to separate between spaces.

Consider the following instructions:
1. Ensure that the startups retrieved are specifically backed by Y-Combinator.
2. Extract only the URLs of the startups from the search results.
3. Provide the URLs in the output list without any additional information.
4. Use a reliable method to extract the Y-Combinator URLs accurately.

You are not allowed to make any assumptions while extracting the information. Every link you provide should be from the information given. There should be no assumptions for Links/URLs. You should not return code to do it.
"""
    response = agent_executor.invoke({"input": input})


    company_urls = response['output']
    return company_urls

    # try:
    #     # Convert the string representation to a list of URLs
    #     urls = json.loads(user_data["text"])
    #     return urls
    # except json.JSONDecodeError:
    #     print("Error decoding JSON")
    #     return None


In [None]:
# Extracting html text data for a given ycombinator url
def extract_html_from_url(url, max_retries=3): # Inconsistent result so allow the function to try again as a simple error handling method
    for attempt in range(max_retries):
        try:
            # Fetch HTML content from the URL using requests
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise an exception for bad responses (4xx and 5xx)

            # Parse HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            excluded_tagNames = ['footer', 'nav']
            # Exclude elements with class names 'footer' and 'navbar'
            excluded_tags = excluded_tagNames or []  # Default to an empty list if not provided
            for tag_name in excluded_tags:
                for unwanted_tag in soup.find_all(tag_name):
                    unwanted_tag.extract()

            # Convert HTML to plain text using html2text
            text_content = html2text.html2text(str(soup))

            # Return the successfully extracted content
            return text_content

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from {url}: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying ({attempt + 1}/{max_retries})...")
            else:
                print(f"All attempts failed. Giving up.")

In [None]:
# Given a y combinator url, output a dictionary with relevant investment information
def extract_company_info(url: str, llm):
    print("Hello LLM")

    summary_template = """ given the company information {information} of a company on Y-combinator page in html format, I want you to extract information about the company. You are not allowed to make any assumptions while extracting the information. Every link you provide should be from the information given. There should be no assumptions for Links/URLS. You should not return code to do it.:
            You should extract the following text information from the html of the company page and save it as a dictionary. Include the following fields:
            1. name: Full Name of the company.
            2. descr: Description of the company.
            3. url: URL of the company.
            4. founders: Names of the Founder/Founders of the company.
            5. us: 1 if the company location is in the US, 0 else.
            6. minority_founder - 1 if there is a Black/ LatinX/Women founder, 0 else.
            7. founding_experience: 1 if founders have previous founding experience.
            8. funding: Total Funding amount to date (just the number without the $ or M sign).
            9. reputable_investors: 1 if Reputable Investors back the startup - think investors like Sequoia, else 0.
            10. market: one word to summarize the market segment they are in - look in the tags for things like Healthtech, AI etc and summarize what their focus is.
            11. age: Company Age - find when the company was founded and calculate age from this year (2024).
        """

    prompt = PromptTemplate(
        template=summary_template,
        input_variables=["information"],#list of dynamic data variables
    )

    llm_chain = LLMChain(llm=llm, prompt=prompt)
    company_profile_data = extract_html_from_url(url)

    user_data = llm_chain.invoke(
            input={"information": company_profile_data},
            return_only_outputs=True,
        )

    try:
        # Convert the string representation to a dictionary
        company_info_dict = json.loads(user_data["text"])
        return company_info_dict
    except json.JSONDecodeError:
        print(f"Error decoding JSON for URL: {url}")
        return None


In [None]:
# Ensures names are in correct format for crunchbase API
def format_founder_names(founder_names):
    formatted_names = [founder.lower().replace(" ", "-") for founder in founder_names]
    return formatted_names

In [None]:
# Using Crunchbase API to extract founder data
def get_person_data(query, api_key):
    base_url = "https://api.crunchbase.com/api/v4/entities/people"
    user_key = api_key  # Replace with your Crunchbase API key

    # Construct the full URL with query parameters
    url = f"{base_url}/{query}?field_ids=num_exits,num_founded_organizations,num_current_jobs&card_id=degrees&user_key={user_key}"

    try:
        # Make the GET request to the Crunchbase API
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad responses (4xx and 5xx)

        # Parse the JSON response
        people_data = response.json()

        # Extract the relevant information
        result_dict = {"school": [], "degree": [], "num_exits": 0, "num_founded_organizations": 0, "num_current_jobs": 0}

        if "cards" in people_data and "degrees" in people_data["cards"]:
            degrees = people_data["cards"]["degrees"]

            for degree in degrees:
                subject = degree.get("subject", "")
                school_name = degree.get("school_identifier", {}).get("value", "")

                if subject and school_name:
                    result_dict["degree"].append(subject)
                    result_dict["school"].append(school_name)

        properties = people_data.get("properties", {})
        result_dict["num_exits"] = properties.get("num_exits", 0)
        result_dict["num_founded_organizations"] = properties.get("num_founded_organizations", 0)
        result_dict["num_current_jobs"] = properties.get("num_current_jobs", 0)

        return result_dict

    except requests.exceptions.RequestException as e:
        # print(f"Error fetching data from Crunchbase API: {e}")
        return None


In [None]:
# Factor-based ranking

def rank_companies_with_justification(companies, llm):
    print("Hello LLM")

    summary_template = """Given a list of dictionaries {information} containing information about companies and their founders, rank the companies based on their investment potential. Output a numbered list along with a justification for your ranking. You are not allowed to make any assumptions while extracting the information. Every piece of data should be derived from the information provided, including links or URLs.

    Consider the following criteria for ranking:

    1. **Location (US Presence):** A higher ranking is warranted if the company is based in the US (us: 1).

    2. **Diversity in Leadership:** Companies with minority founders should receive a higher ranking (minority_founder: 1).

    3. **Founder Experience:** A higher ranking should be assigned if the founders have previous founding experience (founding_experience: 1).

    4. **Funding:** Companies with total funding amount less than $10M should receive a higher ranking. (funding < 10 ).

    5. **Reputable Investors:** A higher ranking should be given to companies backed by reputable investors (reputable_investors: 1).

    6. **Company Age:** Higher company age generally indicates stability and success. Age should be a medium-weight factor in ranking. If the company has a lower age but meets at least 80% of the criteria mentioned above, consider giving it a higher ranking.

    For the information within 'founder_data', give higher weightage to founders with:

    - Higher numbers of exits
    - More founded organizations
    - Lower current number of jobs

    Ensure these factors are reflected in your ranking justification.

    Output format:
    1. Company Name
      - Description: Company description goes here.
      - URL: Company URL goes here.
      - Founders: Founder names go here.
      - Justification: Justification for the ranking goes here.
    """


    prompt = PromptTemplate(
        template=summary_template,
        input_variables=["information"],#list of dynamic data variables
    )

    llm_chain = LLMChain(llm=llm, prompt=prompt)

    user_data = llm_chain.invoke(
            input={"information": companies},
            return_only_outputs=True,
        )

    try:
        # Convert the string representation to a dictionary
        return user_data["text"]


    except json.JSONDecodeError:
        print(f"Error decoding JSON for URL: {url}")
        return None


In [None]:
def main():
    openai_api_key = '' # Redacted
    serpapi_api_key = '' # Redacted
    crunchbase_api_key = '' # Redacted

  # Initialize your ChatOpenAI instance
    llm = ChatOpenAI(openai_api_key=openai_api_key, temperature=0, model_name="gpt-3.5-turbo")

    # Load necessary tools
    tool_names = ['serpapi']
    tools = load_tools(tool_names, llm, serpapi_api_key=serpapi_api_key)

    # Extract startup URLs
    query = 'climate finance'
    # company_urls = extract_startup_urls(query, llm, tools) # Requires work to fix bugs/inconsistent information retrieval
    company_urls = ['https://www.ycombinator.com/companies/pachama', 'https://www.ycombinator.com/companies/remora', 'https://www.ycombinator.com/companies/earth-ai', 'https://www.ycombinator.com/companies/enerjazz', 'https://www.ycombinator.com/companies/carbon-crusher', 'https://www.ycombinator.com/companies/valor-water-analytics'] # Tested with pre-defined list to ensure it works

    # Evaluate the string representation as a list
    # company_urls = eval(company_urls) # Uncomment to convert extracted startup string to list

    # Extract company information
    company_info_list = []
    for url in company_urls:
        company_info = extract_company_info(url, llm)
        if company_info:  # Check if company_info is not None
            company_info_list.append(company_info)
            print(company_info)

    # Format founder names and get additional data using Crunchbase API
    for company_info in company_info_list:
        founders = company_info.get('founders', [])
        formatted_founders = format_founder_names(founders)
        for formatted_founder in formatted_founders:
            founder_data = get_person_data(formatted_founder, crunchbase_api_key)
            company_info.setdefault('founder_data', []).append(founder_data)

    # Rank companies with justification
    rank_result = rank_companies_with_justification(company_info_list, llm)
    return rank_result

if __name__ == "__main__":
    rank_result = main()
    # Now you can use rank_result outside the main function
    print(rank_result)

Hello LLM
{'name': 'Pachama', 'descr': 'Pachama is a leading climate-tech company harnessing cutting-edge technologies such as computer vision and satellites to drive funding to effective reforestation and conservation projects that sequester carbon, enhance biodiversity and enrich local communities around the world.', 'url': 'http://pachama.com', 'founders': ['Diego Saez Gil', 'Tomas Aftalion'], 'us': 1, 'minority_founder': 0, 'founding_experience': 1, 'funding': 64, 'reputable_investors': 1, 'market': 'Climate', 'age': 6}
Hello LLM
{'name': 'Remora', 'descr': "Carbon capture for semi-trucks. We're building a device that captures a semi-truck’s carbon emissions directly from the tailpipe. We'll sell the CO2 to end-users, and share that revenue with our customers, so our device will create a new revenue stream for our customers while reducing their emissions.", 'url': 'https://www.remoracarbon.com', 'founders': ['Paul Gross'], 'us': 1, 'minority_founder': 0, 'founding_experience': 0, '

## Extending founder enrichment with LinkedIn data

In [None]:
linkedin_api_key = '' # Redacted

In [None]:

# Creating the URL endpoint in the correct format for the Diffbot API
def construct_diffbot_url(linkedin_uri, linkedin_api_key):
    # Encode the LinkedIn URI
    encoded_linkedin_uri = urllib.parse.quote(linkedin_uri, safe='')

    # Construct the URL
    url = f'https://kg.diffbot.com/kg/v3/dql?type=query&token={linkedin_api_key}&query=type%3APerson+linkedInUri%3A%22{encoded_linkedin_uri}%22&size=1'

    return url


In [None]:


# Extracted the first search result to get a founder's LinkedIn profile
def extract_linkedin_url(name):
  query = f'{name} site:linkedin.com'

  ## Google Search query results as a Python List of URLs
  search_result_list = list(search(query, tld="co.in", num=10, stop=3, pause=1))
  url = search_result_list[0]

  return url

In [None]:
# Example implementation

linkedin_uri = extract_linkedin_url('Yiğit Ihlamur')
linkedin_api_key = '' #Redacted

diffbot_url = construct_diffbot_url(linkedin_uri, linkedin_api_key)
diffbot_url


In [None]:
# A function which generates a jsonfile with a founder's LinkedIn data
def call_diffbot_api(url):
    try:
        # Call the Diffbot API
        response = requests.get(url)
        # Check if the request was successful
        if response.status_code == 200:
            return response.json()
        else:
            print("Error:", response.status_code)
            return None
    except Exception as e:
        print("Error occurred:", e)
        return None

In [None]:
# Example URL
data = call_diffbot_api(diffbot_url)

In [None]:
from google.colab import files

# Uploading a pre-existing json file to test API without making additional function calls
uploaded = files.upload()


Saving diego.json to diego.json


In [None]:
def extract_institutions(data):
    # Iterate over the 'educations' array
    institutions = [education['institution']['name'] for education in data['data'][0]['entity']['educations']]
    return institutions

def extract_skills(data):
    # Extract skills
    skills = [skill['name'] for skill in data['data'][0]['entity']['skills']]
    return skills

def extract_employer_names(data):
    # Extract employer names
    employer_names = [employment['employer']['name'] for employment in data['data'][0]['entity']['employments']]
    return employer_names

def extract_employment_titles(data):
    # Extract previous employment titles
    employment_titles = [employment['title'] for employment in data['data'][0]['entity']['employments']]
    return employment_titles

# Load your JSON data
with open('diego.json', 'r') as file:
    data = json.load(file)

# Extracting information
institutions = extract_institutions(data)
skills = extract_skills(data)
employer_names = extract_employer_names(data)
employment_titles = extract_employment_titles(data)

# Print the results
print("Institutions:", institutions)
print("Skills:", skills)
print("Employer Names:", employer_names)
print("Previous Employment Titles:", employment_titles)


Institutions: ['Y Combinator', 'Stanford Graduate School of Business', 'Universidad Nacional de Tucumán', 'La Salle BCN', 'La Salle Business School', 'Universidad Nacional de Tucumán', 'Technical School Concepcion', 'Universitat Ramón Llull']
Skills: ['application development', 'management', 'economics', 'teaching', 'startup', 'business development', 'product development', 'project management', 'entrepreneurship', 'product design', 'team building', 'leadership', 'Strategic Partnerships', 'Mobile Applications', 'Start-ups', 'Starting Startups', 'Lidar', 'RedDot']
Employer Names: ['Pachama', 'Bluesmart', 'StudentUniverse.com', 'WeHostels', 'PricewaterhouseCoopers', 'Altran', 'Off Track Planet', 'New York Casas', 'Socialatom Ventures', 'TA Ventures', 'Mascota Nube', 'Degusta', 'Viajala', 'Altran Europe']
Previous Employment Titles: ['Co-Founder & CEO', 'Co-founder, CEO', 'Vice President, Mobile', 'Co-founder & CEO', 'Associate', 'Associate', 'Team Member', 'Customer', 'Mentor', 'Employee'

# Ranking Algorithm - Manual Solution

*This was an experiment to see how we could improve ranking using pre-existing machine learning solutions with simulated data.*

## Ranking by query similarity

In [None]:
def rank_companies_by_similarity(query, companies):
    """
    Rank companies based on the similarity of the query to their descriptions.

    Args:
    - query (str): The query for which companies are ranked.
    - companies (list): List of dictionaries representing companies with 'name' and 'description' attributes.

    Returns:
    - list: Ranked list of companies with added 'similarity_score' attribute.
    """
    # Extract company descriptions and calculate similarity scores
    vectorizer = TfidfVectorizer(stop_words='english')
    company_descriptions = [company["description"] for company in companies]
    company_descriptions.append(query)  # Include the query in the list

    tfidf_matrix = vectorizer.fit_transform(company_descriptions)
    similarity_scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

    # Combine similarity scores with companies
    for i, company in enumerate(companies):
        company["similarity_score"] = similarity_scores[0][i]

    # Rank companies based on similarity score
    ranked_companies = sorted(companies, key=lambda x: x["similarity_score"], reverse=True)

    return ranked_companies

In [None]:
# Example usage:
query = "AI agent framework and AI agent developer tool startups"
companies = [
    {"name": "Company A", "description": "AI and machine learning solutions", "funding": 5000000},
    {"name": "Company B", "description": "Developer tools for AI agents", "funding": 8000000},
    {"name": "Company C", "description": "Blockchain-based financial services", "funding": 3000000},
    # Add more companies as needed
]

# Rank companies based on similarity
ranked_companies = rank_companies_by_similarity(query, companies)

# Display ranked list of companies
print("Ranked List of Companies based on Similarity:")
for rank, company in enumerate(ranked_companies, start=1):
    print(f"{rank}. {company['name']} (Similarity Score: {company['similarity_score']}, Funding: ${company['funding']})")

Ranked List of Companies based on Similarity:
1. Company B (Similarity Score: 0.2713499543607831, Funding: $8000000)
2. Company A (Similarity Score: 0.14512759651136553, Funding: $5000000)
3. Company C (Similarity Score: 0.0, Funding: $3000000)


## Extending the ranking to top tier universities, employers, and start-up location

In [None]:
def rank_companies(query, companies):
    """
    Rank companies based on various criteria including similarity to the query,
    founder's location, founder's education, and founder's employment history.

    Args:
    - query (str): The query for which companies are ranked.
    - companies (list): List of dictionaries representing companies with attributes.

    Returns:
    - list: Ranked list of companies with added 'rank_score' attribute.
    """
    # Extract company descriptions and calculate similarity scores
    vectorizer = TfidfVectorizer(stop_words='english')
    company_descriptions = [company["description"] for company in companies]
    company_descriptions.append(query)  # Include the query in the list

    tfidf_matrix = vectorizer.fit_transform(company_descriptions)
    similarity_scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

    # Combine similarity scores with companies
    for i, company in enumerate(companies):
        company["similarity_score"] = similarity_scores[0][i]

    # Define top-tier universities and employers
    top_tier_universities = {"Oxford", "Harvard", "MIT", "Stanford", "Cambridge"}
    top_tier_employers = {"Google", "Facebook", "Amazon", "Microsoft", "Apple"}

    # Rank companies based on multiple criteria with adjusted weights
    ranked_companies = sorted(companies, key=lambda x: (
        0.5 * x["similarity_score"],  # Higher weight for similarity score
        0.2 if x["founder_location"] == "US" else 0,  # Adjusted weight for location
        0.15 if x["founder_university"] in top_tier_universities else 0,  # Adjusted weight for university
        0.15 if x["founder_employer"] in top_tier_employers else 0  # Adjusted weight for employer
    ), reverse=True)

    # Add a 'rank_score' attribute for visualization purposes (optional)
    for rank, company in enumerate(ranked_companies, start=1):
        company["rank_score"] = rank

    return ranked_companies

In [None]:
# Example usage:
query = "AI agent framework and AI agent developer tool startups"
companies = [
    {"name": "Company A", "description": "AI and machine learning solutions", "funding": 5000000,
     "founder_location": "US", "founder_university": "Oxford", "founder_employer": "Google"},
    {"name": "Company B", "description": "Developer tools for AI agents", "funding": 8000000,
     "founder_location": "UK", "founder_university": "Harvard", "founder_employer": "Microsoft"},
    {"name": "Company C", "description": "Blockchain-based financial services", "funding": 3000000,
     "founder_location": "US", "founder_university": "MIT", "founder_employer": "Facebook"},
    {"name": "Company D", "description": "Automated data analytics platform", "funding": 6000000,
     "founder_location": "Canada", "founder_university": "University of Toronto", "founder_employer": "IBM"},
    {"name": "Company E", "description": "AI-driven e-commerce solutions", "funding": 7500000,
     "founder_location": "Germany", "founder_university": "Technical University of Munich", "founder_employer": "Siemens"},
    {"name": "Company F", "description": "Healthcare AI for diagnostics", "funding": 4000000,
     "founder_location": "US", "founder_university": "University of California, Berkeley", "founder_employer": "Intel"},
    {"name": "Company G", "description": "Augmented reality development tools", "funding": 5500000,
     "founder_location": "US", "founder_university": "Carnegie Mellon University", "founder_employer": "Qualcomm"},
    # Add more companies with diverse attributes as needed
]

# Rank companies based on multiple criteria
ranked_companies = rank_companies(query, companies)

# Display ranked list of companies with additional attributes
print("Ranked List of Companies based on Multiple Criteria:")
for rank, company in enumerate(ranked_companies, start=1):
    print(f"{rank}. {company['name']} (Rank Score: {company['rank_score']}, "
          f"Similarity Score: {company['similarity_score']}, Funding: ${company['funding']}, "
          f"Founder Location: {company['founder_location']}, "
          f"Founder University: {company['founder_university']}, "
          f"Founder Employer: {company['founder_employer']})")


Ranked List of Companies based on Multiple Criteria:
1. Company B (Rank Score: 1, Similarity Score: 0.2698746843339195, Funding: $8000000, Founder Location: UK, Founder University: Harvard, Founder Employer: Microsoft)
2. Company F (Rank Score: 2, Similarity Score: 0.1383193285627435, Funding: $4000000, Founder Location: US, Founder University: University of California, Berkeley, Founder Employer: Intel)
3. Company A (Rank Score: 3, Similarity Score: 0.12115581948871945, Funding: $5000000, Founder Location: US, Founder University: Oxford, Founder Employer: Google)
4. Company E (Rank Score: 4, Similarity Score: 0.12115581948871945, Funding: $7500000, Founder Location: Germany, Founder University: Technical University of Munich, Founder Employer: Siemens)
5. Company C (Rank Score: 5, Similarity Score: 0.0, Funding: $3000000, Founder Location: US, Founder University: MIT, Founder Employer: Facebook)
6. Company G (Rank Score: 6, Similarity Score: 0.0, Funding: $5500000, Founder Location: U

Now, we are going to extend the list to 20 companies, and try to filter only the top 10 most relevant

In [None]:
query = "AI agent framework and AI agent developer tool startups"
companies = [
    {"name": "Company A", "description": "AI and machine learning solutions", "funding": 5000000,
     "founder_location": "US", "founder_university": "Oxford", "founder_employer": "Google"},
    {"name": "Company B", "description": "Developer tools for AI agents", "funding": 8000000,
     "founder_location": "UK", "founder_university": "Harvard", "founder_employer": "Microsoft"},
    {"name": "Company C", "description": "Blockchain-based financial services", "funding": 3000000,
     "founder_location": "US", "founder_university": "MIT", "founder_employer": "Facebook"},
    {"name": "Company D", "description": "Automated data analytics platform", "funding": 6000000,
     "founder_location": "Canada", "founder_university": "University of Toronto", "founder_employer": "IBM"},
    {"name": "Company E", "description": "AI-driven e-commerce solutions", "funding": 7500000,
     "founder_location": "Germany", "founder_university": "Technical University of Munich", "founder_employer": "Siemens"},
    {"name": "Company F", "description": "Healthcare AI for diagnostics", "funding": 4000000,
     "founder_location": "US", "founder_university": "University of California, Berkeley", "founder_employer": "Intel"},
    {"name": "Company G", "description": "Augmented reality development tools", "funding": 5500000,
     "founder_location": "US", "founder_university": "Carnegie Mellon University", "founder_employer": "Qualcomm"},
    {"name": "Company H", "description": "AI-powered marketing platform", "funding": 7000000,
     "founder_location": "India", "founder_university": "IIT Delhi", "founder_employer": "Infosys"},
    {"name": "Company I", "description": "Cybersecurity solutions using AI", "funding": 3500000,
     "founder_location": "Israel", "founder_university": "Technion", "founder_employer": "Check Point"},
    {"name": "Company J", "description": "Robotics and automation technology", "funding": 4500000,
     "founder_location": "US", "founder_university": "MIT", "founder_employer": "Amazon"},
    {"name": "Company K", "description": "Chatbot development platform", "funding": 6500000,
     "founder_location": "US", "founder_university": "Stanford", "founder_employer": "Facebook"},
    {"name": "Company L", "description": "AI-powered video analytics", "funding": 5000000,
     "founder_location": "Germany", "founder_university": "RWTH Aachen University", "founder_employer": "Bosch"},
    {"name": "Company M", "description": "Quantum computing solutions", "funding": 6000000,
     "founder_location": "US", "founder_university": "Caltech", "founder_employer": "Google"},
    {"name": "Company N", "description": "Blockchain-based supply chain management", "funding": 5500000,
     "founder_location": "US", "founder_university": "Harvard", "founder_employer": "Microsoft"},
    {"name": "Company O", "description": "AI-powered language translation", "funding": 7000000,
     "founder_location": "China", "founder_university": "Tsinghua University", "founder_employer": "Baidu"},
    {"name": "Company P", "description": "Automated financial trading using AI", "funding": 4500000,
     "founder_location": "US", "founder_university": "Stanford", "founder_employer": "Goldman Sachs"},
    {"name": "Company Q", "description": "Virtual reality content creation", "funding": 8000000,
     "founder_location": "US", "founder_university": "MIT", "founder_employer": "Facebook"},
    {"name": "Company R", "description": "AI in personalized healthcare", "funding": 3500000,
     "founder_location": "US", "founder_university": "UC San Francisco", "founder_employer": "Genentech"},
    {"name": "Company S", "description": "Smart home automation using AI", "funding": 5000000,
     "founder_location": "US", "founder_university": "Stanford", "founder_employer": "Apple"},
    {"name": "Company T", "description": "AI for agricultural productivity", "funding": 6000000,
     "founder_location": "US", "founder_university": "UC Davis", "founder_employer": "John Deere"},
    # Add more companies with diverse attributes as needed
]

In [None]:
# Rank companies based on multiple criteria
ranked_companies = rank_companies(query, companies)

# Choose the top 10 companies
top_10_companies = ranked_companies[:10]

# Display the top 10 companies with additional attributes
print("Top 10 Companies based on Multiple Criteria:")
for rank, company in enumerate(top_10_companies, start=1):
    print(f"{rank}. {company['name']} (Rank Score: {company['rank_score']}, "
          f"Similarity Score: {company['similarity_score']}, Funding: ${company['funding']}, "
          f"Founder Location: {company['founder_location']}, "
          f"Founder University: {company['founder_university']}, "
          f"Founder Employer: {company['founder_employer']})")

Top 10 Companies based on Multiple Criteria:
1. Company B (Rank Score: 1, Similarity Score: 0.23657067854644498, Funding: $8000000, Founder Location: UK, Founder University: Harvard, Founder Employer: Microsoft)
2. Company F (Rank Score: 2, Similarity Score: 0.08948395865601717, Funding: $4000000, Founder Location: US, Founder University: University of California, Berkeley, Founder Employer: Intel)
3. Company R (Rank Score: 3, Similarity Score: 0.08948395865601717, Funding: $3500000, Founder Location: US, Founder University: UC San Francisco, Founder Employer: Genentech)
4. Company T (Rank Score: 4, Similarity Score: 0.08475878571355247, Funding: $6000000, Founder Location: US, Founder University: UC Davis, Founder Employer: John Deere)
5. Company I (Rank Score: 5, Similarity Score: 0.08169081729375595, Funding: $3500000, Founder Location: Israel, Founder University: Technion, Founder Employer: Check Point)
6. Company H (Rank Score: 6, Similarity Score: 0.08000216101401232, Funding: $7

Improving readability of the output

In [None]:
# Display the top 10 companies with a fact file for each company
print("Top 10 Companies based on Query Similarity, Founder Location, University, and Employer:")
for rank, company in enumerate(top_10_companies, start=1):
    print(f"\n{rank}. {company['name']}")
    print(f"   Description: {company['description']}")
    # print(f"   Website: {company['website']}")
    print(f"   Funding: ${company['funding']}")
    print(f"   Founder Location: {company['founder_location']}")
    print(f"   Founder University: {company['founder_university']}")
    print(f"   Founder Employer: {company['founder_employer']}")
    print(f"   Similarity Score: {company['similarity_score']}")


Top 10 Companies based on Query Similarity, Founder Location, University, and Employer:

1. Company B
   Description: Developer tools for AI agents
   Funding: $8000000
   Founder Location: UK
   Founder University: Harvard
   Founder Employer: Microsoft
   Similarity Score: 0.23657067854644498

2. Company F
   Description: Healthcare AI for diagnostics
   Funding: $4000000
   Founder Location: US
   Founder University: University of California, Berkeley
   Founder Employer: Intel
   Similarity Score: 0.08948395865601717

3. Company R
   Description: AI in personalized healthcare
   Funding: $3500000
   Founder Location: US
   Founder University: UC San Francisco
   Founder Employer: Genentech
   Similarity Score: 0.08948395865601717

4. Company T
   Description: AI for agricultural productivity
   Funding: $6000000
   Founder Location: US
   Founder University: UC Davis
   Founder Employer: John Deere
   Similarity Score: 0.08475878571355247

5. Company I
   Description: Cybersecurity

## Feature Engineering - creating a score based on the university and employer

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(query, company_descriptions):
    """
    Calculate similarity scores between the query and company descriptions.

    Args:
    - query (str): The query for which companies are ranked.
    - company_descriptions (list): List of company descriptions.

    Returns:
    - list: Similarity scores between the query and company descriptions.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    company_descriptions.append(query)  # Include the query in the list
    tfidf_matrix = vectorizer.fit_transform(company_descriptions)
    similarity_scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
    return similarity_scores[0]

def rank_companies_based_on_similarity(query, companies):
    """
    Rank companies based on similarity to the query.

    Args:
    - query (str): The query for which companies are ranked.
    - companies (list): List of dictionaries representing companies with attributes.

    Returns:
    - list: Ranked list of companies with added 'similarity_score' attribute.
    """
    company_descriptions = [company["description"] for company in companies]
    similarity_scores = calculate_similarity(query, company_descriptions)

    for i, company in enumerate(companies):
        company["similarity_score"] = similarity_scores[i]

    return companies

def rank_companies_based_on_criteria(companies):
    """
    Rank companies based on multiple criteria with adjusted weights.

    Args:
    - companies (list): List of dictionaries representing companies with attributes.

    Returns:
    - list: Ranked list of companies with added 'rank_score' attribute.
    """
    # Define top-tier universities and employers
    top_tier_universities = {"Oxford", "Harvard", "MIT", "Stanford", "Cambridge"}
    top_tier_employers = {"Google", "Facebook", "Amazon", "Microsoft", "Apple"}

    # Rank companies based on multiple criteria with adjusted weights
    ranked_companies = sorted(companies, key=lambda x: (
        0.5 * x["similarity_score"],  # Higher weight for similarity score
        0.2 if x["founder_location"] == "US" else 0,  # Adjusted weight for location
        0.15 if x["founder_university"] in top_tier_universities else 0,  # Adjusted weight for university
        0.15 if x["founder_employer"] in top_tier_employers else 0  # Adjusted weight for employer
    ), reverse=True)

    return ranked_companies

def assign_score(value, ranking_dict):
    """
    Assign a score based on the value and ranking dictionary.

    Args:
    - value (str): The value to be scored.
    - ranking_dict (dict): Dictionary containing scores for different values.

    Returns:
    - int: The assigned score.
    """
    if value in ranking_dict:
        return ranking_dict[value]
    else:
        return ranking_dict["Other"]

def rank_companies_with_scores(companies, university_ranking, employer_ranking):
    """
    Rank companies based on multiple criteria with assigned scores.

    Args:
    - companies (list): List of dictionaries representing companies with attributes.
    - university_ranking (dict): Dictionary containing scores for universities.
    - employer_ranking (dict): Dictionary containing scores for employers.

    Returns:
    - list: Ranked list of companies with added 'rank_score' attribute.
    """
    for company in companies:
        company["university_score"] = assign_score(company["founder_university"], university_ranking)
        company["employer_score"] = assign_score(company["founder_employer"], employer_ranking)

    ranked_companies = sorted(companies, key=lambda x: (
        0.5 * x["similarity_score"],  # Higher weight for similarity score
        0.2 if x["founder_location"] == "US" else 0,  # Adjusted weight for location
        0.15 * x["university_score"],  # Adjusted weight for university score
        0.15 * x["employer_score"]  # Adjusted weight for employer score
    ), reverse=True)

    return ranked_companies

In [None]:
# Define ranking dictionaries for universities and employers
university_ranking = {"Oxford": 10, "Harvard": 9, "MIT": 8, "Stanford": 7, "Cambridge": 6, "Other": 5}
employer_ranking = {"Google": 10, "Facebook": 9, "Amazon": 8, "Microsoft": 7, "Apple": 6, "Other": 5}

# Rank companies based on similarity
companies = rank_companies_based_on_similarity(query, companies)

# Rank companies based on multiple criteria with adjusted weights
ranked_companies = rank_companies_based_on_criteria(companies)

# Rank companies based on multiple criteria with assigned scores
final_ranking = rank_companies_with_scores(ranked_companies, university_ranking, employer_ranking)

# Choose the top 10 companies
top_10_companies = final_ranking[:10]

In [None]:
final_ranking

[{'name': 'Company B',
  'description': 'Developer tools for AI agents',
  'funding': 8000000,
  'founder_location': 'UK',
  'founder_university': 'Harvard',
  'founder_employer': 'Microsoft',
  'similarity_score': 0.23657067854644498,
  'university_score': 9,
  'employer_score': 7},
 {'name': 'Company F',
  'description': 'Healthcare AI for diagnostics',
  'funding': 4000000,
  'founder_location': 'US',
  'founder_university': 'University of California, Berkeley',
  'founder_employer': 'Intel',
  'similarity_score': 0.08948395865601717,
  'university_score': 5,
  'employer_score': 5},
 {'name': 'Company R',
  'description': 'AI in personalized healthcare',
  'funding': 3500000,
  'founder_location': 'US',
  'founder_university': 'UC San Francisco',
  'founder_employer': 'Genentech',
  'similarity_score': 0.08948395865601717,
  'university_score': 5,
  'employer_score': 5},
 {'name': 'Company T',
  'description': 'AI for agricultural productivity',
  'funding': 6000000,
  'founder_loca

In [None]:
# Display the top 10 companies with a fact file for each company
print("Top 10 Companies based on Query Similarity, Founder Location, University, and Employer:")
for rank, company in enumerate(top_10_companies, start=1):
    print(f"\n{rank}. {company['name']}")
    print(f"   Description: {company['description']}")
    # print(f"   Website: {company['website']}")
    print(f"   Funding: ${company['funding']}")
    print(f"   Founder Location: {company['founder_location']}")
    print(f"   Founder University: {company['founder_university']}")
    print(f"   Founder Employer: {company['founder_employer']}")
    print(f"   Similarity Score: {company['similarity_score']}")

Top 10 Companies based on Query Similarity, Founder Location, University, and Employer:

1. Company B
   Description: Developer tools for AI agents
   Funding: $8000000
   Founder Location: UK
   Founder University: Harvard
   Founder Employer: Microsoft
   Similarity Score: 0.23657067854644498

2. Company F
   Description: Healthcare AI for diagnostics
   Funding: $4000000
   Founder Location: US
   Founder University: University of California, Berkeley
   Founder Employer: Intel
   Similarity Score: 0.08948395865601717

3. Company R
   Description: AI in personalized healthcare
   Funding: $3500000
   Founder Location: US
   Founder University: UC San Francisco
   Founder Employer: Genentech
   Similarity Score: 0.08948395865601717

4. Company T
   Description: AI for agricultural productivity
   Funding: $6000000
   Founder Location: US
   Founder University: UC Davis
   Founder Employer: John Deere
   Similarity Score: 0.08475878571355247

5. Company I
   Description: Cybersecurity

## Improving the Ranking

In [None]:
def get_user_top1(companies):
    """
    Simulate user feedback by asking the user to provide their top choice from the given list of companies.

    Args:
    - companies (list): List of companies to rank.

    Returns:
    - dict: The user's top choice company.
    """
    print("Please provide your top choice company (enter the corresponding number):")
    for i, company in enumerate(companies, start=1):
        print(f"{i}. {company['name']}")

    user_top1 = None
    try:
        # Get user input for top choice
        user_input = input("Enter your top choice (number): ")
        user_top1 = int(user_input) - 1
    except ValueError:
        print("Invalid input. Please enter a valid number.")

    # Verify the input is valid
    if user_top1 is None or user_top1 < 0 or user_top1 >= len(companies):
        print("Invalid input. Please provide a valid number for your top choice.")
        return get_user_top1(companies)  # Retry if the input is invalid

    # Return the user's top choice company
    return companies[user_top1]

# Simulate user feedback for the first 10 companies in the final ranking
user_feedback_top1 = get_user_top1(final_ranking[:10])

# Display the user's top choice
print("\nUser's Top Choice:")
print(f"{user_feedback_top1['name']} (Similarity Score: {user_feedback_top1['similarity_score']}, "
      f"University Score: {user_feedback_top1['university_score']}, Employer Score: {user_feedback_top1['employer_score']})")


Please provide your top choice company (enter the corresponding number):
1. Company B
2. Company F
3. Company R
4. Company T
5. Company I
6. Company H
7. Company L
8. Company A
9. Company E
10. Company O
Enter your top choice (number): 1

User's Top Choice:
Company B (Similarity Score: 0.23657067854644498, University Score: 9, Employer Score: 7)


In [None]:
import random


def generate_simulated_data():
    return {
        "name": f"Company {random.randint(1, 100)}",
        "description": f"Description for Company {random.randint(1, 100)}",
        "funding": random.randint(1000000, 10000000),
        "founder_location": random.choice(["US", "UK", "Germany", "India", "China", "Canada", "Israel"]),
        "founder_university": f"University {random.randint(1, 20)}",
        "founder_employer": f"Employer {random.randint(1, 20)}",
        "similarity_score": random.uniform(0, 1),
        "university_score": random.randint(1, 10),
        "employer_score": random.randint(1, 10),
        "user_preference": random.randint(1, 10)
    }

# Generate a list of 100 simulated companies
companies = [generate_simulated_data() for _ in range(10)]

# Now 'companies' list contains simulated data for training a feedback-based ranking model


def collect_user_feedback(companies):
    for company in companies:
        print(f"Company: {company['name']}")
        print(f"Description: {company['description']}")
        print(f"User Preference: {company['user_preference']}")

        # Simulate user feedback (1 for relevant, 0 for not relevant)
        user_feedback = int(input("Is this company relevant? (1 for yes, 0 for no): "))
        company['user_preference'] = user_feedback

companies = [generate_simulated_data() for _ in range(10)]
collect_user_feedback(companies)

# Now 'companies' list contains simulated data with user feedback


Company: Company 59
Description: Description for Company 24
User Preference: 2
Is this company relevant? (1 for yes, 0 for no): 1
Company: Company 45
Description: Description for Company 35
User Preference: 7
Is this company relevant? (1 for yes, 0 for no): 0
Company: Company 9
Description: Description for Company 65
User Preference: 10
Is this company relevant? (1 for yes, 0 for no): 1
Company: Company 34
Description: Description for Company 7
User Preference: 2
Is this company relevant? (1 for yes, 0 for no): 1
Company: Company 12
Description: Description for Company 13
User Preference: 5
Is this company relevant? (1 for yes, 0 for no): 0
Company: Company 84
Description: Description for Company 69
User Preference: 2
Is this company relevant? (1 for yes, 0 for no): 1
Company: Company 68
Description: Description for Company 71
User Preference: 5
Is this company relevant? (1 for yes, 0 for no): 0
Company: Company 97
Description: Description for Company 1
User Preference: 9
Is this compa

In [None]:

def generate_simulated_data():
    return {
        "name": f"Company {random.randint(1, 10)}",
        "description": f"Description for Company {random.randint(1, 10)}",
        "funding": random.randint(1000000, 10000000),
        "founder_location": random.choice(["US", "UK", "Germany", "India", "China", "Canada", "Israel"]),
        "founder_university": f"University {random.randint(1, 10)}",
        "founder_employer": f"Employer {random.randint(1, 10)}",
        "similarity_score": random.uniform(0, 1),
        "university_score": random.randint(1, 10),
        "employer_score": random.randint(1, 10),
    }

# Generate a list of 10 simulated companies
companies = [generate_simulated_data() for _ in range(10)]

# Rank the companies based on similarity score
ranked_companies = sorted(companies, key=lambda x: x["similarity_score"], reverse=True)

# Display the ranked list
print("Ranked List:")
for i, company in enumerate(ranked_companies):
    print(f"{i+1}. {company['name']} - Similarity Score: {company['similarity_score']:.4f}")

# Get user input on irrelevant companies
irrelevant_ranks_str = input("Enter the ranks of irrelevant companies (comma-separated): ")
irrelevant_ranks = [int(rank) for rank in irrelevant_ranks_str.replace(" ", "").split(",")]

# Assign user preferences (1 for relevant, 0 for irrelevant)
user_preferences = [1 if i+1 not in irrelevant_ranks else 0 for i in range(len(companies))]

# Print the assigned user preferences
print("Assigned User Preferences:")
for i, preference in enumerate(user_preferences):
    print(f"{i+1}. {ranked_companies[i]['name']} - User Preference: {preference}")

# Now 'companies' list contains simulated data with assigned user preferences (1 or 0)


Ranked List:
1. Company 4 - Similarity Score: 0.9462
2. Company 1 - Similarity Score: 0.7647
3. Company 6 - Similarity Score: 0.7221
4. Company 7 - Similarity Score: 0.6852
5. Company 10 - Similarity Score: 0.6313
6. Company 2 - Similarity Score: 0.5018
7. Company 8 - Similarity Score: 0.4540
8. Company 2 - Similarity Score: 0.4519
9. Company 10 - Similarity Score: 0.3595
10. Company 10 - Similarity Score: 0.2360
Enter the ranks of irrelevant companies (comma-separated): 1
Assigned User Preferences:
1. Company 4 - User Preference: 0
2. Company 1 - User Preference: 1
3. Company 6 - User Preference: 1
4. Company 7 - User Preference: 1
5. Company 10 - User Preference: 1
6. Company 2 - User Preference: 1
7. Company 8 - User Preference: 1
8. Company 2 - User Preference: 1
9. Company 10 - User Preference: 1
10. Company 10 - User Preference: 1


In [None]:
companies

[{'name': 'Company 1',
  'description': 'Description for Company 1',
  'funding': 3408332,
  'founder_location': 'India',
  'founder_university': 'University 7',
  'founder_employer': 'Employer 9',
  'similarity_score': 0.7646798036629991,
  'university_score': 7,
  'employer_score': 6},
 {'name': 'Company 8',
  'description': 'Description for Company 7',
  'funding': 1046426,
  'founder_location': 'Canada',
  'founder_university': 'University 4',
  'founder_employer': 'Employer 7',
  'similarity_score': 0.4539980291314675,
  'university_score': 8,
  'employer_score': 10},
 {'name': 'Company 2',
  'description': 'Description for Company 6',
  'funding': 8212578,
  'founder_location': 'UK',
  'founder_university': 'University 8',
  'founder_employer': 'Employer 2',
  'similarity_score': 0.45187262600040623,
  'university_score': 6,
  'employer_score': 10},
 {'name': 'Company 10',
  'description': 'Description for Company 3',
  'funding': 5174484,
  'founder_location': 'US',
  'founder_u

In [None]:

def generate_simulated_data(index):
    return {
        "name": f"Company {index}",
        "funding": random.randint(1000000, 10000000),
        "founder_location": random.choice([0, 1]),
        "founder_university": f"University {random.randint(1, 10)}",
        "founder_employer": f"Employer {random.randint(1, 10)}",
        "similarity_score": random.uniform(0, 1),
        "university_score": random.randint(1, 10),
        "employer_score": random.randint(1, 10),
        "user_relevant": random.choice([0, 1])
    }

# Generate a list of 100 simulated companies with unique names
companies = [generate_simulated_data(index) for index in range(1, 101)]

# Now 'companies' list contains simulated data with unique company names and user_relevant labels
print(companies)


[{'name': 'Company 1', 'funding': 1347709, 'founder_location': 1, 'founder_university': 'University 5', 'founder_employer': 'Employer 5', 'similarity_score': 0.33306162729149524, 'university_score': 8, 'employer_score': 10, 'user_relevant': 0}, {'name': 'Company 2', 'funding': 7208931, 'founder_location': 0, 'founder_university': 'University 4', 'founder_employer': 'Employer 1', 'similarity_score': 0.8415778515351137, 'university_score': 6, 'employer_score': 1, 'user_relevant': 0}, {'name': 'Company 3', 'funding': 2769220, 'founder_location': 0, 'founder_university': 'University 5', 'founder_employer': 'Employer 7', 'similarity_score': 0.4707641292515614, 'university_score': 7, 'employer_score': 1, 'user_relevant': 0}, {'name': 'Company 4', 'funding': 8158460, 'founder_location': 0, 'founder_university': 'University 1', 'founder_employer': 'Employer 5', 'similarity_score': 0.7490779392209752, 'university_score': 8, 'employer_score': 8, 'user_relevant': 0}, {'name': 'Company 5', 'fundin

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Assuming 'companies' is your training dataset with features and labels
features = ["similarity_score", "university_score", "employer_score"]
X = [[company[feature] for feature in features] for company in companies]
y = [company["user_relevant"] for company in companies]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train RankNet model (MLPRegressor is used for simplicity)
model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 0.2675551344450947


In [None]:


# Combine companies with their predicted scores
items_with_scores = list(zip(companies, y_pred))

# Sort items based on predictions
sorted_items = sorted(items_with_scores, key=lambda x: x[1], reverse=True)

# Extract top 10 ranked items
top_10_ranked = sorted_items[:10]

# Print the final ranked list
print("Final Ranked List:")
for rank, (item, score) in enumerate(top_10_ranked, start=1):
    print(f"{rank}. Name: {item['name']} - Funding: {item['funding']} - RankNet Score: {score}")


Final Ranked List:
1. Name: Company 13 - Funding: 2206694 - RankNet Score: 0.5774119043168491
2. Name: Company 16 - Funding: 9793245 - RankNet Score: 0.5355934538570508
3. Name: Company 6 - Funding: 6816328 - RankNet Score: 0.517179424910666
4. Name: Company 12 - Funding: 9637932 - RankNet Score: 0.49287511703995934
5. Name: Company 19 - Funding: 8245864 - RankNet Score: 0.47782849397841376
6. Name: Company 8 - Funding: 2549459 - RankNet Score: 0.47723494937619293
7. Name: Company 10 - Funding: 9430018 - RankNet Score: 0.47592455245945686
8. Name: Company 5 - Funding: 8610782 - RankNet Score: 0.45579057775060516
9. Name: Company 11 - Funding: 1415836 - RankNet Score: 0.43821100156852477
10. Name: Company 9 - Funding: 7565781 - RankNet Score: 0.42697370775325494


In [None]:
items_val = [company["name"] for company in companies[len(companies) - len(X_val):]]

# Sort items based on predictions
sorted_items_val = [item for _, item in sorted(zip(y_pred, items_val), reverse=True)]

# Final ranked list
print("Final Ranked List:")
for rank, item in enumerate(sorted_items_val, start=1):
    print(f"{rank}. {item}")

Final Ranked List:
1. Company 100
2. Company 91
3. Company 93
4. Company 83
5. Company 87
6. Company 81
7. Company 98
8. Company 84
9. Company 99
10. Company 85
11. Company 82
12. Company 89
13. Company 86
14. Company 92
15. Company 88
16. Company 90
17. Company 95
18. Company 97
19. Company 96
20. Company 94


In [None]:
items_val = companies[len(companies) - len(X_val):]

# Sort items based on predictions
sorted_items_val = [(item, similarity) for similarity, item in sorted(zip(y_pred, items_val), reverse=True)]

# Print the final ranked list with additional metadata
print("Final Ranked List:")
for rank, (item, score) in enumerate(top_10_ranked, start=1):
    print(f"\n{rank}. Name: {item['name']}")
    print(f"   Funding: ${item['funding']}")
    print(f"   Founder Location: {item['founder_location']}")
    print(f"   Founder University: {item['founder_university']}")
    print(f"   Founder Employer: {item['founder_employer']}")
    print(f"   Similarity Score: {item['similarity_score']}")
    print(f"   University Score: {item['university_score']}")
    print(f"   Employer Score: {item['employer_score']}")
    print(f"   RankNet Score: {score}")


Final Ranked List:

1. Name: Company 13
   Funding: $2206694
   Founder Location: 1
   Founder University: University 3
   Founder Employer: Employer 3
   Similarity Score: 0.5141432609583725
   University Score: 5
   Employer Score: 8
   RankNet Score: 0.5774119043168491

2. Name: Company 16
   Funding: $9793245
   Founder Location: 1
   Founder University: University 10
   Founder Employer: Employer 2
   Similarity Score: 0.6174934904154571
   University Score: 9
   Employer Score: 7
   RankNet Score: 0.5355934538570508

3. Name: Company 6
   Funding: $6816328
   Founder Location: 1
   Founder University: University 5
   Founder Employer: Employer 1
   Similarity Score: 0.7551943668165639
   University Score: 9
   Employer Score: 9
   RankNet Score: 0.517179424910666

4. Name: Company 12
   Funding: $9637932
   Founder Location: 0
   Founder University: University 2
   Founder Employer: Employer 8
   Similarity Score: 0.5734382202501291
   University Score: 1
   Employer Score: 9
   

Comparing our RankNet top 10 versus original ranking:

In [None]:
# Sort companies based on the specified logic
ranked_companies = sorted(companies, key=lambda x: (
    0.5 * x["similarity_score"],  # Higher weight for similarity score
    0.2 if x["founder_location"] == 1 else 0,  # Adjusted weight for location
    0.15 * x["university_score"],  # Adjusted weight for university score
    0.15 * x["employer_score"]  # Adjusted weight for employer score
), reverse=True)


In [None]:
# Display the top 10 ranked companies
print("Top 10 Companies based on Query Similarity, Founder Location, University, and Employer:")
for rank, company in enumerate(ranked_companies[:10], start=1):
    print(f"\n{rank}. {company['name']}")
    # print(f"   Description: {company['description']}")
    print(f"   Funding: ${company['funding']}")
    print(f"   Founder Location: {company['founder_location']}")
    print(f"   Founder University: {company['founder_university']}")
    print(f"   Founder Employer: {company['founder_employer']}")
    print(f"   Similarity Score: {company['similarity_score']}")
    print(f"   University Score: {company['university_score']}")
    print(f"   Employer Score: {company['employer_score']}")


Top 10 Companies based on Query Similarity, Founder Location, University, and Employer:

1. Company 86
   Funding: $8344998
   Founder Location: 0
   Founder University: University 8
   Founder Employer: Employer 3
   Similarity Score: 0.9993398993342019
   University Score: 2
   Employer Score: 6

2. Company 25
   Funding: $4867866
   Founder Location: 1
   Founder University: University 4
   Founder Employer: Employer 7
   Similarity Score: 0.9833203237806143
   University Score: 2
   Employer Score: 6

3. Company 92
   Funding: $1996114
   Founder Location: 0
   Founder University: University 1
   Founder Employer: Employer 8
   Similarity Score: 0.9795220674979878
   University Score: 6
   Employer Score: 4

4. Company 19
   Funding: $4674598
   Founder Location: 0
   Founder University: University 5
   Founder Employer: Employer 1
   Similarity Score: 0.956675089083096
   University Score: 4
   Employer Score: 6

5. Company 3
   Funding: $8501689
   Founder Location: 1
   Founder 