### Importing Libraries and Loading Keys

In [6]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    ScoringProfile,
    TextWeights
)

from dotenv import dotenv_values
from openai import OpenAI

import csv

In [7]:
# Settings are read from a local .env file rather than being hard-coded in the
# notebook. Each config[...] lookup below fails immediately if its entry is
# absent from the file.
env_name = ".env"
config = dotenv_values(env_name)

# Azure AI Search Details
service_endpoint = config["search_endpoint"]  # passed to SearchClient / SearchIndexClient
index_name = config["index_name"]  # name of the startups index
regions_index = config["regions_index"]  # name of the regions index
search_key = config["search_api_key"]  # wrapped in AzureKeyCredential
stages = config["stages"]  # comma-separated stage names; split(",") when building stage_dict

# OpenAI Details
openapi_key = config["openapi_key"]  # handed to OpenAI(api_key=...) in get_embedding


### Prepare Data Source

In [8]:
# Converts the CSV file into a text file of Python dict literals (one per
# startup) in the format used by get_startups(). The get_stage(...) and
# get_embedding(...) calls are written out as source text here; they only run
# once the generated lines are executed as code.

startupscsv = config["startupscsv"]

# newline="" lets the csv module do its own newline handling, which is what it
# needs to parse quoted fields containing line breaks correctly.
with open(startupscsv, "r", newline="") as csvFile, open("startups.txt", "w") as txtFile:
    header = ("Name", "StartupName", "Location", "StartupStage", "Industry")
    reader = csv.DictReader(csvFile, header)
    next(reader, None)  # skip the file's own header row; keys come from `header`
    txtFile.write("[" + "\n")

    for row in reader:
        # The header is line 1 of the file, so the first data row gets ID "1".
        newDict = {"StartupID": f"{reader.line_num-1}"}
        newDict.update(row)

        # str() keeps the previous rendering for short rows, where DictReader
        # fills the missing columns with None.
        newDict_location = str(newDict["Location"])
        newDict_seedstage = str(newDict["StartupStage"])
        newDict_industry = str(newDict["Industry"])
        stage_label = f"{newDict_seedstage} Stage"

        # repr() chooses the quoting and escaping, so values containing an
        # apostrophe or backslash still produce valid Python source (wrapping
        # them in hand-written single quotes did not).
        computed_fields = (
            f"'StageNumber': get_stage({newDict_seedstage!r}), "
            f"'LocationVector': get_embedding({newDict_location!r}), "
            f"'StartupStageVector': get_embedding({stage_label!r}), "
            f"'IndustryVector': get_embedding({newDict_industry!r})"
        )

        # Take the dict repr without its closing brace and append the computed
        # entries before re-closing it.
        startup = f"{newDict}"[:-1] + ", " + computed_fields + "}"
        txtFile.write(startup + ',\n')

    txtFile.write("]")


In [10]:
# startups.txt holds the dict literals generated from the CSV in the previous cell.
# To use your own data, replace the value of the `startups` variable inside
# get_startups() with that generated text; otherwise the pre-loaded sample
# values are uploaded.

# Read the whole file as one string (no placeholder list: the value is a str).
with open("./startups.txt", "r") as file:
    startups_data = file.read()

### Functions

In [11]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: String to embed. Newlines are replaced with spaces before sending.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    # A new client is constructed on every call, using the key from the config cell.
    openai_client = OpenAI(api_key=openapi_key)

    flattened = text.replace("\n", " ")
    response = openai_client.embeddings.create(input=[flattened], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [37]:
# Map each configured stage name to its position in the comma-separated
# `stages` setting. Names are lower-cased and stripped of surrounding
# whitespace so that a setting such as "Seed, Pre-A" still matches "Pre-A".
stage_dict = {
    stage.strip().lower(): position
    for position, stage in enumerate(stages.split(","))
}


def get_stage(startup_stage: str):
    """Return the numeric position of a startup stage.

    The lookup ignores letter case and surrounding whitespace.

    Args:
        startup_stage: Stage name, e.g. one of the values listed in `stages`.

    Returns:
        The zero-based index of the stage within the `stages` setting.

    Raises:
        ValueError: If the stage is not one of the configured stages.
    """
    stage_lower = startup_stage.strip().lower()
    if stage_lower in stage_dict:
        return stage_dict[stage_lower]
    raise ValueError(
        "Please check your startup stage inputs: "
        f"{startup_stage!r} is not one of {sorted(stage_dict)}"
    )

In [13]:
def get_startups_index(name: str):
    """Build the SearchIndex definition for the startups index.

    The schema contains a string key (StartupID), two plain text fields,
    three text fields marked sortable/filterable/facetable, an Int32
    StageNumber field, and three 1536-dimension vector fields that share one
    vector search profile. A semantic configuration over Location,
    StartupStage and Industry is attached. No scoring profile is attached.

    Args:
        name: Name to give the index.

    Returns:
        A SearchIndex object; nothing is sent to the service here.
    """
    vector_profile_name = "slt-vector-config"
    algorithm_config_name = "slt-algorithms-config"

    key_field = SimpleField(
        name="StartupID",
        type=SearchFieldDataType.String,
        key=True,
    )

    plain_text_fields = [
        SearchableField(name=column, type=SearchFieldDataType.String)
        for column in ("Name", "StartupName")
    ]

    # NOTE(review): confirm that `normalizer` is the keyword the installed SDK
    # expects here; the SearchField model exposes `normalizer_name`, so this
    # argument may be silently ignored.
    category_fields = [
        SearchableField(
            name=column,
            type=SearchFieldDataType.String,
            sortable=True,
            filterable=True,
            facetable=True,
            normalizer='lowercase',
        )
        for column in ("Location", "StartupStage", "Industry")
    ]

    stage_number_field = SearchField(
        name="StageNumber",
        type=SearchFieldDataType.Int32,
        filterable=True,
        facetable=True,
    )

    # One vector field per embedded attribute, all with the same settings.
    embedding_fields = [
        SearchField(
            name=column,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name=vector_profile_name,
        )
        for column in ("LocationVector", "StartupStageVector", "IndustryVector")
    ]

    fields = [
        key_field,
        *plain_text_fields,
        *category_fields,
        stage_number_field,
        *embedding_fields,
    ]

    hnsw_algorithm = HnswAlgorithmConfiguration(name=algorithm_config_name)
    profile = VectorSearchProfile(
        name=vector_profile_name,
        algorithm_configuration_name=algorithm_config_name,
    )
    vector_search = VectorSearch(profiles=[profile], algorithms=[hnsw_algorithm])

    prioritized = SemanticPrioritizedFields(
        content_fields=[
            SemanticField(field_name=column)
            for column in ("Location", "StartupStage", "Industry")
        ]
    )
    semantic_search = SemanticSearch(
        configurations=[
            SemanticConfiguration(
                name="slt-semantic-config",
                prioritized_fields=prioritized,
            )
        ]
    )

    return SearchIndex(
        name=name,
        fields=fields,
        vector_search=vector_search,
        semantic_search=semantic_search,
    )

In [14]:
def get_regions_index(name: str):
    """Build the SearchIndex definition for the regions index.

    The schema contains a string key (id), two text fields (grouping_code,
    grouping_name) marked sortable/filterable/facetable, and a string
    collection field (countries). A semantic configuration over
    grouping_name is attached.

    Args:
        name: Name to give the index.

    Returns:
        A SearchIndex object; nothing is sent to the service here.
    """
    id_field = SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
    )

    # NOTE(review): confirm that `normalizer` is the keyword the installed SDK
    # expects here; the SearchField model exposes `normalizer_name`, so this
    # argument may be silently ignored.
    grouping_fields = [
        SearchableField(
            name=column,
            type=SearchFieldDataType.String,
            sortable=True,
            filterable=True,
            facetable=True,
            normalizer='lowercase',
        )
        for column in ("grouping_code", "grouping_name")
    ]

    countries_field = SimpleField(
        name="countries",
        type=SearchFieldDataType.Collection(SearchFieldDataType.String),
    )

    regions_semantic_config = SemanticConfiguration(
        name="slt-regions-semantic-config",
        prioritized_fields=SemanticPrioritizedFields(
            content_fields=[SemanticField(field_name="grouping_name")]
        ),
    )
    semantic_search = SemanticSearch(configurations=[regions_semantic_config])

    return SearchIndex(
        name=name,
        fields=[id_field, *grouping_fields, countries_field],
        semantic_search=semantic_search,
    )

In [15]:
def get_startups():
  """Return the pre-loaded sample startup documents for the startups index.

  Each document carries the literal fields (StartupID, Name, StartupName,
  Location, StartupStage, Industry) plus four computed ones: StageNumber via
  get_stage() and LocationVector / StartupStageVector / IndustryVector via
  get_embedding(). The computed values are evaluated every time this function
  is called, i.e. three get_embedding() requests per document.

  To upload your own data, replace the list literal below with the contents
  of the generated startups.txt file.
  """
  startups = [
{'StartupID': '1', 'Name': 'Aliah', 'StartupName': 'Startup 1', 'Location': 'Singapore', 'StartupStage': 'Seed', 'Industry': 'Material Technology', 'StageNumber': get_stage('Seed'), 'LocationVector': get_embedding('Singapore'), 'StartupStageVector': get_embedding('Seed Stage'), 'IndustryVector': get_embedding('Material Technology')},
{'StartupID': '2', 'Name': 'Bailey', 'StartupName': 'Startup 2', 'Location': 'Ireland', 'StartupStage': 'Seed', 'Industry': 'Material Technology', 'StageNumber': get_stage('Seed'), 'LocationVector': get_embedding('Ireland'), 'StartupStageVector': get_embedding('Seed Stage'), 'IndustryVector': get_embedding('Material Technology')},
{'StartupID': '3', 'Name': 'Clara', 'StartupName': 'Startup 3', 'Location': 'South Korea', 'StartupStage': 'Pre-A', 'Industry': 'Women issues, Femtech', 'StageNumber': get_stage('Pre-A'), 'LocationVector': get_embedding('South Korea'), 'StartupStageVector': get_embedding('Pre-A Stage'), 'IndustryVector': get_embedding('Women issues, Femtech')},
{'StartupID': '4', 'Name': 'Diane', 'StartupName': 'Startup 4', 'Location': 'Africa', 'StartupStage': 'Seed', 'Industry': 'Logistic technology', 'StageNumber': get_stage('Seed'), 'LocationVector': get_embedding('Africa'), 'StartupStageVector': get_embedding('Seed Stage'), 'IndustryVector': get_embedding('Logistic technology')},
{'StartupID': '5', 'Name': 'Eve', 'StartupName': 'Startup 5', 'Location': 'Turkey', 'StartupStage': 'Pre-A', 'Industry': 'Sustainable technology', 'StageNumber': get_stage('Pre-A'), 'LocationVector': get_embedding('Turkey'), 'StartupStageVector': get_embedding('Pre-A Stage'), 'IndustryVector': get_embedding('Sustainable technology')},
{'StartupID': '6', 'Name': 'Farida', 'StartupName': 'Startup 6', 'Location': 'Egypt', 'StartupStage': 'Seed', 'Industry': 'Healthcare Technology', 'StageNumber': get_stage('Seed'), 'LocationVector': get_embedding('Egypt'), 'StartupStageVector': get_embedding('Seed Stage'), 'IndustryVector': get_embedding('Healthcare Technology')},
{'StartupID': '7', 'Name': 'Greta', 'StartupName': 'Startup 7', 'Location': 'United Kingdom', 'StartupStage': 'Pre-A', 'Industry': 'Material Technology, Deeptech', 'StageNumber': get_stage('Pre-A'), 'LocationVector': get_embedding('United Kingdom'), 'StartupStageVector': get_embedding('Pre-A Stage'), 'IndustryVector': get_embedding('Material Technology, Deeptech')},
{'StartupID': '8', 'Name': 'Hailey', 'StartupName': 'Startup 8', 'Location': 'Thailand', 'StartupStage': 'Seed', 'Industry': 'Human Resource Technology', 'StageNumber': get_stage('Seed'), 'LocationVector': get_embedding('Thailand'), 'StartupStageVector': get_embedding('Seed Stage'), 'IndustryVector': get_embedding('Human Resource Technology')},
{'StartupID': '9', 'Name': 'Irina', 'StartupName': 'Startup 9', 'Location': 'Australia', 'StartupStage': 'Seed', 'Industry': 'Education Technology', 'StageNumber': get_stage('Seed'), 'LocationVector': get_embedding('Australia'), 'StartupStageVector': get_embedding('Seed Stage'), 'IndustryVector': get_embedding('Education Technology')},
{'StartupID': '10', 'Name': 'Jina', 'StartupName': 'Startup 10', 'Location': 'India', 'StartupStage': 'Seed', 'Industry': 'Medical Technology', 'StageNumber': get_stage('Seed'), 'LocationVector': get_embedding('India'), 'StartupStageVector': get_embedding('Seed Stage'), 'IndustryVector': get_embedding('Medical Technology')},
]
  return startups

In [16]:
def vector_search(query):
    """Query the startups index with text plus a vector and print each result.

    The query string is sent as the search text and is also embedded with
    get_embedding() to form a vector query against the IndustryVector field
    (10 nearest neighbours). The request uses the "semantic" query type with
    the "slt-semantic-config" configuration and selects only the descriptive
    columns, not the vectors.

    Args:
        query: Free-text search string.

    Returns:
        None. Results are printed one per line.
    """
    credential = AzureKeyCredential(search_key)
    search_client = SearchClient(service_endpoint, index_name, credential)

    industry_query = VectorizedQuery(
        vector=get_embedding(query),
        k_nearest_neighbors=10,
        fields="IndustryVector",
    )
    selected_columns = ["Name", "StartupName", "Location", "StartupStage", "Industry"]

    hits = search_client.search(
        query,
        vector_queries=[industry_query],
        query_type="semantic",
        semantic_configuration_name="slt-semantic-config",
        select=selected_columns,
    )
    for hit in hits:
        print(hit)
    

In [17]:
def regions_search(query):
    """Query the regions index and print the top result.

    The request uses the "semantic" query type with the
    "slt-regions-semantic-config" configuration, asks for a single result
    (top=1) and selects only grouping_name and countries.

    Args:
        query: Free-text search string.

    Returns:
        None. Results are printed one per line.
    """
    credential = AzureKeyCredential(search_key)
    search_client = SearchClient(service_endpoint, regions_index, credential)

    search_options = {
        "top": 1,
        "query_type": "semantic",
        "semantic_configuration_name": "slt-regions-semantic-config",
        "select": ["grouping_name", "countries"],
    }
    for hit in search_client.search(query, **search_options):
        print(hit)

### Execute

In [39]:
# edit regions.py to add or remove country regions/continents
from regions import regions

# Builds both indexes, uploads their documents, then runs one sample query
# against each index.
if __name__ == "__main__":
    credential = AzureKeyCredential(search_key)
    index_client = SearchIndexClient(service_endpoint, credential)

    # Create startups index.
    # create_or_update_index is used instead of create_index so that this cell
    # can be re-run: create_index fails once the index already exists.
    index = get_startups_index(index_name)
    index_client.create_or_update_index(index)

    # Upload startups documents
    client = SearchClient(service_endpoint, index_name, credential)
    startups = get_startups()
    client.upload_documents(documents=startups)

    # Create regions index (same re-run consideration as above)
    index_r = get_regions_index(regions_index)
    index_client.create_or_update_index(index_r)

    # Upload regions documents; `client` now points at the regions index
    client = SearchClient(service_endpoint, regions_index, credential)
    client.upload_documents(documents=regions)

    # Search the index
    vector_search("Deeptech, Medtech")

    # Search regions index
    regions_search("Western")
