In [None]:
import os
from openai import AzureOpenAI
from azure.cosmos import CosmosClient, PartitionKey, exceptions
import pandas as pd
from uuid import uuid4
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

2025-05-29 13:29:16,392 - INFO - HTTP Request: POST https://linkedinsalesnav2.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"


[-0.006964331958442926, -0.005251791328191757, 0.011954205110669136, -0.02494264952838421, -0.024741174653172493, 0.03991898521780968, -0.010120779275894165, -0.009429046884179115, -0.013277227059006691, -0.009771555662155151, -0.011840036138892174, 0.007776949089020491, -0.01418386586010456, 0.007703074719756842, 0.01010734774172306, -0.005124190356582403, 0.022807011380791664, -0.0016210372559726238, 0.01505692582577467, -0.010187937878072262, 0.004848840646445751, 0.012464608997106552, 0.004774966277182102, 0.010684910230338573, -0.0064975805580616, -0.00044912216253578663, 0.005570793990045786, -0.012518336065113544, 0.016346368938684464, 0.004486185032874346, 0.006709129549562931, -0.007078501395881176, -0.015043494291603565, -0.006581528577953577, -0.018683481961488724, 0.004120171070098877, 0.00326054310426116, -0.018965547904372215, 0.03051680140197277, -0.00747473631054163, 0.008132888935506344, 0.009482773952186108, -0.0011030776659026742, -0.00041617255192250013, -0.00869030

In [None]:
cosmos_url =  os.getenv("COSMOS_ENDPOINT")
cosmos_key =  os.getenv("COSMOS_KEY")
cosmos_database_name = "linkedin_data"
cosmos_container_name = "linkedin_data_vector"

cosmos_client = CosmosClient(cosmos_url, cosmos_key)

try:
    cosmos_database = cosmos_client.create_database_if_not_exists(
        id=cosmos_database_name
    )
except exceptions.CosmosResourceExistsError:
    cosmos_database = cosmos_client.get_database_client(cosmos_database_name)

indexing_policy = {
    "includedPaths": [{"path": "/*"}],
    "excludedPaths": [{"path": '/"_etag"/?', "path": "/request_vector/*"}],
    "vectorIndexes": [{"path": "/request_vector", "type": "diskANN"}],
}

vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/request_vector",
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": 1536,  # This is the amount of dimensions for the text-embedding-ada-002 model
        }
    ]
}

try:
    cosmos_container = cosmos_database.create_container_if_not_exists(
        id=cosmos_container_name,
        partition_key=PartitionKey(path="/company"),
        indexing_policy=indexing_policy,
        vector_embedding_policy=vector_embedding_policy,
    )
    print(f"Container {cosmos_container_name} created")
except exceptions.CosmosHttpResponseError as e:
    print(f"Error creating container: {str(e)}")

2025-05-29 13:29:16,485 - INFO - Request URL: 'https://linkedinsales.documents.azure.com:443/'
Request method: 'GET'
Request headers:
    'Cache-Control': 'no-cache'
    'x-ms-version': 'REDACTED'
    'x-ms-documentdb-query-iscontinuationexpected': 'REDACTED'
    'x-ms-date': 'REDACTED'
    'authorization': 'REDACTED'
    'Accept': 'application/json'
    'Content-Length': '0'
    'User-Agent': 'azsdk-python-cosmos/4.9.0 Python/3.13.1 (Windows-11-10.0.26100-SP0)'
No body was attached to the request
2025-05-29 13:29:17,603 - INFO - Response status: 200
Response headers:
    'Content-Length': '1616'
    'Date': 'Thu, 29 May 2025 05:29:17 GMT'
    'Content-Type': 'application/json'
    'Server': 'Microsoft-HTTPAPI/2.0'
    'x-ms-gatewayversion': 'REDACTED'
    'Cache-Control': 'no-store, no-cache'
    'Pragma': 'no-cache'
    'x-ms-max-media-storage-usage-mb': 'REDACTED'
    'x-ms-media-storage-usage-mb': 'REDACTED'
    'x-ms-databaseaccount-consumed-mb': 'REDACTED'
    'x-ms-databaseaccou

Container linkedin_data_vector created


In [27]:
def generate_openai_embeddings(input_text):
    client = AzureOpenAI(
    api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version = "2025-01-01-preview",
    azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
    )

    response = client.embeddings.create(
        input = input_text,
        model= "text-embedding-ada-002"
    )

    return response.data[0].embedding

In [None]:
df = pd.read_csv("new_output.csv")
logger.info(f"Read {len(df)} records from CSV")
        
# Clean the data - fill NaN values
df = df.fillna("")

# Process and upload each record
successful_uploads = 0
failed_uploads = 0

for _, row in df.iterrows():
    try:
        # Create document structure
        document = {
            'id': str(uuid4()),  # Generate unique ID
            'full_name': row['full_name'],
            'full_name_url': row['full_name_url'],
            'role': row['role'],
            'company': row['company'] if row['company'] else "Unknown",  # Ensure partition key exists
            'time': int(row['time']) if row['time'] and pd.notna(row['time']) else 0,
            'activity': row['activity'],
            'interests': row['interests'],
            'experience_overview': row['experience_overview'],
            'experience_details': row['experience_details'],
            "request_vector": generate_openai_embeddings(row['company'])
        }
        
        # Upload document to CosmosDB
        cosmos_container.create_item(body=document)
        successful_uploads += 1
        
        # Log progress for every 10 records
        if successful_uploads % 10 == 0:
            logger.info(f"Progress: {successful_uploads} records uploaded")
        
    except Exception as e:
        logger.error(f"Error uploading record for {row['full_name']}: {str(e)}")
        failed_uploads += 1

logger.info(f"Import completed: {successful_uploads} records uploaded successfully, {failed_uploads} records failed")


2025-05-29 13:29:20,283 - INFO - Read 3568 records from CSV
2025-05-29 13:29:22,542 - INFO - HTTP Request: POST https://linkedinsalesnav2.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2025-01-01-preview "HTTP/1.1 200 OK"
2025-05-29 13:29:22,549 - INFO - Request URL: 'https://linkedinsales-westus3.documents.azure.com:443/dbs/linkedin_data/colls/linkedin_data_vector/docs/'
Request method: 'POST'
Request headers:
    'Cache-Control': 'no-cache'
    'x-ms-version': 'REDACTED'
    'x-ms-documentdb-query-iscontinuationexpected': 'REDACTED'
    'x-ms-consistency-level': 'REDACTED'
    'x-ms-session-token': 'REDACTED'
    'x-ms-documentdb-partitionkey': 'REDACTED'
    'x-ms-date': 'REDACTED'
    'authorization': 'REDACTED'
    'Content-Type': 'application/json'
    'Accept': 'application/json'
    'x-ms-cosmos-intended-collection-rid': 'REDACTED'
    'Content-Length': '33295'
    'User-Agent': 'azsdk-python-cosmos/4.9.0 Python/3.13.1 (Windows-11-10.0.26100-S