## Setup

Required libs:
* ibm-watsonx-ai for accessing the watsonx Granite language model.
* llama-index-llms-ibm for communicating with watsonx.ai models through LlamaIndex, using watsonx.ai's LLM API.
* llama-index-embeddings-ibm for using watsonx.ai's embedding models.
* llama-index for using LlamaIndex framework relevant features.

Install required libs:

In [None]:
%%capture
!pip install ibm-watsonx-ai==1.1.2
!pip install --user llama-index==0.11.8
!pip install llama-index-core==0.11.8
!pip install llama-index-llms-ibm==0.2.0
!pip install llama-index-embeddings-ibm==0.2.0
!pip install llama-index-readers-web==0.2.2
!pip install llama-hub==0.0.79.post1
!pip install requests==2.32.2

Import required libs:

In [None]:
import os
import time
import json
import requests
import logging
import sys
import warnings

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Send INFO-level (and above) log records to stdout so they appear in the notebook output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# NOTE(review): this attaches another stdout handler to the root logger; if basicConfig above
# already installed one, every record is printed twice — confirm this is intended.
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
# Only let ERROR-level (and above) records through from the "ibm_watsonx_ai" logger.
logging.getLogger("ibm_watsonx_ai").setLevel(logging.ERROR)

# IBM Watsonx API Client and Credentials handling
from ibm_watsonx_ai import APIClient, Credentials
from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes, DecodingMethods
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

# Core components for handling documents, embeddings, and indices
from llama_index.core import Document, VectorStoreIndex, PromptTemplate, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

# LlamaIndex IBM-specific components for LLMs and embeddings
from llama_index.llms.ibm import WatsonxLLM
from llama_index.embeddings.ibm import WatsonxEmbeddings

# For displaying rich content in Jupyter notebooks (Markdown, etc.)
from IPython.display import display, Markdown

# Disable warnings for a cleaner notebook or console experience
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Route every subsequent ``warnings.warn(...)`` call to the no-op above.
warnings.warn = warn

## Extracting LinkedIn profile data

[ProxyCurl](https://nubela.co/proxycurl/) is a robust API that allows developers to extract information from various websites, including social media platforms like LinkedIn.
ProxyCurl has no free version; if you do not have an API key, use the mocked data instead.

While LlamaIndex provides a built-in Web Page Reader for reading websites, it cannot extract LinkedIn data. To overcome this, we use the ProxyCurl API, which provides a reliable and efficient way to extract LinkedIn profile data.



In [None]:
# with ProxyCurl
# NOTE: the placeholder below is a non-empty string, so it passes the
# `if not PROXYCURL_API_KEY` check in extract_linkedin_profile; replace it with a
# real key before calling with mock=False.
PROXYCURL_API_KEY = "# Replace with your API Key" 

# Profile loader: calls the Proxycurl API, or downloads a premade JSON file when mock=True
def extract_linkedin_profile(linkedin_profile_url: str, PROXYCURL_API_KEY: str = None, mock: bool = False) -> dict:
    """Extract LinkedIn profile data using the Proxycurl API, or load a premade JSON file if mock is True.

    Args:
        linkedin_profile_url: URL of the LinkedIn profile to look up. Ignored when
            ``mock`` is True (the premade JSON file is downloaded instead).
        PROXYCURL_API_KEY: Proxycurl API key. Required when ``mock`` is False. It may be
            given with or without the ``Bearer `` prefix.
        mock: When True, download the premade JSON profile instead of calling Proxycurl.

    Returns:
        The profile as a dict with empty values (``[]``, ``""``, ``None``) and the
        ``people_also_viewed`` / ``certifications`` fields removed, and with
        ``profile_pic_url`` stripped from every entry of ``groups``.
        An empty dict is returned when the request fails, the status code is not 200,
        or the body is not a JSON object.

    Raises:
        ValueError: If ``mock`` is False and no API key is provided.
    """
    start_time = time.time()

    if mock:
        print("Using mock data from a premade JSON file...")
        request_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ZRe59Y_NJyn3hZgnF1iFYA/linkedin-profile-data.json"
        request_options = {"timeout": 30}
    else:
        # Ensure API key is provided when mock is False
        if not PROXYCURL_API_KEY:
            raise ValueError("PROXYCURL_API_KEY is required when mock is set to False.")

        print("Starting to extract the LinkedIn profile...")

        # Proxycurl authenticates with "Authorization: Bearer <key>"; add the scheme
        # unless the caller already included it.
        token = PROXYCURL_API_KEY.strip()
        if not token.lower().startswith("bearer "):
            token = f"Bearer {token}"

        request_url = "https://nubela.co/proxycurl/api/v2/linkedin"
        request_options = {
            "headers": {"Authorization": token},
            "params": {
                "url": linkedin_profile_url,
                "fallback_to_cache": "on-error",
                "use_cache": "if-present",
                "skills": "include",
                "inferred_salary": "include",
                "personal_email": "include",
                "personal_contact_number": "include",
            },
            "timeout": 10,
        }

        print(f"Sending API request to Proxycurl at {time.time() - start_time:.2f} seconds...")

    # Network failures (timeouts, DNS, connection errors) follow the same
    # "print and return {}" contract as a non-200 response.
    try:
        response = requests.get(request_url, **request_options)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data. Request error: {exc}")
        return {}

    print(f"Received response at {time.time() - start_time:.2f} seconds...")

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        print(f"Response: {response.text}")
        return {}

    try:
        data = response.json()
    except ValueError as exc:  # body was not valid JSON
        print(f"Failed to parse response as JSON: {exc}")
        return {}

    if not isinstance(data, dict):
        print("Unexpected response format: expected a JSON object.")
        return {}

    # Clean the data: remove empty values and unwanted fields
    excluded_fields = ("people_also_viewed", "certifications")
    data = {
        k: v
        for k, v in data.items()
        if v not in ([], "", None) and k not in excluded_fields
    }

    # Drop picture URLs from groups; tolerate entries that lack the key.
    groups = data.get("groups")
    if isinstance(groups, list):
        for group_dict in groups:
            if isinstance(group_dict, dict):
                group_dict.pop("profile_pic_url", None)

    return data

# Example profile to look up.
profile_url = "https://www.linkedin.com/in/leonkatsnelson/"

# with ProxyCurl
# NOTE(review): this call runs unconditionally with mock=False, and its result is
# overwritten by the mocked call below — keep only the variant you intend to use.
profile_data = extract_linkedin_profile(linkedin_profile_url=profile_url, PROXYCURL_API_KEY=PROXYCURL_API_KEY, mock=False)
# with mocked data (the URL argument is ignored when mock=True)
profile_data = extract_linkedin_profile(linkedin_profile_url="dummy_url", mock=True)

# Show the resulting profile dictionary as the cell output.
profile_data

Splitting into nodes

In [None]:
def split_profile_data(profile_data, chunk_size=500):
    """Split the LinkedIn profile JSON data into nodes.

    Args:
        profile_data: JSON-serializable profile data (the dict returned by the
            extraction step).
        chunk_size: Target maximum size of each node, measured in tokens (not
            characters). Defaults to 500. SentenceSplitter's default chunk overlap
            still applies, so very small values may be rejected by the splitter.

    Returns:
        A list of nodes, each holding one chunk of the serialized profile. An empty
        list is returned if any step fails (the error is printed).
    """
    try:
        # The profile arrives as a Python structure; serialize it to a JSON string so it
        # can be treated as plain text by the splitter.
        profile_json = json.dumps(profile_data)

        # The splitter operates on `Document` objects, so wrap the text in one.
        document = Document(text=profile_json)

        # `SentenceSplitter` breaks the document into chunks ("nodes") of at most
        # `chunk_size` tokens, preferring sentence boundaries where it can.
        splitter = SentenceSplitter(chunk_size=chunk_size)

        # These nodes are what later gets embedded and stored in the vector index.
        return splitter.get_nodes_from_documents([document])

    # Best-effort by design: report the problem and return an empty list so the
    # caller can decide how to proceed.
    except Exception as e:
        print(f"Error in split_profile_data: {e}")
        return []

# Split the profile fetched above into nodes (an empty list means splitting failed).
nodes = split_profile_data(profile_data)

print(f"Number of nodes created: {len(nodes)}")

# Print the first few nodes for inspection
for i, node in enumerate(nodes[:5]):
    print(f"\nNode {i+1}:")
    print(node.get_text())

Indexing and Storing

In [None]:
def create_watsonx_embedding():
    """Create the IBM watsonx embedding model used to vectorize the profile nodes.

    Returns:
        A ``WatsonxEmbeddings`` instance for the ``ibm/slate-125m-english-rtrvr`` model
        on the us-south watsonx.ai endpoint, using the ``skills-network`` project.
    """
    watsonx_embedding = WatsonxEmbeddings(
        model_id="ibm/slate-125m-english-rtrvr",
        url="https://us-south.ml.cloud.ibm.com",
        project_id="skills-network",
        # Truncate each input to the model's 512-token input limit. The previous value
        # of 3 kept only the first three tokens of every chunk, so embeddings carried
        # almost none of the chunk's content and similarity search was unreliable.
        truncate_input_tokens=512,
    )
    return watsonx_embedding

def vector_database(nodes):
    """Embed the given nodes and store them in a vector index.

    Returns the resulting ``VectorStoreIndex``, or ``None`` if embedding or indexing
    fails (the error is printed).
    """
    try:
        # Embedding model that turns each node's text into a vector.
        embedder = create_watsonx_embedding()

        # Build the index: embeds every node and stores the vectors.
        # show_progress=False hides the progress bar; set it to True to watch progress.
        return VectorStoreIndex(
            nodes=nodes,
            embed_model=embedder,
            show_progress=False,
        )
    except Exception as err:
        # Report the failure and signal it to the caller with None.
        print(f"Error in vector_database: {err}")
        return None

vectordb_index = vector_database(nodes)

# vector_database() returns None on failure, so check for that explicitly.
if vectordb_index is not None:
    print("Vector database created successfully.")
else:
    print("Failed to create vector database.")

# Inspect embeddings — only when an index exists. Running this on a None index
# would raise AttributeError.
if vectordb_index is not None:
    vector_store = vectordb_index.storage_context.vector_store
    node_ids = list(vectordb_index.index_struct.nodes_dict)
    missing_embeddings = False

    for node_id in node_ids:
        try:
            embedding = vector_store.get(node_id)
        except KeyError:
            # Some vector stores raise KeyError for an unknown id instead of returning None.
            embedding = None

        if embedding is None:
            print(f"Node ID {node_id} has a None embedding.")
            missing_embeddings = True
        else:
            print(f"Node ID {node_id} has a valid embedding.")

    if missing_embeddings:
        print("Some node embeddings are missing. Please check the embedding generation step.")
    else:
        print("All node embeddings are valid.")

Querying with prompt templates

In [None]:
# Prompt for the opening "3 interesting facts" summary. It only contains a
# {context_str} placeholder, so the query text does not appear in the prompt itself.
# Presumably the query engine fills {context_str} with the retrieved chunks — confirm
# against the LlamaIndex text_qa_template documentation.
initial_facts_template = """
You are an AI assistant that provides detailed answers based on the provided context.

Context information is below:

{context_str}

Based on the context provided, list 3 interesting facts about this person's career or education.

Answer in detail, using only the information provided in the context.
"""
initial_facts_prompt = PromptTemplate(template=initial_facts_template)


# Prompt for free-form user questions: takes both {context_str} and {query_str}, and
# instructs the model to say it does not know when the context lacks the answer.
user_question_template = """
You are an AI assistant that provides detailed answers to questions based on the provided context.

Context information is below:

{context_str}

Question: {query_str}

Answer in full details, using only the information provided in the context.If the answer is not available in the context, say "I don't know. The information is not available on the LinkedIn page."
"""
user_question_prompt = PromptTemplate(template=user_question_template)

def generate_initial_facts(index):
    """Ask the LLM for three interesting career/education facts drawn from the index.

    Builds a Granite LLM and a query engine over ``index`` that uses the
    ``initial_facts_prompt`` template, runs a fixed query, and returns the text of the
    generated response.
    """
    # Sampling-based decoding settings passed through to the model.
    sampling_params = {
        "decoding_method": "sample",  # Sample from the token distribution rather than greedy decoding.
        "min_new_tokens": 1,          # Generate at least one token.
        "top_k": 50,                  # Restrict each step to the 50 most likely tokens.
        "top_p": 1,                   # Nucleus cutoff of 1, i.e. no additional filtering.
    }

    # Granite model on watsonx.ai: temperature 0.0, at most 500 newly generated tokens.
    granite_llm = WatsonxLLM(
        model_id="ibm/granite-3-8b-instruct",
        url="https://us-south.ml.cloud.ibm.com",
        project_id="skills-network",
        temperature=0.0,
        max_new_tokens=500,
        additional_params=sampling_params,
    )

    # Non-streaming query engine over the 5 most similar nodes, answering with the
    # initial-facts prompt template.
    facts_engine = index.as_query_engine(
        streaming=False,
        similarity_top_k=5,
        llm=granite_llm,
        text_qa_template=initial_facts_prompt,
    )

    # Run the fixed query and hand back only the generated text.
    result = facts_engine.query(
        "Provide three interesting facts about this person's career or education."
    )
    return result.response

from llama_index.llms.ibm import WatsonxLLM

def answer_user_query(index, user_query):
    """Answer the user's question using the vector database and the LLM.

    Args:
        index: Vector index to query (as returned by ``vector_database``).
        user_query: The user's question.

    Returns:
        The query engine's response object on success. If anything fails, the error is
        printed and the plain string ``"Failed to get an answer."`` is returned instead,
        so callers should not assume the result has a ``.response`` attribute.
    """
    try:
        # Temperature 0.0 with greedy decoding for deterministic, predictable answers.
        temperature = 0.0
        # Limit the number of new tokens generated in the response to 250.
        max_new_tokens = 250
        additional_params = {
            "decoding_method": "greedy",
            "min_new_tokens": 1,
            "top_k": 50,
            "top_p": 1,
        }

        # Initialize the WatsonxLLM instance for the ibm/granite-3-8b-instruct model
        watsonx_llm = WatsonxLLM(
            model_id="ibm/granite-3-8b-instruct",
            url="https://us-south.ml.cloud.ibm.com",
            project_id="skills-network",
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            additional_params=additional_params,
        )

        # The query engine retrieves the top 5 similar nodes itself, so no separate
        # retriever call is made here. The previous manual retrieval built a context
        # string that was never used and doubled the retrieval work per question.
        query_engine = index.as_query_engine(
            streaming=False,                        # Get the complete response at once.
            similarity_top_k=5,                     # Use the top 5 similar items from the index.
            llm=watsonx_llm,                        # Use the Watsonx LLM with the IBM Granite model.
            text_qa_template=user_question_prompt,  # Template guiding how the LLM forms the answer.
        )

        # Execute the query with the user's question and return the LLM's answer.
        return query_engine.query(user_query)

    except Exception as e:
        # Handle exceptions gracefully and report the error.
        print(f"Error in answer_user_query: {e}")
        return "Failed to get an answer."

# Smoke test: generate the initial facts from the vector index built above
initial_facts = generate_initial_facts(vectordb_index)
print("\nHere are 3 interesting facts about this person:")
print(initial_facts)

# Ask a sample question against the same index and print the answer
user_query = "What is this person's current job title?"
response = answer_user_query(vectordb_index, user_query)
print(response)

## Building a chatbot interface

In [None]:
def chatbot_interface(index):
    """Provide a simple chatbot interface for user interaction.

    Reads questions from standard input in a loop and prints the answer produced by
    ``answer_user_query``. The loop ends when the user types 'exit', 'quit' or 'bye'
    (case-insensitive, surrounding whitespace ignored), or when input is closed or
    interrupted. Blank input is skipped.

    Args:
        index: Vector index passed through to ``answer_user_query``.
    """
    print("\nYou can now ask more in-depth questions about this person. Type 'exit', 'quit' or 'bye' to quit.")

    while True:
        try:
            user_query = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the chat instead of raising.
            print("\nBot: Goodbye!")
            break

        if not user_query:
            # Nothing to ask; prompt again without calling the LLM.
            continue

        if user_query.lower() in ('exit', 'quit', 'bye'):
            print("Bot: Goodbye!")
            break

        print("Bot is typing...", end='')
        sys.stdout.flush()
        time.sleep(1)  # Simulate typing delay
        print('\r', end='')

        response = answer_user_query(index, user_query)
        # answer_user_query returns a response object on success but a plain string on
        # failure; str() yields printable text in both cases.
        print(f"Bot: {str(response).strip()}\n")

def process_linkedin(linkedin_url, PROXYCURL_API_Key=None, mock=False):
    """
    Processes a LinkedIn URL, extracts data from the profile, and interacts with the user.

    Each pipeline stage is checked before moving on; if a stage yields nothing, a
    message naming that stage is printed and the function returns early.

    Parameters:
    - linkedin_url (str): The LinkedIn profile URL to extract or load mock data from.
    - PROXYCURL_API_Key (str, optional): Proxycurl API key. Required if mock is False.
    - mock (bool, optional): If True, loads mock data from a premade JSON file instead of using the API.

    Returns:
    - None. All results and errors are printed.
    """
    try:
        # Extract the profile (with or without the API depending on the mock flag)
        profile_data = extract_linkedin_profile(linkedin_url, PROXYCURL_API_Key, mock=mock)
        if not profile_data:
            print("Failed to retrieve profile data.")
            return

        # Split the data into nodes; an empty list means splitting failed.
        nodes = split_profile_data(profile_data)
        if not nodes:
            print("Failed to split profile data into nodes.")
            return

        # Store in vector database; None means embedding/indexing failed.
        vectordb_index = vector_database(nodes)
        if vectordb_index is None:
            print("Failed to create vector database.")
            return

        # Generate and display the initial facts
        initial_facts = generate_initial_facts(vectordb_index)

        print("\nHere are 3 interesting facts about this person:")
        print(initial_facts)

        # Start the chatbot interface
        chatbot_interface(vectordb_index)

    except Exception as e:
        # Top-level boundary of the pipeline: report instead of propagating.
        print(f"Error occurred: {str(e)}")

# Start the interactive chat against the index built earlier in the notebook.
chatbot_interface(vectordb_index)