In [1]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import FewShotChatMessagePromptTemplate
from langchain_openai import ChatOpenAI
import os

from langchain_chroma import Chroma
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_openai import OpenAIEmbeddings

from sklearn.metrics import mean_squared_error 

import pandas as pd
import numpy as np
import re
import neptune

from dotenv import load_dotenv

load_dotenv()

True

# Example

In [3]:
# Minimal demo: a prompt template, a chat model and a string parser piped together.
prompt = ChatPromptTemplate.from_template("tell me a short joke about {topic}")
model = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
)
output_parser = StrOutputParser()

# The pipe operator composes the three steps into a single runnable chain.
chain = prompt | model | output_parser

chain.invoke({"topic": "bread"})

'Why did the loaf of bread go to therapy? It had a lot of emotional baggage!'

# Parameters

In [4]:
# --- Experiment configuration: everything a reader might tune lives here ---
post_type = 'post_travel' # 'post_travel' or 'post_abortion'
narcism_type = 'adm' # 'riv' or 'adm'
# model chosen from https://platform.openai.com/docs/models/continuous-model-upgrades
model_used = 'gpt-3.5-turbo-0125' # 'gpt-3.5-turbo-0125', 'gpt-4-1106-preview' or 'gpt-4-turbo-2024-04-09'
iterations = 10
number_of_shots = 5 # somewhere between 3 and 10
scale_to_int = False

# System prompts for the two answer formats.
# In integer mode the scores are multiplied by 9 further down, so the original
# 1..6 range becomes 9..54 - the prompt has to advertise that same range.
# NOTE(review): both prompts mention "recent travel" even though post_type may
# also be 'post_abortion' - confirm whether the wording should follow post_type.
MODEL_ROLE_INT = "You are a psychologist and you are assessing a patient's Narcissism. The patient is talking about their recent travel. Return only int number between 9 and 54."
MODEL_ROLE_FLOAT = "You are a psychologist and you are assessing a patient's Narcissism. The patient is talking about their recent travel. Return only float number between 1 and 6."
model_role = MODEL_ROLE_INT if scale_to_int else MODEL_ROLE_FLOAT

train_path = "../data/split/train.csv"
validate_path = "../data/split/validate.csv"


# Code

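Here we use a basic few-shot implementation. There is also an option to use dynamic few-shot prompting, which to my knowledge is not strictly needed in this context, as we have only one type of post. (Note: the cells below nevertheless pick the examples per input with a semantic-similarity example selector.)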

In [5]:
# Load the training split with pandas
df = pd.read_csv(train_path)

# Keep only the post text and the chosen score, rename both columns to the
# generic keys used by the prompt templates, and turn every row into a
# {'post': ..., 'narcissism': ...} dictionary.
example = (
    df[[post_type, narcism_type]]  #.iloc[0:3*number_of_shots]
    .rename(columns={post_type: 'post', narcism_type: 'narcissism'})
    .to_dict(orient='records')
)

example

[{'post': 'I wish I could travel 24/7 and get paid for it',
  'narcissism': 1.444},
 {'post': "Vacations are pricey these days but so worth it! I had the most amazing weekend at ABC resort. Everything about this place screams relaxation and luxury. I'm definitely going back next year. Would you like to come with me?",
  'narcissism': 3.889},
 {'post': 'I recently visited beautiful Stratford upon Avon as a pit-stop on my way to Minehead, Somerset. I made a point to leave my immediate surroundings and find the birthplace of Shakespeare. I found it interesting but ultimately over-commercialised.',
  'narcissism': 3.444},
 {'post': "I have just visited Marrakesh.The scenery is like being in Mars.The soil is do red and there's not a person around for miles. Then you will come across a shepherd all alone with his flock. It makes you wonder how he gets food.The transport there is mainly donkey and cart.",
  'narcissism': 3.667},
 {'post': "I travel a lot for work, and I get to see all sorts o

In [6]:
# TODO: later try Neo4jVector (from langchain_community.vectorstores) instead of Chroma; it seems to be an online database tool

# Build the selector that, for a given post, retrieves the most similar training examples.
example_selector = SemanticSimilarityExampleSelector.from_examples(
    # The list of examples available to select from (built from the training split above).
    example,
    # The embedding class used to produce embeddings, which are used to measure semantic similarity.
    OpenAIEmbeddings(),
    # The VectorStore class used to store the embeddings and run the similarity search.
    Chroma,
    # The number of examples to return per query.
    k=number_of_shots,
    # Key(s) of each example used for the similarity search (here only the post text).
    input_keys=["post"],
)

In [7]:
# Chat model used from here on; the model name comes from the parameters cell.
model = ChatOpenAI(
    model=model_used,
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
)


In [8]:
# Every retrieved example is rendered as a human turn (the post) followed by
# an AI turn (its score).
example_prompt = ChatPromptTemplate.from_messages([
    ("human", "{post}"),
    ("ai", "narcissism: {narcissism}"),
])

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    example_selector=example_selector,
    # Which variable(s) will be passed to the example selector.
    input_variables=["post"],
)

# Sanity check: show the examples retrieved for a sample post.
print(few_shot_prompt.invoke(input="Amazing trip!").to_messages())

[HumanMessage(content='Had the best time in Tenerife!'), AIMessage(content='narcissism: 1.778'), HumanMessage(content='Visiting Canada was amazing! So many wonderful landscapes and fabulous things to do. Grateful for the opportunity to share this with my family.'), AIMessage(content='narcissism: 2.333'), HumanMessage(content='Finally achieved my dream trip around the north of India, taking in the extraordinary landscape, culture, buildings and people that make up this wonderful country. The icing on the cake was doing this epic adventure by train, a truly memorable way to travel!'), AIMessage(content='narcissism: 1.889'), HumanMessage(content='Just had a great trip to Venice. Enjoyed the sites, the company and of course the food'), AIMessage(content='narcissism: 3.0'), HumanMessage(content='I travelled to Australia recently and it was a fantastic trip visiting family.'), AIMessage(content='narcissism: 2.222')]


#### Use train and validate dataset!!!

In [9]:
# Load the validation split with pandas
df_val = pd.read_csv(validate_path)

# Pick a single validation row (post text + score) to try the prompt on.
test = df_val[[post_type, narcism_type]].iloc[4]
input = test[post_type]  # the post text; this name is read by the following cells
print(test.iloc[1])      # the ground-truth score of that row
print(input)

1.667
I had the most amazing time on an tropical beach. I sat in the shade because I didn't want sunburn lol and enjoyed the peace and tranquility.


In [10]:
# Assemble the full chat prompt: a system instruction, the retrieved few-shot
# examples, and finally the post to be scored as the last human message.
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Return a narcissism number between 1 and 6."),
        few_shot_prompt,
        ("human", "{input}"),
    ]
)

final_prompt

ChatPromptTemplate(input_variables=['input', 'post'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Return a narcissism number between 1 and 6.')), FewShotChatMessagePromptTemplate(example_selector=SemanticSimilarityExampleSelector(vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x00000268075B5110>, k=5, example_keys=None, input_keys=['post'], vectorstore_kwargs=None), input_variables=['post'], example_prompt=ChatPromptTemplate(input_variables=['narcissism', 'post'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['post'], template='{post}')), AIMessagePromptTemplate(prompt=PromptTemplate(input_variables=['narcissism'], template='narcissism: {narcissism}'))])), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))])

In [11]:
# The same text is supplied twice: "post" feeds the example selector, while
# "input" fills the final human message of the prompt.
print(final_prompt.invoke({"input": input, "post": input}))

messages=[SystemMessage(content='Return a narcissism number between 1 and 6.'), HumanMessage(content='I recently travelled to my favourite place in the world, Jamaica, where the heat is 30 degrees, the beaches are white and the water is clear and all you have to worry about is getting up to get your next all inclusive frozen alcoholic beverage. 14 nights of pure bliss.'), AIMessage(content='narcissism: 3.778'), HumanMessage(content='Had the best time in Tenerife!'), AIMessage(content='narcissism: 1.778'), HumanMessage(content='I had such a great time travelling! Taking time to really reset my mind and just live in the moment was fantastic. Its so easy to get caught up in the day to day routine of life!'), AIMessage(content='narcissism: 3.333'), HumanMessage(content='Had the most amazing time in Tenerife. Made the best memories and enjoyed lots of Sun, sea and sand. Can’t wait to come back!'), AIMessage(content='narcissism: 3.222'), HumanMessage(content='Lovely holiday with my partner o

In [12]:
# Pipe the assembled prompt into the chat model and score the sampled post.
chain = final_prompt | model

# "post" drives the example retrieval, "input" is the message the model answers.
ai_message = chain.invoke({"input": input, "post": input})
ai_message


AIMessage(content='narcissism: 3.0', response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 316, 'total_tokens': 325}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-f8a231a6-d1dd-40b3-9707-fad171d6784f-0')

# Analyze the results

In [13]:
# Pull the first number out of the model reply.
# The decimal part is optional so that integer answers such as "narcissism: 3"
# are parsed as well instead of being treated as "no number found".
r = ai_message.content
match = re.search(r'\d+(?:\.\d+)?', r)
response = float(match.group()) if match else None
response

3.0

In [14]:
# Ground-truth score of the sampled row (second field of `test`), to compare with `response`.
test.iloc[1]

1.667

In [15]:
# One-element prediction / ground-truth lists for the single sampled row.
y_pred = [response]
y_true = [test.iloc[1]]

In [16]:
# Squared error of the single prediction collected above.
mse = mean_squared_error(y_true, y_pred)
mse

1.776889

# Implementation

In [24]:
# functions

# build a semantic-similarity example selector from a CSV of scored posts
def create_example_selector(path, post_type, narcism_type, scale_to_int = False, k = None):
    """Create a selector that retrieves the training examples most similar to a post.

    Args:
        path: Path of the CSV file holding the example posts and their scores.
        post_type: Name of the column containing the post text.
        narcism_type: Name of the column containing the score.
        scale_to_int: When True, every score is multiplied by 9 and rounded to
            the nearest integer before being stored as an example.
        k: Number of examples the selector returns per query. When omitted,
            the notebook-level ``number_of_shots`` setting is used.

    Returns:
        A ``SemanticSimilarityExampleSelector`` over the examples, each stored
        as ``{'post': ..., 'narcissism': ...}``.
    """
    df = pd.read_csv(path)

    # Rename the two selected columns to the generic keys used by the prompt
    # templates and turn every row into a dictionary.
    examples = (
        df[[post_type, narcism_type]]
        .rename(columns={post_type: 'post', narcism_type: 'narcissism'})
        .to_dict(orient='records')
    )

    if scale_to_int:
        for record in examples:
            # round() instead of int(): the scores are stored with three
            # decimals, so e.g. 1.444 * 9 == 12.996 must become 13, not 12.
            record['narcissism'] = int(round(record['narcissism'] * 9))

    if k is None:
        # Backward-compatible fallback to the global defined in the parameters cell.
        k = number_of_shots

    # NOTE(review): every call passes the bare Chroma class with default
    # settings; confirm that repeated calls in one kernel do not end up
    # sharing (and accumulating examples in) the same default collection.
    example_selector = SemanticSimilarityExampleSelector.from_examples(
        # This is the list of examples available to select from.
        examples,
        # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
        OpenAIEmbeddings(),
        # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
        Chroma,
        # This is the number of examples to produce.
        k=k,
        # Key(s) of each example used for the similarity search.
        input_keys=["post"],
    )

    return example_selector

# create a few shot prompt
def create_few_shot_prompt(example_selector):
    """Wrap an example selector in a few-shot chat prompt template.

    Args:
        example_selector: Selector that supplies the examples for a given post.

    Returns:
        A ``FewShotChatMessagePromptTemplate`` that renders each selected
        example as a human message (the post) followed by an AI message
        (its score).
    """
    # Template applied to every selected example.
    per_example_template = ChatPromptTemplate.from_messages(
        [("human", "{post}"), ("ai", "narcissism: {narcissism}")]
    )

    return FewShotChatMessagePromptTemplate(
        example_prompt=per_example_template,
        example_selector=example_selector,
        # Which variable(s) will be passed to the example selector.
        input_variables=["post"],
    )

# create a final prompt
def create_final_prompt(few_shot_prompt,model_role):
    """Combine the system role, the few-shot examples and the query slot.

    Args:
        few_shot_prompt: Few-shot prompt template providing the example messages.
        model_role: Text of the system message.

    Returns:
        A ``ChatPromptTemplate`` made of the system message, the few-shot
        examples and a final human message filled from ``{input}``.
    """
    message_templates = [
        ("system", model_role),
        few_shot_prompt,
        ("human", "{input}"),
    ]
    return ChatPromptTemplate.from_messages(message_templates)

# get float number from a string
def get_float(text):
    """Extract the first number found in ``text`` and return it as a float.

    Both decimal ("3.25") and integer ("13") notations are recognised; the
    latter matters when the model is asked to answer with an integer.

    Args:
        text: The model reply to parse.

    Returns:
        The first number in ``text`` as a float, or None when ``text`` is not a
        string or contains no number.
    """
    if not isinstance(text, str):
        return None

    # Digits with an optional fractional part.
    match = re.search(r'\d+(?:\.\d+)?', text)
    if match is None:
        return None
    return float(match.group())

# get the response
def get_response(final_prompt, model, input):
    """Run the prompt through the model and return the text of the reply.

    Args:
        final_prompt: Prompt template expecting the variables "input" and "post".
        model: Chat model the prompt is piped into.
        input: The post text; it is supplied for both prompt variables.

    Returns:
        The ``content`` attribute of the message returned by the chain.
    """
    payload = {"input": input, "post": input}
    return (final_prompt | model).invoke(payload).content

# get the mean squared error
def get_mse(y_pred, y_true):
    """Return the mean squared error between predictions and ground truth.

    Args:
        y_pred: Predicted values.
        y_true: Ground-truth values, in the same order as ``y_pred``.
    """
    return mean_squared_error(y_true=y_true, y_pred=y_pred)

Add Neptune experiment observation

In [27]:
# Run the functions: score every validation post and log the result to Neptune
run = neptune.init_run(project = os.getenv('NEPTUNE_PROJECT'),
                       api_token = os.getenv('NEPTUNE_API_TOKEN'),
                       source_files=["few_shot_test_RAG_crewai.ipynb"],
                       tags=["few-shot-RAG", narcism_type, post_type])

try:
    run["type"] = "Few-shot learning"
    params = {
        "model": model_used,
        "narc_type": narcism_type,
        "post_type": post_type,
        "prompt": model_role,
        "shots": number_of_shots,
        "scale_to_int": scale_to_int
    }
    run["model/parameters"] = params # Save the parameters

    y_pred = []
    y_true = []
    problems = []

    test_df = pd.read_csv(validate_path)
    testset = test_df[[post_type,narcism_type]]

    example_selector = create_example_selector(train_path, post_type, narcism_type, scale_to_int= scale_to_int)
    few_shot_prompt = create_few_shot_prompt(example_selector)
    # The prompt does not depend on the row, so it is built once outside the loop.
    final_prompt = create_final_prompt(few_shot_prompt,model_role)

    # Iterate over the whole validation set. (`test` is the single row sampled in
    # the exploratory cell above; using its shape would score only two rows.)
    for i in range(testset.shape[0]):
        input = testset.iloc[i]
        response_str = get_response(final_prompt, model, input.get(post_type))
        response = get_float(response_str)

        if response is not None: # Check if the model returned a number
            y_pred.append(response)
            if scale_to_int:
                # round() instead of int(): e.g. 1.444 * 9 == 12.996 must become 13, not 12.
                y_true.append(int(round(input.get(narcism_type)*9)))
            else:
                y_true.append(input.get(narcism_type))
        else: # Else save the prompt that caused the error
            row_to_add = {'post': input.get(post_type), 'post_type': post_type, 'model_role': model_role, 'date': pd.Timestamp.now()}
            problems.append(row_to_add)

    # Rows without a parsable number are excluded from the MSE, so record how many there were.
    run["unparsed_responses"] = len(problems)

    if y_pred:
        mse = get_mse(y_pred, y_true) # Calculate the mean squared error
        print(mse)
        run["mse"] = mse
    else:
        # Nothing could be parsed: there is no error to compute or log.
        mse = None
        print("No numeric responses were parsed; MSE not computed.")
finally:
    run.stop() # Stop the run even when a cell error interrupts the evaluation

NeptuneInvalidApiTokenException: 

----NeptuneInvalidApiTokenException------------------------------------------------

The provided API token is invalid.
Make sure you copied and provided your API token correctly.

You can get it or check if it is correct here:
    - https://app.neptune.ai/get_my_api_token

There are two options to add it:
    - specify it in your code
    - set it as an environment variable in your operating system.

CODE
Pass the token to the init_run() function via the api_token argument:
    neptune.init_run(project='WORKSPACE_NAME/PROJECT_NAME', api_token='YOUR_API_TOKEN')

ENVIRONMENT VARIABLE (Recommended option)
or export or set an environment variable depending on your operating system:

    Linux/Unix
    In your terminal run:
        export NEPTUNE_API_TOKEN="YOUR_API_TOKEN"

    Windows
    In your CMD run:
        set NEPTUNE_API_TOKEN="YOUR_API_TOKEN"

and skip the api_token argument of the init_run() function:
    neptune.init_run(project='WORKSPACE_NAME/PROJECT_NAME')

You may also want to check the following docs page:
    - https://docs.neptune.ai/setup/setting_api_token/

Need help?-> https://docs.neptune.ai/getting_help


In [68]:
# Make a dataframe of the problems (the posts for which no number could be parsed)
problems_df = pd.DataFrame(problems)
# Append to the shared log file; rows are written without a header line and without the index.
problems_df.to_csv("../data/responses/few_shot.csv", mode='a', index=False, header=False)

problems

[]