### loading the questions

In [16]:
import os
import json
import pandas as pd

questions_paths = [f for f in os.listdir(os.getcwd()) if f.endswith('.json')]
df = ''
try:
    for path in questions_paths:
        with open(path, mode = 'r') as file:
            data = json.load(file)
            if isinstance(df, pd.DataFrame):
                df = pd.concat([df, pd.DataFrame(data)], axis = 0)
            else:
                df = pd.DataFrame(data)
except Exception as e:
    print(e)

In [17]:
print(f"the total number of questions is: {len(df)}")

the total number of questions is: 282


exporting the questions

In [18]:
df.to_csv('questionaire.csv', index = False)

### data modifications to the questions

In [20]:
import pandas as pd
questionaire = pd.read_csv('questionaire.csv')
questionaire.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   questions  282 non-null    object
 1   answers    282 non-null    object
dtypes: object(2)
memory usage: 4.5+ KB


replace the citations

In [21]:
import re
pattern = re.compile(r"\[cite:\s*\d+(?:\s*,\s*\d+)*\]")
questionaire['answers'] = questionaire['answers'].str.replace(pattern, '', regex = True)
questionaire.to_csv('questionaire (no citations).csv', index = False)


### answers from a model with no context

In [None]:
import pandas as pd
from groq import Client
import os

class NoContextModel:
    def __init__(self, api_key):
        self.api_key = api_key
    
        self.client = Client(
            api_key = self.api_key
        )
        
        self.conversation = [
            {
                "role" : "system",
                "content" : "you are a helpful AI assistant expert on Green House Gas Protocols and Environmental regulation"
            },
            {
                "role" : "assistant",
                "content" : "provide a concise answer to the user's question"
            }
        ]
    def generate_answer(self, user_input : str):
        self.conversation.append(
            {
                "role" : "user",
                "content" : user_input
            }
        )
        try:
            chat_completion = self.client.chat.completions.create(
                model = 'gemma2-9b-it',
                messages = self.conversation,
                max_completion_tokens = 200,
                temperature = 0.5
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            print(e)
            return ''
        
test_model = NoContextModel(
    api_key = os.getenv("GROQ_API_KEY")
)

questionaire = pd.read_csv('questionaire (no citations).csv')

questionaire['answers (no context)'] = questionaire['questions'].apply(lambda text: test_model.generate_answer(text))
 

Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jn7qyhdxfbk9rkkm0eqm86pf` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 495775, Requested 4870. Please try again in 1m51.29s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jn7qyhdxfbk9rkkm0eqm86pf` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 495774, Requested 4892. Please try again in 1m54.9836s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error code: 429 - {'error': {'message': 'Rate limit reached for model `gemma2-9b-it` in organization `org_01jn7qyhdxfbk9rkkm0eqm86pf` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used

In [33]:
questionaire.to_csv('partial responses questionaire.csv', index = False)

In [1]:
import pandas as pd
questionaire = pd.read_csv('partial responses questionaire.csv')

In [5]:
from groq import Client
import os
class NoContextModel:
    def __init__(self, api_key):
        self.api_key = api_key
    
        self.client = Client(
            api_key = self.api_key
        )
        
        self.conversation = [
            {
                "role" : "system",
                "content" : "you are a helpful AI assistant expert on Green House Gas Protocols and Environmental regulation"
            },
            {
                "role" : "assistant",
                "content" : "provide a concise answer to the user's question"
            }
        ]
    def generate_answer(self, user_input : str):
        self.conversation.append(
            {
                "role" : "user",
                "content" : user_input
            }
        )
        try:
            chat_completion = self.client.chat.completions.create(
                model = 'gemma2-9b-it',
                messages = self.conversation,
                max_completion_tokens = 200,
                temperature = 0.5
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            print(e)
            return ''
        
test_model = NoContextModel(
    api_key = os.getenv("GROQ_API_KEY")
)


In [30]:
mask = questionaire['answers (no context)'].isnull()
questionaire.loc[mask, 'answers (no context)'] = questionaire.loc[mask, 'questions'].apply(lambda text: test_model.generate_answer(text))

In [32]:
mask = questionaire['answers (no context)'].isnull()
questionaire.to_csv('questionaire (full no context response).csv', index=False)

### generating embeddings from the data

In [3]:
import pandas as pd
questionaire = pd.read_csv('questionaire (full no context response).csv')
questionaire.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   questions             282 non-null    object
 1   answers               282 non-null    object
 2   answers (no context)  257 non-null    object
dtypes: object(3)
memory usage: 6.7+ KB


In [6]:
mask = questionaire['answers (no context)'].isnull()
questionaire.loc[mask, 'answers (no context)'] = questionaire.loc[mask, 'questions'].apply(lambda text: test_model.generate_answer(text))

In [8]:
questionaire.to_csv('questionaire (part 1).csv', index = False)

In [13]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

sample_text_list = ['what is the weather like today?', 'how much time it takes to travel from sydney to colombia?']

sample_embeddings = model.encode(sample_text_list, convert_to_numpy = True).tolist()

sample_embeddings


[[-0.042056191712617874,
  -0.015622524544596672,
  0.061681073158979416,
  0.03758372366428375,
  0.08791098743677139,
  -0.013381442055106163,
  0.08970431238412857,
  -0.019505010917782784,
  -0.05775909870862961,
  -0.007809902541339397,
  -0.023745311424136162,
  0.04512391611933708,
  0.002269440097734332,
  -0.027883771806955338,
  0.06320180743932724,
  0.004270318895578384,
  0.06568528711795807,
  -0.04171053692698479,
  0.0031763582956045866,
  -0.017916424199938774,
  -0.13885195553302765,
  -0.003921222873032093,
  -0.11682962626218796,
  0.07509056478738785,
  0.03503072261810303,
  0.12869425117969513,
  -0.05015179514884949,
  0.036415696144104004,
  -0.01216441486030817,
  0.00784334447234869,
  -0.05434086173772812,
  0.023159800097346306,
  0.07504690438508987,
  0.005582111421972513,
  -0.11912225931882858,
  -0.009309345856308937,
  0.057026538997888565,
  -0.07828295230865479,
  -0.06283681839704514,
  -0.0016864400822669268,
  0.04277515038847923,
  -0.0756142958

In [16]:
questionaire['embeddings - answers'] = model.encode(questionaire['answers'].to_list(), convert_to_numpy = True).tolist()
questionaire.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   questions             282 non-null    object
 1   answers               282 non-null    object
 2   answers (no context)  282 non-null    object
 3   embeddings - answers  282 non-null    object
dtypes: object(4)
memory usage: 8.9+ KB


In [20]:
questionaire['embeddings - answers (no context)'] = model.encode(questionaire['answers (no context)'].to_list(), convert_to_numpy = True).tolist()
questionaire.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 5 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   questions                          282 non-null    object
 1   answers                            282 non-null    object
 2   answers (no context)               282 non-null    object
 3   embeddings - answers               282 non-null    object
 4   embeddings - answers (no context)  282 non-null    object
dtypes: object(5)
memory usage: 11.1+ KB


In [21]:
questionaire.to_csv('questionaire (answers with embeddings).csv', index = False)


### calculate distances

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

embedding1 = np.array([questionaire['embeddings - answers'][0]])
embedding2 = np.array([questionaire['embeddings - answers (no context)'][0]])
distance = cosine_similarity(embedding1, embedding2)[0][0]
print(f"the distance between embeding 1 and 2 is {distance:.3f}")

the distance between embeding 1 and 2 is 0.664


In [30]:
def calculate_similarity(
    vector1 : list,
    vector2 : list
):
    v1 = np.array(vector1).reshape(1, -1)
    v2 = np.array(vector2).reshape(1, -1)
    return cosine_similarity(v1, v2)[0][0]

questionaire['distance - (truth vs no truth)'] = questionaire.apply(lambda row: calculate_similarity(row['embeddings - answers'], row['embeddings - answers (no context)']), axis = 1)
questionaire.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 6 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   questions                          282 non-null    object 
 1   answers                            282 non-null    object 
 2   answers (no context)               282 non-null    object 
 3   embeddings - answers               282 non-null    object 
 4   embeddings - answers (no context)  282 non-null    object 
 5   distance - (truth vs no truth)     282 non-null    float64
dtypes: float64(1), object(5)
memory usage: 13.3+ KB
