Before starting to work with Groq, register on the Groq website and obtain your own unique API key. The notebook imports it as `KEY` from a local `groq_api` module.

In [1]:
from groq_api import KEY
from groq import Groq
import numpy as np
import pandas as pd

from tqdm import tqdm


# Groq API client used for every request in this notebook; KEY is imported from the local groq_api module.
CLIENT = Groq(api_key=KEY)
# Tab-separated training file that is loaded into the dataframe below.
PATH = 'data/power/power-ua-train.tsv'


Read the TSV file as a pandas DataFrame, using its first column as the index, and add empty "prediction", "reasoning" and "labeled_prediction" columns.
Lastly, shuffle the data (by sampling all the rows in random order), setting a random state for reproducibility.

In [2]:
# Load the tab-separated file; its first column becomes the dataframe index.
data = pd.read_csv(PATH, sep='\t', index_col=0)
# Result columns start out empty (NaN) and are filled in row by row later on.
data['prediction'] = np.nan
data['reasoning'] = np.nan
data['labeled_prediction'] = np.nan
# Assigning np.nan creates float64 columns. 'prediction' and 'reasoning' will
# hold strings, so cast them to object dtype up front: writing a string into a
# float64 column is deprecated in recent pandas and will eventually raise.
# 'labeled_prediction' stays float because it only ever receives 0/1.
data = data.astype({'prediction': object, 'reasoning': object})
# Shuffle all rows; the fixed random_state keeps the order reproducible.
data = data.sample(frac=1, random_state=42)
data.head()


Unnamed: 0_level_0,speaker,sex,text,text_en,label,prediction,reasoning,labeled_prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ua05796,d18023ad9b6df26ee266bb01fc4c822a,M,"Олександр Шевченко, фракція ""<PARTY>"". Ратифік...","Alexander Shevchenko, Freedom faction. Ratific...",1,,,
ua07362,bea4118a3082afea430ffe03d4343b76,M,"Шановний пане Голово, шановні колеги, дозвольт...","Dear Mr. Head, honourable colleagues, let me i...",0,,,
ua10560,b07c8d09c03a6a3222057c300c84e9fe,M,"Олег Купрієнко, <PARTY>. А ще порядок в Україн...","Oleg Kuprienko, Oleg Laska Radical Party. And ...",1,,,
ua10327,93a1d11f680ed07b734319a5fae11cee,F,"По-перше, що стосується військового госпіталю,...","First of all, it's about a military hospital i...",0,,,
ua15066,e7d36da025052d2213a757d55deeb1d1,M,Шановна президія! Шановні народні депутати! Ре...,Honorable Predestination! Dear National Deputi...,0,,,


In [3]:
# Number of rows (speeches) in the loaded dataframe.
print('Size of data: ', len(data))

Size of data:  11324


Here I define the prompt and write a function that sends API calls to Groq.


In [4]:
# System prompt sent with every request. It asks the model to reason first and
# then answer with a JSON object holding "reasoning" and "prediction" fields.
# Written as adjacent string literals, which Python joins into one single-line string.
PROMPT = (
    "Help me identify whether this speech was held by coalition or opposition. "
    "First reason about the text and try to find helpful cues. "
    "Then provide a final answer. "
    "Formulate your response in JSON using the following format: "
    '{"reasoning": "...", "prediction": "coalition or opposition"}. '
    "Make sure to return proper JSON, e.g. don't add superfluous single quotation marks in the end. "
    "Don't use any other words in the response, except for "
    '"coalition" or "opposition".'
)

In [6]:
import json


def get_response(text, model="llama3-70b-8192", max_attempts=5):
    """Send one speech to the Groq chat API and return the parsed JSON answer.

    The module-level PROMPT is sent as the system message and ``text`` as the
    user message. The request is retried up to ``max_attempts`` times; attempt
    number ``i`` (0-based) uses ``temperature=0.2 * i``, so a retry after a
    failed JSON generation samples a different completion.

    Args:
        text: The speech to classify.
        model: Model identifier passed to the API.
        max_attempts: Maximum number of requests made before giving up.

    Returns:
        The decoded JSON object as a dict containing at least the keys
        "reasoning" and "prediction", or None if every attempt failed
        (API error, undecodable JSON, or JSON missing a required key).
        The value of "prediction" is not validated here.
    """
    messages = [
        {
            "role": "system",
            "content": PROMPT
        },
        {
            "role": "user",
            "content": text,
        },
    ]
    required_keys = {"reasoning", "prediction"}
    for attempt in range(max_attempts):
        try:
            chat_completion = CLIENT.chat.completions.create(
                messages=messages,
                model=model,
                response_format={"type": "json_object"},
                # Raise the temperature on every retry to get a different completion.
                temperature=0.2 * attempt,
            )
            response_content = chat_completion.choices[0].message.content
            response = json.loads(response_content)
            # Valid JSON is not enough: callers index both keys, so an answer
            # without them is treated like any other failed attempt and retried.
            if not isinstance(response, dict) or not required_keys <= response.keys():
                raise ValueError(f"Response is missing required keys: {response!r}")
            return response
        except Exception as e:
            # Deliberate best effort: report the problem and try again.
            print(e)
    return None

Loop over 20% of the data, sending each English speech to Llama 3, and track the running accuracy of the predictions in the progress bar.

In [None]:
NUM_SAMPLES = int(data.shape[0] * 0.2)  # 20% of the data
VALID_PREDICTIONS = ('coalition', 'opposition')
error_counter = 0      # rows for which no usable answer was obtained
evaluated_counter = 0  # rows that received a valid prediction
correct_counter = 0    # valid predictions that match the row's label

for index, entry in (tqdm_bar := tqdm(data.iloc[:NUM_SAMPLES].iterrows(), total=NUM_SAMPLES)):
    response = get_response(entry['text_en'])

    # Use .get() so that a reply without the expected keys is counted as an
    # error instead of raising KeyError and aborting the whole (long) run.
    prediction = response.get('prediction') if isinstance(response, dict) else None
    if isinstance(prediction, str):
        # Tolerate answers such as "Opposition" or " coalition ".
        prediction = prediction.strip().lower()
    if prediction not in VALID_PREDICTIONS:
        error_counter += 1
        continue

    # Numeric form of the prediction, compared with the 'label' column: 1 means opposition.
    labeled_prediction = int(prediction == 'opposition')
    data.loc[index, 'prediction'] = prediction
    data.loc[index, 'reasoning'] = response.get('reasoning')
    data.loc[index, 'labeled_prediction'] = labeled_prediction

    # Keep running totals rather than re-scanning the dataframe on every
    # iteration; the ratio is the accuracy over all rows answered so far.
    evaluated_counter += 1
    correct_counter += int(labeled_prediction == entry['label'])
    tqdm_bar.set_description(f"Accuracy: {correct_counter / evaluated_counter:.2f}")
    

In [13]:
# Keep only the rows that received a model answer, i.e. both result columns are filled.
has_answer = data['prediction'].notna() & data['labeled_prediction'].notna()
data_subset = data[has_answer]
# Relative frequency of matches (True) and mismatches (False) between the
# numeric prediction and the 'label' column.
distribution = data_subset['labeled_prediction'].eq(data_subset['label']).value_counts(normalize=True)
print(f"Accuracy: {distribution.get(True, 0):.2f}")
print('Errors:', error_counter)

Accuracy: 0.64
Errors: 14


Save results to the folder "results".

In [None]:
import os 

FOLDER_PATH = 'results'

# exist_ok=True creates the folder when missing and is a no-op otherwise,
# which avoids the separate existence check.
os.makedirs(FOLDER_PATH, exist_ok=True)

# Build the output path from FOLDER_PATH so the folder name is defined in one place.
data.to_csv(os.path.join(FOLDER_PATH, 'power-ua-results-en.tsv'), sep='\t')