In [None]:
import json
import os

import pandas as pd
from openai import OpenAI

from src.llm_coder import deductive_coding
from src.vRA import RaLLM
from utils.krippendorff_alpha import krippendorff

In [None]:
# Configuration cell: load the inputs and set every option that is later
# passed to deductive_coding.

# Read data and codebook from CSV files
data = pd.read_csv("./data/data_example.csv")
codebook = pd.read_csv("./data/codebook_example.csv")
# Model name passed to deductive_coding
model = 'gpt-3.5-turbo'
# Number of in-context examples
number_of_example = 5
# Whether to include the context column in the example data
context = True
# Whether to use an NA label when none of the codes applies
na_label = False
# Language setting: 'eng', 'ch', or 'fr'
language = 'eng'
# Majority-voting mechanism: the number of voters
voter = 1
# Chain-of-thought
cot = False
# OpenAI Batch API
# https://platform.openai.com/docs/guides/batch/overview
batch = False
# Read the key from the LLM_API_KEY environment variable so the real secret
# never has to be saved inside the notebook. When the variable is unset the
# original placeholder string is used, so existing behavior is unchanged.
api_key = os.environ.get("LLM_API_KEY", "$LLM_API_KEY")


In [None]:
# For non-OpenAI models: the OpenAI client is pointed at the Moonshot base URL.
# The key is read from the MOONSHOT_API_KEY environment variable so it does not
# have to be stored in the notebook; the original placeholder string is the
# fallback when the variable is unset.
client = OpenAI(
    api_key = os.environ.get("MOONSHOT_API_KEY", "$MOONSHOT_API_KEY"),
    base_url = "https://api.moonshot.cn/v1",
)

In [None]:
# OpenAI-hosted models: build the client from the `api_key` variable.
# Running this cell replaces any `client` created by an earlier cell.
client = OpenAI(api_key=api_key)

In [None]:
# Run the coding step with the options chosen in the configuration cell.
# Returns a pair that is unpacked into `results` and `code_set`.
results, code_set = deductive_coding(
    data,
    codebook,
    codebook_format='codebook',
    number_of_example=number_of_example,
    context=context,
    na_label=na_label,
    language=language,
    model=model,
    voter=voter,
    cot=cot,
    client=client,
    batch=batch,
)

In [None]:
# Verification: agreement between the 'code' column (cast to str) and the
# 'results' column of `data`.
# NOTE(review): this reads data['results'], not the `results` variable —
# presumably an earlier step writes that column onto `data`; confirm.
kappa = RaLLM.cohens_kappa_measure(data['code'].astype(str), data['results'])
print("Cohen's Kappa: %.3f" % kappa)

alpha = RaLLM.krippendorff_alpha_measure(
    data['code'].astype(str),
    data['results'],
    code_set,
)
print("Krippendorff's Alpha: %.3f" % alpha)


In [None]:
# Write `results` to CSV (UTF-8 with BOM, no index column).
results.to_csv(
    path_or_buf="data_example_output.csv",
    index=False,
    encoding="utf_8_sig",
)

In [None]:
# Batch processing: write the batch requests to a JSONL file, upload it, and
# create a Batch API job.
# https://platform.openai.com/docs/guides/batch/overview
batch_name = 'batch_input'
batch_description = 'batch_coding'

# Only submit when batch mode is on. Otherwise `results` is presumably not a
# list of request dicts (TODO confirm against deductive_coding), and iterating
# it would write and upload meaningless lines.
if batch:
    # Create the output directory so the write below cannot fail on a fresh checkout.
    os.makedirs('./results', exist_ok=True)
    batch_input_path = './results/' + batch_name + '.jsonl'

    # One JSON document per line, as the JSONL format requires.
    with open(batch_input_path, 'w', encoding='utf-8') as file:
        for item in results:
            file.write(json.dumps(item) + '\n')

    client = OpenAI(api_key = api_key)

    # Upload inside a `with` block so the file handle is closed once the
    # request has completed (the original left it open).
    with open(batch_input_path, 'rb') as upload_file:
        batch_input_file = client.files.create(
            file=upload_file,
            purpose="batch"
        )

    batch_input_file_id = batch_input_file.id

    batch_meta = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": batch_description
        }
    )

In [None]:
# Processing batch results: parse the downloaded batch output file, pull out
# each response's message content, clean it, and store it on `data`.

# Path to the downloaded batch output JSONL file (placeholder — set before running)
file_path = 'path/to/results/jsonl'

# Read the JSONL file into a list of dictionaries.
# - explicit UTF-8 so non-ASCII content decodes the same on every platform
# - blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads
with open(file_path, 'r', encoding='utf-8') as file:
    data_source = [json.loads(line) for line in file if line.strip()]

# Extract the content and custom_id from each dictionary with the correct path.
# `columns=` keeps both columns present even when the file has no records,
# so the 'content' lookup below does not raise KeyError on empty input.
extracted_data = pd.DataFrame(
    [
        {
            'custom_id': item['custom_id'],
            'content': item['response']['body']['choices'][0]['message']['content']
        }
        for item in data_source
    ],
    columns=['custom_id', 'content'],
)
results = RaLLM.code_clean(extracted_data['content'], code_set)

# NOTE(review): the Batch API documentation says output lines may not come
# back in input order. The assignment below pairs rows by index, so confirm
# the ordering (e.g. via `custom_id`) before trusting the alignment.
data['results'] = pd.Series(results)

In [None]:
# Write `data` to CSV (UTF-8 with BOM, no index column).
data.to_csv(
    path_or_buf="data_example_output.csv",
    index=False,
    encoding="utf_8_sig",
)