In [9]:
from openai import AsyncOpenAI

# Client for OpenRouter; used as the default client of the LLM wrapper defined below.
openrouter_client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="",  # API key placeholder: supply your own key, do not commit a real one
    timeout=30,
)

# Client for the Glama gateway's OpenAI-compatible endpoint.
glama_client = AsyncOpenAI(
    base_url="https://glama.ai/api/gateway/openai/v1",
    api_key="",  # API key placeholder: supply your own key, do not commit a real one
)

import asyncio
import random

from copy import deepcopy

# Short alias -> full model identifier sent to the API.
# Insertion order is significant: models_key and models below are derived from it.
models_dict = {
    'gemini-2.0': 'gemini-2.0-flash-thinking-exp-01-21',
    'llama-3.3': 'meta-llama/llama-3.3-70b-instruct:free',
    'qwen-2.5': 'qwen/qwen-2.5-72b-instruct:free',
    'mistral-nemo': 'mistralai/mistral-nemo:free',
    'moonlight-16b-a3b': 'moonshotai/moonlight-16b-a3b-instruct:free',
    'gemma-3-27b': 'google/gemma-3-27b-it:free',
    'deephermes-3-8b': 'nousresearch/deephermes-3-llama-3-8b-preview:free',
    'ds-distill-llama': 'deepseek/deepseek-r1-distill-llama-70b:free',
}

# Parallel list views of the mapping (aliases and full identifiers, same order).
models_key = list(models_dict.keys())
models = list(models_dict.values())

class LLM:
    """Async wrapper around an OpenAI-compatible chat client with an in-memory cache.

    Each instance is bound to one model (``base_model``) and one client. Replies to
    deterministic requests (``temperature == 0``) are cached per instance when
    ``use_cache`` is true. Every call sends only the messages it is given; no
    conversation history is kept between calls.
    """

    def __init__(self, use_cache=True, base_model=None, local_client=openrouter_client):
        """Create a wrapper.

        Args:
            use_cache: whether deterministic replies may be served from / stored in the cache.
            base_model: an alias from ``models_dict`` or a full model identifier.
            local_client: client exposing ``chat.completions.create``.
        """
        self.reset(use_cache, base_model)
        # Held by chat_serial so that only one of its requests runs at a time per instance.
        self.lock = asyncio.Lock()
        self.local_client = local_client

    def reset(self, use_cache=True, base_model=None):
        """Drop all cached replies and (re)set the cache flag and the model."""
        self.cache = {}
        self.use_cache = use_cache
        self.base_model = base_model

    async def _chat_base(self, messages, temperature=0, top_p=1, max_tokens=4096):
        """Send one chat-completion request and return the reply texts.

        Args:
            messages: a list of chat messages, or a plain string that is wrapped
                into a single user message.
            temperature, top_p, max_tokens: forwarded to the API.

        Returns:
            A list with the ``message.content`` of every returned choice.

        Raises:
            ValueError: if no model is configured on this instance.
            Exception: whatever the client raises is printed and re-raised.
        """
        if isinstance(messages, str):
            messages = [{"role": "user", "content": messages}]

        # Aliases are translated; unknown names are passed through unchanged.
        model = models_dict.get(self.base_model, self.base_model)
        if model is None:
            raise ValueError("No model configured: pass base_model to LLM(...) or reset(...)")

        # Only deterministic requests are cacheable. The key also covers the
        # generation limits so that, e.g., a reply truncated by a small
        # max_tokens is never served to a call that asked for a longer one.
        cacheable = bool(self.use_cache) and temperature == 0
        cache_id = (str(messages), model, top_p, max_tokens)

        if cacheable and cache_id in self.cache:
            return deepcopy(self.cache[cache_id])

        try:
            response = await self.local_client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
                timeout=60
            )
            contents = [choice.message.content for choice in response.choices]
        except Exception as e:
            print(f"Chat error: {str(e)}")
            raise

        if cacheable:
            # Sampled replies (temperature != 0) are deliberately NOT stored:
            # they would otherwise be returned later to deterministic calls.
            self.cache[cache_id] = contents
        # Hand out a copy so callers cannot mutate the cached list.
        return deepcopy(contents)

    async def chat(self, messages, temperature=0, top_p=1, max_tokens=4096):
        """Send a request without taking the instance lock (calls may overlap)."""
        return await self._chat_base(messages, temperature, top_p, max_tokens)

    async def chat_serial(self, messages, temperature=0, top_p=1, max_tokens=4096):
        """Send a request while holding the instance lock, so calls run one at a time."""
        async with self.lock:
            return await self._chat_base(
                messages,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens
            )

In [11]:
# One LLM wrapper per model alias (aliases are resolved through models_dict).
# Gemini is the only one given the Glama client explicitly; the others use the
# constructor's default client.
gemini_llm = LLM(local_client=glama_client, base_model='gemini-2.0')
llama_llm, qwen_llm, mistral_llm, moonlight_llm = (
    LLM(base_model=alias)
    for alias in ('llama-3.3', 'qwen-2.5', 'mistral-nemo', 'moonlight-16b-a3b')
)
gemma_llm, deephermes_llm, ds_distill_llama_llm = (
    LLM(base_model=alias)
    for alias in ('gemma-3-27b', 'deephermes-3-8b', 'ds-distill-llama')
)

In [None]:
import pandas as pd
from tqdm.auto import tqdm
from openpyxl import load_workbook

In [14]:
async def initialize_llm(llm, handshake_prompt):
    """Send ``handshake_prompt`` to ``llm`` and report whether it answered "yes".

    The first reply is printed, then lower-cased, stripped of surrounding
    whitespace and trailing dots, and accepted only if it equals "yes" or "y".

    Returns:
        True on an accepted reply; False on an empty reply list, any other
        answer, or any exception (which is printed, not raised).
    """
    try:
        replies = await llm.chat(handshake_prompt)
        if not replies:
            return False

        first_reply = replies[0]
        print(f"Model initialization response: {first_reply}")
        normalized = first_reply.lower().strip().rstrip('.')
        return normalized in ("yes", "y")
    except Exception as err:
        print(f"Initialization error: {str(err)}")
        return False

In [15]:
def results_to_original_df(excel_file, results_df, sheet_name):
    """Store ``results_df`` in ``excel_file`` as sheet ``sheet_name`` and read it back.

    An existing sheet with the same name is removed first, then the frame is
    appended to the workbook as a new sheet (without the index).

    Args:
        excel_file: path of an existing .xlsx workbook.
        results_df: DataFrame to store.
        sheet_name: name of the sheet to (re)create.

    Returns:
        The freshly written sheet, loaded with ``pd.read_excel``.

    Raises:
        ValueError: if ``results_df`` is None; the workbook is not touched.
    """
    # Validate BEFORE modifying the file: a failed run can hand us None, and
    # deleting the previous sheet first would lose the old results for nothing.
    if results_df is None:
        raise ValueError(
            f"No results to write to sheet '{sheet_name}'; the workbook was left untouched"
        )

    book = load_workbook(excel_file)
    try:
        if sheet_name in book.sheetnames:
            del book[sheet_name]
            # Rewrite the file only when something actually changed.
            book.save(excel_file)
    finally:
        book.close()

    with pd.ExcelWriter(excel_file, engine='openpyxl', mode='a') as writer:
        results_df.to_excel(writer, sheet_name=sheet_name, index=False)

    return pd.read_excel(excel_file, sheet_name=sheet_name)

## 1 эксперимент

In [7]:
# Experiment 1 workbook; the stimuli are read from its 'base sheet' sheet.
exp1_excel = 'exp1_dataset.xlsx'
exp1_df = pd.read_excel(exp1_excel, sheet_name = 'base sheet')
# Sorted distinct values of the 'Item' column (each value identifies one group of sentences).
all_items = sorted(exp1_df['Item'].unique())

In [8]:
# Handshake prompt for experiment 1. It is sent once through initialize_llm,
# which accepts the model only if the reply is "yes"/"y".
# NOTE(review): every LLM call sends just the messages it is given, so these
# rules are not part of the later rating requests.
HANDSHAKE_PROMPT_1 = """
Hi! I want you to help me with my experiment.
I will send groups of English sentences. For each sentence, rate its naturalness from 1 (completely unnatural) to 7 (completely natural).

RULES:
1. Return ONLY a comma-separated list of numbers (e.g., "5,3,6")
2. No explanations, no formatting, just numbers
3. Maintain exact order of input sentences

Do you understand the instructions? Answer strictly "yes" or "no".
"""

In [9]:
def _parse_exp1_ratings(raw_response):
    """Split a comma-separated reply into ints in 1..7; anything else becomes None."""
    if not raw_response:
        return []

    ratings = []
    for part in raw_response.replace(" ", "").split(","):
        try:
            value = int(part)
        except (ValueError, TypeError):
            ratings.append(None)
        else:
            ratings.append(value if 1 <= value <= 7 else None)
    return ratings


async def process_exp1_dataset(df, llm):
    """Ask ``llm`` to rate every sentence group of ``df`` and collect the ratings.

    Rows of ``df`` are grouped by the 'Item' column (groups are visited in sorted
    order); each group's 'Sentence' values are sent in one prompt through
    ``llm.chat_serial`` and the comma-separated reply is parsed into ratings.

    Args:
        df: frame with 'Item' and 'Sentence' columns.
        llm: object with an async ``chat_serial(prompt)`` returning a list of reply texts.

    Returns:
        A DataFrame with columns 'Item', 'Sentence', 'Rating' holding exactly one
        row per input sentence (Rating is missing where the reply had no valid
        number for it), or None if processing a group raised an exception.
    """
    columns = ['Item', 'Sentence', 'Rating']
    collected_rows = []

    # Group by the frame that was actually passed in, not by a notebook global.
    items = sorted(df['Item'].unique())

    for item in tqdm(items, desc="Processing groups"):
        sentences = df[df['Item'] == item]['Sentence'].tolist()

        # Ask for as many numbers as there are sentences; for a group of six the
        # prompt is identical to the previously hard-coded one.
        expected = len(sentences)
        example = ",".join("536247"[k % 6] for k in range(expected))

        prompt = f"""Rate each sentence's naturalness (1-7). Return EXACTLY {expected} numbers separated by commas, like: {example}

        Sentences:
        """ + "\n".join(f"{i+1}. {s}" for i, s in enumerate(sentences))

        try:
            response = await llm.chat_serial(prompt)
            raw_response = response[0] if response else ""
            print(f"\nRAW RESPONSE:\n{raw_response}\n")

            # Parse the reply.
            ratings = _parse_exp1_ratings(raw_response)
            if len(ratings) != expected:
                print(f"Warning: item {item}: expected {expected} ratings, parsed {len(ratings)}")

            # Keep one row per sentence: pad a short reply with None instead of
            # silently dropping the unrated sentences, and ignore surplus numbers.
            ratings = (ratings + [None] * expected)[:expected]

            collected_rows.extend(
                {'Item': item, 'Sentence': sent, 'Rating': rating}
                for sent, rating in zip(sentences, ratings)
            )

        except Exception as e:
            print(f"Error in item {item}: {str(e)}")
            return None

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(collected_rows, columns=columns)

In [None]:
# Handshake: stop here unless the model acknowledges the instructions.
if not await initialize_llm(ds_distill_llama_llm, HANDSHAKE_PROMPT_1):
    raise ValueError("Model initialization failed")
print("Model ready")

In [None]:
# Run experiment 1 with the selected model and show the collected ratings.
# NOTE: process_exp1_dataset returns None when a group fails with an exception.
results_exp1 = await process_exp1_dataset(exp1_df, ds_distill_llama_llm)
display(results_exp1)

In [None]:
# Store the ratings in the experiment 1 workbook under a sheet named after the model.
results_to_original_df(exp1_excel, results_exp1, 'ds_distill_llama')

## 2 эксперимент

In [35]:
# Experiment 2 workbook; the stimuli are read from its 'base sheet' sheet.
exp2_excel = 'exp2_dataset.xlsx'
exp2_df = pd.read_excel(exp2_excel, sheet_name = 'base sheet')

In [36]:
# Handshake prompt for experiment 2 (four context sentences plus one sentence to
# evaluate). It is sent once through initialize_llm, which accepts the model only
# if the reply is "yes"/"y".
# NOTE(review): the per-row prompt built in process_batch_exp2 inserts the last
# sentence without the "Sentence to evaluate:" label described here, and every
# LLM call sends just the messages it is given, so this text is not part of the
# later requests.
HANDSHAKE_PROMPT_2 = """
Hi! I want you to participate in my experiment.

TASK STRUCTURE:
1. I will send you dialogues in the following format:
   <Dialogue N>
   Context: [4 context sentences]
   Sentence to evaluate: [last sentence]
   Questions:
   1. [question 1]
   2. [question 2]
   </Dialogue N>

2. For each dialogue, respond with answers for both question 1 and question 2.

Do you understand the instructions? Answer strictly "yes" or "no".
"""

In [37]:
async def process_batch_exp2(batch, llm, results_df):
    """Query ``llm`` once per row of ``batch`` and append the raw replies to ``results_df``.

    For each row a dialogue prompt is built from sentence1..sentence5 and the two
    questions, and sent through ``llm.chat_serial``. The first reply text is stored
    in the 'raw response' column; 'Q1_answer' and 'Q2_answer' are left as None.
    A row whose request raises is reported and skipped. The function pauses
    2 seconds after every row.

    Returns:
        ``results_df`` concatenated with the rows gathered for this batch.
    """
    # Output columns in order; the two answer columns are placeholders.
    placeholder_columns = ('Q1_answer', 'Q2_answer')
    record_columns = (
        'Group',
        'question1', 'Q1_answer', 'question2', 'Q2_answer',
        'sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5',
        'Item', 'Condition',
    )
    gathered = []

    for dialogue_no, (_, row) in enumerate(batch.iterrows(), start=1):
        context_line = f"Context: {row['sentence1']} {row['sentence2']} {row['sentence3']} {row['sentence4']}"
        target_sentence = row['sentence5']
        q1_line = f"Question 1: {row['question1']}"
        q2_line = f"Question 2: {row['question2']}"

        prompt = f"""
            <Dialogue {dialogue_no}>
            {context_line}
            {target_sentence}
            Questions for you:
            1. {q1_line}
            Please provide the score only from 1 (completely unnatural) to 7 (completely natural). Do not send any comments or reasoning.
            Next question:
            2. {q2_line}
            Please provide the score only from 1 (absolutely no) to 7 (absolutely yes). Do not send any comments or reasoning.
            </Dialogue {dialogue_no}>
            """

        try:
            replies = await llm.chat_serial(prompt)
            raw_response = replies[0] if replies else ""
            print(f"\nRAW RESPONSE for item {row['Item']}:\n{raw_response}\n")

            record = {
                column: (None if column in placeholder_columns else row[column])
                for column in record_columns
            }
            record['raw response'] = raw_response
            gathered.append(record)

        except Exception as err:
            print(f"Error processing item {row['Item']}: {str(err)}")

        await asyncio.sleep(2)

    return pd.concat([results_df, pd.DataFrame(gathered)], ignore_index=True)

In [38]:
async def process_all_batches_exp2(df, llm, batch_size=3, pause_range=(10, 20)):
    """Run experiment 2 over ``df`` in consecutive batches and collect all results.

    Args:
        df: frame with the experiment 2 stimuli.
        llm: model wrapper passed on to ``process_batch_exp2``.
        batch_size: number of rows per batch (default 3).
        pause_range: inclusive ``(low, high)`` bounds, in whole seconds, of the
            random pause inserted between two batches (default 10..20).

    Returns:
        The accumulated results DataFrame.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    results_df = pd.DataFrame(columns=[
        'Group',
        'question1', 'Q1_answer', 'question2', 'Q2_answer',
        'sentence1', 'sentence2', 'sentence3', 'sentence4',
        'sentence5', 'Item', 'Condition', 'raw response'
    ])

    total_rows = len(df)
    for i in tqdm(range(0, total_rows, batch_size), desc="Processing batches"):
        batch = df.iloc[i:i+batch_size]
        results_df = await process_batch_exp2(batch, llm, results_df)

        # Pause only BETWEEN batches; waiting after the last one is wasted time.
        if i + batch_size < total_rows:
            await asyncio.sleep(random.randint(*pause_range))

    return results_df

In [None]:
# Handshake: stop here unless the model acknowledges the instructions.
if not await initialize_llm(moonlight_llm, HANDSHAKE_PROMPT_2):
    raise ValueError("Model initialization failed")
print("Model ready")

In [None]:
# Run experiment 2 with the selected model and show the collected raw responses.
results_exp2 = await process_all_batches_exp2(exp2_df, moonlight_llm)
display(results_exp2)

In [None]:
# Store the results in the experiment 2 workbook under a sheet named after the model.
results_to_original_df(exp2_excel, results_exp2, 'moonlight')

## 3 эксперимент

In [16]:
# Experiment 3 workbook; the stimuli are read from its 'base sheet' sheet.
exp3_excel = 'exp3_dataset.xlsx'
exp3_df = pd.read_excel(exp3_excel, sheet_name = 'base sheet')

In [17]:
# Handshake prompt for experiment 3 (three context sentences plus one sentence to
# evaluate). It is sent once through initialize_llm, which accepts the model only
# if the reply is "yes"/"y".
# NOTE(review): the per-row prompt built in process_batch_exp3 inserts the last
# sentence without the "Sentence to evaluate:" label described here, and every
# LLM call sends just the messages it is given, so this text is not part of the
# later requests.
HANDSHAKE_PROMPT_3 = """
Hi! I want you to participate in my experiment.

TASK STRUCTURE:
1. I will send you dialogues in the following format:
   <Dialogue N>
   Context: [3 context sentences]
   Sentence to evaluate: [last sentence]
   Questions:
   1. [question 1]
   2. [question 2]
   </Dialogue N>

2. For each dialogue, respond with answers for both question 1 and question 2.

Do you understand the instructions? Answer strictly "yes" or "no".
"""

In [18]:
async def process_batch_exp3(batch, llm, results_df):
    """Query ``llm`` once per row of ``batch`` (with one retry) and append the raw replies.

    For each row a dialogue prompt is built from S1..S4 and the two questions and
    sent through ``llm.chat_serial``. A failed request is retried once after a
    1-second wait; if the retry fails too, the row is reported and skipped.
    The first reply text is stored in 'raw response'; 'Q1_answer' and 'Q2_answer'
    are left as None. The function pauses 2 seconds after every row, whether
    the request succeeded or not.

    Returns:
        ``results_df`` concatenated with the rows gathered for this batch.
    """
    new_rows = []

    for i, (_, row) in enumerate(batch.iterrows()):
        context = f"Context: {row['S1']} {row['S2']} {row['S3']}"
        response = row['S4']
        question1 = f"Question 1: {row['Q1']}"
        question2 = f"Question 2: {row['Q2']}"

        prompt = f"""
            <Dialogue {i+1}>
            {context}
            {response}
            Questions for you:
            1. {question1}
            Please provide the score only from 1 (absolutely no) to 7 (absolutely yes) without any comments or reasoning
            2. {question2}
            Please provide the score only from 1 (absolutely no) to 7 (absolutely yes) without any comments or reasoning
            </Dialogue {i+1}>
            """

        for attempt in range(2):
            try:
                llm_response = await llm.chat_serial(prompt)
                raw_response = llm_response[0] if llm_response else ""
                print(f"\nRAW RESPONSE for item {row['item']}:\n{raw_response}\n")

                new_rows.append({
                    'item': row['item'],
                    'condition': row['condition'],
                    'Q1': row['Q1'],
                    'Q1_answer': None,
                    'Q2': row['Q2'],
                    'Q2_answer': None,
                    'raw response': raw_response,
                    'S1': row['S1'],
                    'S2': row['S2'],
                    'S3': row['S3'],
                    'S4': row['S4'],
                })
                break

            except Exception as e:
                if attempt == 0:
                    print(f"Error on first attempt for item {row['item']}: {str(e)}. Retrying...")
                    await asyncio.sleep(1)
                else:
                    print(f"Second attempt failed for item {row['item']}. Skipping.")

        # Pace the requests. This pause must sit AFTER the retry loop: inside it,
        # the `break` on success skipped the wait, so successful requests were
        # sent back-to-back with no delay at all.
        await asyncio.sleep(2)

    results_df = pd.concat([results_df, pd.DataFrame(new_rows)], ignore_index=True)
    return results_df

In [19]:
async def process_all_batches_exp3(df, llm, batch_size=4, pause_range=(10, 20)):
    """Run experiment 3 over ``df`` in consecutive batches and collect all results.

    Args:
        df: frame with the experiment 3 stimuli.
        llm: model wrapper passed on to ``process_batch_exp3``.
        batch_size: number of rows per batch (default 4).
        pause_range: inclusive ``(low, high)`` bounds, in whole seconds, of the
            random pause inserted between two batches (default 10..20).

    Returns:
        The accumulated results DataFrame.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    results_df = pd.DataFrame(columns=[
        'item', 'condition',
        'Q1', 'Q1_answer', 'Q2', 'Q2_answer',
        'raw response', 'S1', 'S2', 'S3', 'S4'
    ])

    total_rows = len(df)
    for i in tqdm(range(0, total_rows, batch_size), desc="Processing batches"):
        batch = df.iloc[i:i+batch_size]
        results_df = await process_batch_exp3(batch, llm, results_df)

        # Pause only BETWEEN batches; waiting after the last one is wasted time.
        if i + batch_size < total_rows:
            await asyncio.sleep(random.randint(*pause_range))

    return results_df

In [None]:
# Handshake: stop here unless the model acknowledges the instructions.
if not await initialize_llm(gemini_llm, HANDSHAKE_PROMPT_3):
    raise ValueError("Model initialization failed")
print("Model ready")

In [None]:
# Run experiment 3 with the selected model and show the collected raw responses.
results_exp3 = await process_all_batches_exp3(exp3_df, gemini_llm)
display(results_exp3)

In [None]:
# Store the results in the experiment 3 workbook under a sheet named after the model.
results_to_original_df(exp3_excel, results_exp3, 'gemini')