In [None]:
import gc
import os
import time
from io import StringIO

import boto3
import pandas as pd
import torch
from dotenv import load_dotenv
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tqdm.pandas()

## Загружаем данные

In [None]:
# Load variables from a .env file into the process environment.
load_dotenv()

# Object-storage credentials and bucket name; each is None when the variable is unset.
S3_KEY_ID, S3_SECRET_KEY, S3_BUCKET = (
    os.getenv(var_name) for var_name in ("S3_KEY_ID", "S3_SECRET_KEY", "S3_BUCKET")
)

# Location of the dataset inside the bucket (prefix + file name).
BUCKET_DIR = "splitted_data/"
FILENAME = "splitted_df_3000.csv"

In [None]:
# S3-compatible client pointed at the Yandex Object Storage endpoint,
# authenticated with the credentials read from the environment.
session = boto3.session.Session()
s3 = session.client(
    service_name="s3",
    endpoint_url="https://storage.yandexcloud.net",
    aws_access_key_id=S3_KEY_ID,
    aws_secret_access_key=S3_SECRET_KEY,
    # Region id is "ru-central1"; the previous value "ru-cental1" was a typo.
    region_name="ru-central1",
)

In [None]:
# Fetch the dataset object from the bucket; its key is the directory prefix plus the file name.
object_key = f"{BUCKET_DIR}{FILENAME}"
csv_obj = s3.get_object(Bucket=S3_BUCKET, Key=object_key)

In [None]:
# Read the whole response body, decode it as UTF-8 and parse the text as CSV.
csv_text = csv_obj["Body"].read().decode("utf-8")
df = pd.read_csv(StringIO(csv_text))

In [None]:
# Author display names, listed in the order of their numeric ids (00..09).
_author_names = [
    "А. Пушкин",
    "Д. Мамин-Сибиряк",
    "И. Тургенев",
    "А. Чехов",
    "Н. Гоголь",
    "И. Бунин",
    "А. Куприн",
    "А. Платонов",
    "В. Гаршин",
    "Ф. Достоевский",
]

# Maps the "author_id_XX" labels to the author's display name.
tgt2name = {
    f"author_id_{position:02d}": author
    for position, author in enumerate(_author_names)
}

In [None]:
# Replace the author ids in the `target` column with the authors' names.
# The mapping is applied to that column only: a frame-wide df.replace(...) also
# compares every other column against the ids and would rewrite any cell that
# happens to equal one of them.
df["target"] = df["target"].replace(tgt2name)

# Print the first sample: author, book and text.
print(df.loc[0, "target"])
print(df.loc[0, "book"])
print(df.loc[0, "text"])

# Kept as the last expression of the cell so the preview is actually rendered;
# a bare df.head() in the middle of a cell displays nothing.
df.head()

In [None]:
# Save the frame to a local CSV file, omitting the index column.
output_csv = "rus_authors.csv"
df.to_csv(output_csv, index=False)

## LLM

In [None]:
# Generation keyword arguments passed to the text-generation pipeline calls
# in this notebook (low-temperature sampling, at most 512 new tokens).
# NOTE(review): eos_token_id is a raw, tokenizer-specific id — confirm it is the
# intended stop token for every model this config is used with.
config = {
    "max_new_tokens": 512,
    "do_sample": True,
    "num_beams": 1,
    "temperature": 0.25,
    "top_k": 50,
    "top_p": 0.98,
    "eos_token_id": 79097,
}

### saiga_llama3_8b

In [None]:
# Load the Saiga Llama-3 8B checkpoint in bfloat16 with device_map="auto",
# then wrap the model and its tokenizer in a text-generation pipeline.
SAIGA_MODEL_ID = "IlyaGusev/saiga_llama3_8b"

tokenizer = AutoTokenizer.from_pretrained(SAIGA_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    SAIGA_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # attn_implementation="flash_attention_2",
)

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [None]:
# System prompt reused by every model in the notebook: it lists the ten candidate
# authors and asks the model to name the author of each text.
# Fixed: "Тургеневу" previously ended with a Latin "y" instead of the Cyrillic "у".
DEFAULT_SYSTEM_PROMPT = "Ты получишь тексты, принадлежащие русским писателям 19 века - А. Пушкину, Д. Мамин-Сибиряку, И. Тургеневу, А. Чехову, Н. Гоголю, И. Бунину, А. Куприну, А. Платонову, В. Гаршину, Ф. Достоевскому. Напиши кому из них принадлежит каждый текст, текстов других писателей не будет"

# `config` pins eos_token_id=79097. An explicit eos_token_id overrides the stop
# tokens from the model's own generation config, and 79097 is not an
# end-of-turn token in the Llama-3 vocabulary this model uses, so generation
# would run past the answer. Drop the key and let the model's defaults decide.
# NOTE(review): 79097 appears to be the Vikhr stop token — confirm.
saiga_generation_kwargs = {
    key: value for key, value in config.items() if key != "eos_token_id"
}

# Result columns: raw model reply and wall-clock seconds spent per row.
df["saiga_author"] = None
df["saiga_process_time"] = None

for i, row in tqdm(df.iterrows(), total=len(df)):
    t_start = time.time()

    # Render the system + user messages into a single prompt string using the
    # tokenizer's chat template, ending with the assistant generation header.
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
            {"role": "user", "content": row["text"]},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
    output = pipe(prompt, **saiga_generation_kwargs)
    # Drop the first len(prompt) characters so only the newly generated text remains.
    output = output[0]["generated_text"][len(prompt) :].strip()

    df.at[i, "saiga_author"] = output
    df.at[i, "saiga_process_time"] = time.time() - t_start

In [None]:
# Preview the first rows, including the saiga_author / saiga_process_time columns.
df.head()

### Vikhr-7B-instruct_0.4

In [None]:
# Release the previously loaded model before loading the next one. Otherwise the
# old weights stay referenced by `pipe` / `model` while from_pretrained allocates
# the new ones, so two multi-billion-parameter models are held in memory at once.
for _stale_name in ("pipe", "model", "tokenizer"):
    globals().pop(_stale_name, None)  # tolerate running this cell on a fresh kernel
gc.collect()
torch.cuda.empty_cache()  # hand freed blocks in the CUDA caching allocator back to the device

# Load Vikhr-7B-instruct in bfloat16 with device_map="auto" and wrap it,
# together with its tokenizer, in a text-generation pipeline.
model = AutoModelForCausalLM.from_pretrained(
    "Vikhrmodels/Vikhr-7B-instruct_0.4",
    device_map="auto",
    # attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained("Vikhrmodels/Vikhr-7B-instruct_0.4")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Ask the currently loaded model for the author of every text and store the raw
# reply plus the wall-clock seconds spent on each row in the vikhr_* columns.
df["vikhr_author"] = None
df["vikhr_process_time"] = None

for idx, sample in tqdm(df.iterrows(), total=len(df)):
    started_at = time.time()

    # System instruction followed by the text to attribute.
    messages = [
        {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
        {"role": "user", "content": sample["text"]},
    ]
    chat_prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    generations = pipe(chat_prompt, **config)
    # Drop the first len(chat_prompt) characters so only the continuation remains.
    answer = generations[0]["generated_text"][len(chat_prompt) :].strip()

    df.at[idx, "vikhr_author"] = answer
    df.at[idx, "vikhr_process_time"] = time.time() - started_at

In [None]:
# Preview the first rows, including the vikhr_author / vikhr_process_time columns.
df.head()

### Meta-Llama-3-8B

In [None]:
# Release the previously loaded model before loading the next one. Otherwise the
# old weights stay referenced by `pipe` / `model` while from_pretrained allocates
# the new ones, so two multi-billion-parameter models are held in memory at once.
for _stale_name in ("pipe", "model", "tokenizer"):
    globals().pop(_stale_name, None)  # tolerate running this cell on a fresh kernel
gc.collect()
torch.cuda.empty_cache()  # hand freed blocks in the CUDA caching allocator back to the device

# Load Meta-Llama-3-8B in bfloat16 with device_map="auto" and wrap it,
# together with its tokenizer, in a text-generation pipeline.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    device_map="auto",
    # attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Ask Meta-Llama-3-8B for the author of every text and store the raw reply plus
# the wall-clock seconds spent on each row in the llama_* columns.
#
# This single cell replaces two earlier ones:
#   * a stale copy of the Vikhr loop that wrote Llama output into
#     vikhr_author / vikhr_process_time, destroying the Vikhr results;
#   * a second loop whose df.at writes were dedented out of the loop body, so
#     only the last row was ever recorded (it also printed every raw output).

# `config` pins eos_token_id=79097. An explicit eos_token_id overrides the stop
# tokens from the model's own generation config, and 79097 is not an
# end-of-sequence token in the Llama-3 vocabulary, so drop the key here.
# NOTE(review): 79097 appears to be the Vikhr stop token — confirm.
llama_generation_kwargs = {
    key: value for key, value in config.items() if key != "eos_token_id"
}

df["llama_author"] = None
df["llama_process_time"] = None

for i, row in tqdm(df.iterrows(), total=len(df)):
    t_start = time.time()

    # Render the system + user messages into a single prompt string.
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
            {"role": "user", "content": row["text"]},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
    output = pipe(prompt, **llama_generation_kwargs)
    # Drop the first len(prompt) characters so only the newly generated text remains.
    output = output[0]["generated_text"][len(prompt) :].strip()

    # Must stay inside the loop so every row gets its own result.
    df.at[i, "llama_author"] = output
    df.at[i, "llama_process_time"] = time.time() - t_start

In [None]:
# Preview the first rows, including the llama_author / llama_process_time columns.
df.head()