In [None]:
import transformers
import accelerate

# Print the installed transformers and accelerate versions.
print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
import pandas as pd
from datasets import Dataset

# Read the spreadsheet; the 'human_markup' and 'label' columns are used below.
df = pd.read_excel("no_voice.xlsx")

# Load the BERT tokenizer and model
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE(review): num_labels is not passed, so the library's default head size is used —
# confirm it matches the set of values in 'label'.
model = BertForSequenceClassification.from_pretrained(model_name)

# Keep only the text and label columns and wrap them in a datasets.Dataset.
dataset = Dataset.from_pandas(df[['human_markup', 'label']])

def tokenize_function(examples):
    """Tokenize the 'human_markup' entry of a batch.

    The texts are passed to the module-level ``tokenizer`` with truncation
    enabled and padding to a fixed length of 256 tokens.

    Args:
        examples: Mapping that contains a 'human_markup' entry with the texts.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples['human_markup']
    encoded = tokenizer(texts, truncation=True, padding='max_length', max_length=256)
    return encoded

# Tokenize the whole dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Drop the raw text column now that it has been tokenized.
tokenized_datasets = tokenized_datasets.remove_columns(['human_markup'])

# Set the output format for PyTorch
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # Directory where results are saved
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size on each device
    logging_dir='./logs',            # Directory for logs
    logging_steps=10,                # Log every 10 steps
)

# Create the DataCollator that assembles batches
data_collator = DataCollatorWithPadding(tokenizer)

# Create the Trainer object
trainer = Trainer(
    model=model,                         # Model to fine-tune
    args=training_args,                  # Training arguments
    train_dataset=tokenized_datasets,    # Training dataset
    data_collator=data_collator          # Object that collates batches
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Save the model and the tokenizer into the same directory; a later cell
# loads both from './fine_tuned_model'.
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_distances
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import pandas as pd
import torch

# Load the model and tokenizer
model_name = './fine_tuned_model'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Loaded as a plain BertModel: only last_hidden_state is read from its output below.
model = BertModel.from_pretrained(model_name)
model.eval()

# Reload the spreadsheet and keep only the two columns this cell uses.
df = pd.read_excel('no_voice.xlsx')
df = df[['model_annotation', 'label']]

# Функция для генерации эмбеддингов
def generate_embeddings(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=256)
        with torch.no_grad():
            outputs = model(**inputs)
        # Используем эмбеддинги из последнего скрытого слоя
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy())
    return embeddings

# Generate embeddings for the texts
df['embeddings'] = generate_embeddings(df['model_annotation'])


def calculate_cosine_distances(embeddings, reference_embeddings):
    """Mean cosine distance from each embedding to the reference set.

    Args:
        embeddings: 2-D array-like, one row per item.
        reference_embeddings: 2-D array-like, one row per reference item.

    Returns:
        1-D array with, for every row of ``embeddings``, the average of its
        cosine distances to all rows of ``reference_embeddings``.
    """
    # Pairwise matrix: rows follow `embeddings`, columns follow the references.
    return cosine_distances(embeddings, reference_embeddings).mean(axis=1)

# Embeddings of the rows labelled 0 form the reference set.
reference_embeddings = np.array(df[df['label'] == 0]['embeddings'].tolist())

# Mean cosine distance from every row to the reference set.
# NOTE(review): rows with label 0 are themselves in the reference set, so their
# mean includes a zero self-distance — confirm this is intended.
df['cosine_distance'] = calculate_cosine_distances(np.array(df['embeddings'].tolist()), reference_embeddings)

embeddings = np.array(df['embeddings'].tolist())


# Boolean target: True where label == 0.
df['is_correct'] = df['label'] == 0

X = embeddings
y = df['is_correct'].astype(int)  # 1 where label == 0, otherwise 0

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Standardize the features, then fit a linear-kernel SVM with probability estimates enabled.
svm_model = make_pipeline(StandardScaler(), SVC(kernel='linear', probability=True))

# Train the SVM classifier
svm_model.fit(X_train, y_train)

# Hard predictions and the second probability column for the held-out rows.
y_pred = svm_model.predict(X_test)
y_pred_prob = svm_model.predict_proba(X_test)[:, 1]  # column 1 of the class probabilities


# Recover the held-out rows of df. X_test is a plain NumPy array and has no
# `.index` attribute; y_test is a pandas Series that keeps the original row
# labels through train_test_split, in the same order as X_test, so it is used
# for the lookup instead.
# `.copy()` makes df_test an independent frame, so the column assignments
# below do not write into a slice of df.
df_test = df.loc[y_test.index].copy()
df_test['predicted_is_correct'] = y_pred
df_test['predicted_is_correct_prob'] = y_pred_prob


print("Correct texts predicted:", df_test[df_test['predicted_is_correct'] == 1])
print("Incorrect texts predicted:", df_test[df_test['predicted_is_correct'] == 0])

from sklearn.metrics import classification_report, accuracy_score

# Classification report for the held-out rows, followed by overall accuracy.
print(classification_report(y_test, y_pred))

print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Write the frame, including the columns added in the previous cell, to CSV.
# NOTE(review): the file name spells "emeddings" and is not the file the next
# cell reads — confirm which file is intended.
df.to_csv("with_emeddings_new.csv")

In [None]:
# Rebinds `df` to the contents of a CSV file.
# NOTE(review): "df_with_correctness.csv" is not written anywhere in the visible
# notebook — confirm where it comes from and that it has the 'label' and
# 'cosine_distance' columns the following cells read.
df = pd.read_csv("df_with_correctness.csv")

In [None]:
# Отфильтровать строки, где label = 1
label_1_df = df[df['label'] == 1]
# Mean cosine distance over the label-1 rows.
average_cosine_distance_label_1 = label_1_df['cosine_distance'].mean()
print("Для 1 : ", average_cosine_distance_label_1)

# Filter the rows where label = 0
label_0_df = df[df['label'] == 0]
# Mean cosine distance over the label-0 rows.
average_cosine_distance_label_0 = label_0_df['cosine_distance'].mean()
print("Для 0 : ", average_cosine_distance_label_0)

In [None]:
def countMetrics(df, threshhold):
    """Score a distance threshold as a classifier for label 0.

    A row is predicted as label 0 when its 'cosine_distance' is less than or
    equal to ``threshhold``; label 0 is treated as the positive class.
    Precision, recall, accuracy and F1 are printed, and accuracy and F1 are
    returned.

    Any ratio whose denominator is zero (no row at or below the threshold,
    no label-0 rows, an empty frame, or precision + recall == 0) is reported
    as 0.0 instead of raising ZeroDivisionError.

    Args:
        df: DataFrame with 'cosine_distance' and 'label' columns.
        threshhold: Distance cut-off.

    Returns:
        tuple (Accuracy, F1).
    """
    within = df['cosine_distance'] <= threshhold
    beyond = df['cosine_distance'] > threshhold
    is_label_0 = df['label'] == 0
    is_label_1 = df['label'] == 1

    # Rows at or below the threshold with label 0 (true positives)
    count_condition1 = int((within & is_label_0).sum())
    # Rows above the threshold with label 1 (true negatives)
    count_condition2 = int((beyond & is_label_1).sum())
    # Rows at or below the threshold with label 1 (false positives)
    count_condition3 = int((within & is_label_1).sum())
    # Rows above the threshold with label 0 (false negatives)
    count_condition4 = int((beyond & is_label_0).sum())

    predicted_positive = count_condition1 + count_condition3
    actual_positive = count_condition1 + count_condition4
    total = count_condition1 + count_condition2 + count_condition3 + count_condition4

    precision = count_condition1 / predicted_positive if predicted_positive else 0.0
    recall = count_condition1 / actual_positive if actual_positive else 0.0
    F1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    Accuracy = (count_condition1 + count_condition2) / total if total else 0.0
    print("For threshhold: ", threshhold)
    print("precision: ", precision)
    print("recall: ", recall)
    print("Accuracy: ", Accuracy)
    print("F1: ", F1)
    return (Accuracy, F1)

In [None]:
# Sweep thresholds 0.50, 0.51, ..., 0.65 and remember the best accuracy and F1.
# The grid is driven by integer hundredths: repeatedly adding 0.01 to a float
# accumulates rounding error, which yields thresholds such as
# 0.5700000000000001 and can push the last value past 0.65 so that the
# endpoint is never evaluated.
th = 0.5

max_f1 = 0
max_acc = 0
th_for_max_f1 = 0
th_for_max_acc = 0
for th_hundredths in range(50, 66):
    th = th_hundredths / 100
    acc, f1 = countMetrics(df, th)
    # Strict comparisons keep the lowest threshold when several tie.
    if acc > max_acc:
        max_acc = acc
        th_for_max_acc = th
    if f1 > max_f1:
        max_f1 = f1
        th_for_max_f1 = th

print("Top Accuracy: ", max_acc, " for threshhold ", th_for_max_acc)
print("Top F1: ", max_f1, " for threshhold ", th_for_max_f1)

In [None]:
def countMetrics2(df, threshhold):
    """Score a distance threshold as a classifier for label 0.

    Rows with 'cosine_distance' <= ``threshhold`` are predicted as label 0,
    which is treated as the positive class. Prints precision, recall,
    accuracy and F1, and returns accuracy and F1.

    A ratio with a zero denominator (no predicted positives, no label-0
    rows, an empty frame, or precision + recall == 0) is reported as 0.0
    rather than raising ZeroDivisionError.

    NOTE(review): this function computes the same metrics as ``countMetrics``
    defined earlier in the notebook — consider keeping only one of them.

    Args:
        df: DataFrame with 'cosine_distance' and 'label' columns.
        threshhold: Distance cut-off.

    Returns:
        tuple (Accuracy, F1).
    """
    distance = df['cosine_distance']
    label = df['label']

    # Confusion-matrix cells, with label 0 as the positive class.
    count_condition1 = int(((distance <= threshhold) & (label == 0)).sum())  # true positives
    count_condition2 = int(((distance > threshhold) & (label == 1)).sum())   # true negatives
    count_condition3 = int(((distance <= threshhold) & (label == 1)).sum())  # false positives
    count_condition4 = int(((distance > threshhold) & (label == 0)).sum())   # false negatives

    def _ratio(numerator, denominator):
        # Division that yields 0.0 when the denominator is zero.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(count_condition1, count_condition1 + count_condition3)
    recall = _ratio(count_condition1, count_condition1 + count_condition4)
    F1 = _ratio(2 * (precision * recall), precision + recall)
    Accuracy = _ratio(
        count_condition1 + count_condition2,
        count_condition1 + count_condition2 + count_condition3 + count_condition4,
    )
    print("For threshhold: ", threshhold)
    print("precision: ", precision)
    print("recall: ", recall)
    print("Accuracy: ", Accuracy)
    print("F1: ", F1)
    return (Accuracy, F1)