The main purpose of this file is to detect the reasons for aggressive driving from user comments.<br>
To do that we use:<br>
1. A pretrained classification model that detects comments describing aggressive driving
2. A Q&A model, which extracts the reason for the aggressive driving by answering the following question: **"как выражалось агрессивное вождение"**

# Settings

In [None]:
# install the needed libraries
!pip install transformers
!pip install sentencepiece

In [None]:
# Select the device every model and tensor will be placed on
import torch
from torch import nn

# Fall back to the CPU when CUDA is not usable ...
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# ... otherwise run on the GPU and report what was found.
else:
    device = torch.device("cuda")

    # Number of visible CUDA devices.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    # Name of the first device (index 0).
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Preprocessing

In [None]:
# Text preprocessing helper
def remove_punkt(text: str) -> str:
    """Lower-case *text* and blank out everything that is not a letter.

    Each non-alphabetic character (punctuation, digits, whitespace, ...) is
    replaced by exactly one space, so runs of such characters become runs of
    spaces and are not collapsed.
    """
    lowered = text.lower()
    return ''.join(ch if ch.isalpha() else ' ' for ch in lowered)

# Data

In [None]:
# Mount Google Drive at /content/drive; the data files and the classifier
# checkpoint used in this notebook are read from paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# All competition CSVs are read from one Drive folder; keeping it in a single
# constant means a relocated dataset needs only one edit.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Контесты/AI IJC/team_tasks/task_2/Data_rus'


def _read_tsv(file_name):
    """Read a tab-separated file from DATA_DIR, using its first column as the index."""
    return pd.read_csv(f'{DATA_DIR}/{file_name}', index_col=0, sep="\t")


# The "labled" spelling is kept exactly as in the original paths; presumably it
# matches the file names on Drive.
train_labeled = _read_tsv('labled_train_data.csv')
comments_labeled = _read_tsv('labled_train_comments.csv')

comments_unlabeled = _read_tsv('unlabled_train_comments.csv')

# Optional pseudo-labelled data, currently disabled.
# NOTE: DataFrame.append was removed in pandas 2.0; use pd.concat if this is revived.
# pseudo_labeled = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Контесты/AI IJC/team_tasks/task_2/text_processing/pseudo_labeled_data/labeled_comments_clean_xlm_roberta_20_epochs.csv', index_col=0)
# pseudo_unlabeled = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Контесты/AI IJC/team_tasks/task_2/text_processing/pseudo_labeled_data/unlabeled_comments_clean_xlm_roberta_20_epochs.csv', index_col=0)
# whole_data = pseudo_labeled.append(pseudo_unlabeled)

test = _read_tsv('labled_test_data.csv')

Use only the clean comments from the test set

In [None]:
# Flag the two stock replies that are filtered out, then keep every other comment.
is_placeholder = (test.comment == "Больше нечего сказать") | (test.comment == "Да")
test_comments = test.comment[~is_placeholder]
# Bare expression so the notebook displays the filtered series.
test_comments

In [None]:
# Question paired with every comment when it is tokenized for the models
# (Russian for "how was the aggressive driving expressed").
question = "как выражалось агрессивное вождение"

# Classification

In [None]:
from transformers import AutoModel
from torch import nn


class XLMRobertaBaseClassifier(nn.Module):
    """Pretrained transformer encoder with a linear classification head.

    The head is applied to the encoder's ``pooler_output`` and produces raw
    (un-normalised) logits.

    Args:
        model_name: Hugging Face checkpoint loaded as the encoder.
        num_labels: Number of logits produced by the head.
    """

    def __init__(
        self,
        model_name: str = 'sismetanin/xlm_roberta_base-ru-sentiment-rusentiment',
        num_labels: int = 2,
    ):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so a checkpoint with a different hidden width also works.
        # The attribute deliberately keeps its capitalised name: state dicts
        # saved from this class store the head under "Linear.*" keys.
        self.Linear = nn.Linear(self.base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return the classification logits for a batch of token ids.

        Args:
            input_ids: Token ids passed to the encoder.
            token_type_ids: Optional segment ids, forwarded to the encoder as-is.
            attention_mask: Optional attention mask, forwarded to the encoder as-is.

        Returns:
            The output of the linear head applied to the encoder's pooled output.
        """
        encoded = self.base_model(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        return self.Linear(encoded.pooler_output)


## Load the best text classification model

In [None]:
# Rebuild the architecture, then restore the fine-tuned weights into it.
classification_model = XLMRobertaBaseClassifier()
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only runtime.
classification_state = torch.load(
    "/content/drive/MyDrive/Colab Notebooks/Контесты/AI IJC/team_tasks/task_2/text_processing/base_models/uda_xlm_roberta_base_3_epochs.h5",
    map_location=device,
)
classification_model.load_state_dict(classification_state)
classification_model.to(device)
# The model is only used for prediction here; make inference mode explicit.
# eval() returns the module, so the notebook still displays it as cell output.
classification_model.eval()

# Q&A

## Model

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import sentencepiece

# The Q&A model and its tokenizer are loaded from the same checkpoint.
qa_checkpoint = "AlexKay/xlm-roberta-large-qa-multilingual-finedtuned-ru"

qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint, do_lower_case=True)

## Predictions

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

sigmoid = torch.nn.Sigmoid()

# Comments whose class-1 score (after the sigmoid) is below this value are
# reported as containing no reason for aggressive driving.
NO_REASON_THRESHOLD = 0.30

# Longest question + comment pair, in tokens, that is passed to the models.
# NOTE(review): 512 is the usual XLM-RoBERTa input limit — confirm for both checkpoints.
MAX_INPUT_TOKENS = 512

# Printed after every answer to separate the results visually.
RESULT_SEPARATOR = "\n ------------------ \n"


def _extract_answer(input_ids, start_logits, end_logits):
    """Decode the most likely answer span, without special tokens or the question."""
    # Most likely first token of the answer.
    answer_start = torch.argmax(start_logits)
    # Most likely last token; +1 turns it into an exclusive slice bound.
    answer_end = torch.argmax(end_logits) + 1
    # When the predicted end precedes the start the slice, and so the answer, is empty.
    ids = input_ids[0][answer_start:answer_end].tolist()
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))
    # The span is picked over the whole input (question included), so special
    # tokens and the question text are removed from the decoded string.
    return answer.replace("<s>", " ").replace("</s>", " ").replace(question, "").strip()


for comment in test_comments:
    # Missing cells come through as NaN (a float), which cannot be lower-cased.
    if not isinstance(comment, str):
        continue

    clean_comment = remove_punkt(comment)

    # Truncate only the comment (the second segment) so the question always
    # survives and over-long comments do not overflow the model's input limit.
    inputs = tokenizer(
        question,
        clean_comment,
        add_special_tokens=True,
        truncation="only_second",
        max_length=MAX_INPUT_TOKENS,
        return_tensors="pt",
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # NOTE(review): the classifier receives the question + comment pair, not the
    # comment alone — confirm this matches how the classifier was trained.
    # Gradients are not needed for prediction; skipping them saves memory.
    with torch.no_grad():
        classification_output = classification_model(
            input_ids=input_ids.to(device),
            token_type_ids=None,
            attention_mask=attention_mask.to(device),
        )
    probas = sigmoid(classification_output)
    print(probas)

    # One comment per forward pass, so row 0 is the only row.
    if probas[0, 1].item() < NO_REASON_THRESHOLD:
        print(f"Comment: {comment}")
        print("Answer: Нет причины агрессивного вождения", end=RESULT_SEPARATOR)
        continue

    # The Q&A model is fed the CPU tensors, exactly as tokenized.
    with torch.no_grad():
        qa_outputs = qa_model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
        )

    clean_answer = _extract_answer(input_ids, qa_outputs.start_logits, qa_outputs.end_logits)
    if not clean_answer:
        clean_answer = "Причина агрессивного вождения не распознана"

    print(f"Comment: {comment}")
    print(f"Answer: {clean_answer}", end=RESULT_SEPARATOR)