# Loading Data
### Importing frameworks & loading the task dataset

In [None]:
# dataset frameworks
import pandas as pd
import numpy as np
from datasets import load_dataset
# embedding frameworks
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import sys
import torch
# sentiment analysis frameworks
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
# Fetch the VADER lexicon at runtime; it is the resource named 'vader_lexicon'
# and is presumably what SentimentIntensityAnalyzer (imported above) reads.
nltk.download('vader_lexicon')

In [None]:
# Pull only the training split of the QEvasion dataset, then convert it to a
# pandas DataFrame so the rest of the notebook can use pandas operations.
dataset = load_dataset(path="ailsntua/QEvasion", split='train')
full_dataset = dataset.to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

In [None]:
# Distinct label values, in order of first appearance, for both annotation columns.
labels_clarity_list, labels_evasion_list = (
    full_dataset[label_column].unique().tolist()
    for label_column in ('clarity_label', 'evasion_label')
)

In [None]:
print(full_dataset) # text columns used below: interview_question, interview_answer

                                                  title                date  \
0     The President's News Conference in Hanoi, Vietnam  September 10, 2023   
1     The President's News Conference in Hanoi, Vietnam  September 10, 2023   
2     The President's News Conference in Hanoi, Vietnam  September 10, 2023   
3     The President's News Conference in Hanoi, Vietnam  September 10, 2023   
4     The President's News Conference in Hanoi, Vietnam  September 10, 2023   
...                                                 ...                 ...   
3443                    The President's News Conference    October 25, 2006   
3444                    The President's News Conference    October 25, 2006   
3445                    The President's News Conference    October 25, 2006   
3446                    The President's News Conference    October 25, 2006   
3447                    The President's News Conference    October 25, 2006   

            president                              

# Creating New Features
### Finding new patterns to help train models with more useful data

Mean answer length (in characters) per evasion label

In [None]:
# Average answer length (in characters) for every evasion label

label_string_mean = {}

# Character count of each answer, computed once for the whole frame.
answer_lengths = full_dataset['interview_answer'].str.len()

for label in labels_evasion_list:
  lengths_for_label = answer_lengths[full_dataset['evasion_label'] == label]
  # Sum divided by the number of rows carrying this label.
  label_string_mean[label] = lengths_for_label.sum() / len(lengths_for_label)
  print(f"mean of {label} = {label_string_mean[label]}")

print(label_string_mean)

mean of Explicit = 1541.5769961977187
mean of General = 1900.9222797927462
mean of Partial/half-answer = 2035.493670886076
mean of Dodging = 1511.1756373937676
mean of Implicit = 2226.409836065574
mean of Deflection = 2075.3858267716537
mean of Declining to answer = 840.0137931034483
mean of Claims ignorance = 901.5882352941177
mean of Clarification = 496.07608695652175
{'Explicit': np.float64(1541.5769961977187), 'General': np.float64(1900.9222797927462), 'Partial/half-answer': np.float64(2035.493670886076), 'Dodging': np.float64(1511.1756373937676), 'Implicit': np.float64(2226.409836065574), 'Deflection': np.float64(2075.3858267716537), 'Declining to answer': np.float64(840.0137931034483), 'Claims ignorance': np.float64(901.5882352941177), 'Clarification': np.float64(496.07608695652175)}


Embeddings cosine similarity

In [None]:
# Load the sentence-embedding model, on the GPU when one is available.
# `torch` is imported unconditionally in the first cell, so checking CUDA here
# cannot raise ImportError; the former try/except fallback only repeated the
# same constructor call and has been removed.
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Cosine similarity between question and answer embeddings
def calculate_similarity(df, q_col, a_col, encoder=None):
  """Return the row-wise cosine similarity between two text columns.

  Both columns are cast to ``str``, embedded with ``encoder`` and compared
  pairwise (row i of ``q_col`` against row i of ``a_col``).

  Args:
    df: DataFrame containing both text columns.
    q_col: Name of the column holding the questions.
    a_col: Name of the column holding the answers.
    encoder: Object exposing
      ``encode(texts, show_progress_bar=..., convert_to_numpy=...)`` and
      returning a 2-D array with one embedding per text. When omitted, the
      notebook-level ``model`` is used, so existing calls keep working.

  Returns:
    A 1-D NumPy array with one similarity value per row of ``df``, in row
    order. An empty ``df`` yields an empty array without calling the encoder.
  """
  if encoder is None:
    encoder = model  # notebook-level embedding model

  questions = df[q_col].astype(str).tolist()
  answers = df[a_col].astype(str).tolist()

  # Nothing to embed: skip the encoder, whose result for an empty list is not
  # guaranteed to be 2-D and would break the axis=1 reductions below.
  if not questions:
    return np.empty(0, dtype=np.float32)

  q_embeddings = encoder.encode(questions, show_progress_bar=True, convert_to_numpy=True)
  a_embeddings = encoder.encode(answers, show_progress_bar=True, convert_to_numpy=True)

  dot_product = np.sum(q_embeddings * a_embeddings, axis=1)
  norm_q = np.linalg.norm(q_embeddings, axis=1)
  norm_a = np.linalg.norm(a_embeddings, axis=1)

  # Guard against division by zero: an all-zero embedding has a zero dot
  # product too, so its similarity comes out as 0 rather than NaN.
  norm_q[norm_q == 0] = 1e-9
  norm_a[norm_a == 0] = 1e-9

  similarities = dot_product / (norm_q * norm_a)
  return similarities

In [None]:
# One question/answer cosine similarity per row, stored as a new feature column.
full_dataset['semantic_similarity'] = calculate_similarity(
    full_dataset,
    q_col='interview_question',
    a_col='interview_answer',
)

Batches:   0%|          | 0/108 [00:00<?, ?it/s]

Batches:   0%|          | 0/108 [00:00<?, ?it/s]

In [None]:
# Quick sanity check of the first five similarity values.
print(full_dataset['semantic_similarity'].head(n=5))

0    0.531044
1    0.531044
2    0.553182
3    0.553182
4    0.474729
Name: semantic_similarity, dtype: float32


Sentiment analysis

In [None]:
# Single analyzer instance shared by both scoring functions.
sia = SentimentIntensityAnalyzer()


def _polarity_of(row, column):
  """Score the text stored under ``column`` of ``row`` with the shared analyzer."""
  return sia.polarity_scores(row[column])


def sentiment_analysis_question(dataset):
  """Return the polarity scores of one row's ``interview_question``.

  ``dataset`` is indexed by column name; despite its name it is a single row
  when this function is used with ``DataFrame.apply(..., axis=1)``.
  """
  return _polarity_of(dataset, 'interview_question')


def sentiment_analysis_answer(dataset):
  """Return the polarity scores of one row's ``interview_answer``.

  ``dataset`` is indexed by column name; despite its name it is a single row
  when this function is used with ``DataFrame.apply(..., axis=1)``.
  """
  return _polarity_of(dataset, 'interview_answer')

In [None]:
# Row-wise scoring: each new column holds one polarity-score dict per row.
full_dataset['question_sentiment'] = full_dataset.apply(
    sentiment_analysis_question, axis='columns'
)
full_dataset['answer_sentiment'] = full_dataset.apply(
    sentiment_analysis_answer, axis='columns'
)

0    {'neg': 0.044, 'neu': 0.839, 'pos': 0.117, 'co...
1    {'neg': 0.044, 'neu': 0.839, 'pos': 0.117, 'co...
2    {'neg': 0.249, 'neu': 0.675, 'pos': 0.076, 'co...
3    {'neg': 0.249, 'neu': 0.675, 'pos': 0.076, 'co...
4    {'neg': 0.147, 'neu': 0.823, 'pos': 0.03, 'com...
Name: question_sentiment, dtype: object
0    {'neg': 0.031, 'neu': 0.874, 'pos': 0.095, 'co...
1    {'neg': 0.031, 'neu': 0.874, 'pos': 0.095, 'co...
2    {'neg': 0.048, 'neu': 0.835, 'pos': 0.116, 'co...
3    {'neg': 0.048, 'neu': 0.835, 'pos': 0.116, 'co...
4    {'neg': 0.029, 'neu': 0.885, 'pos': 0.086, 'co...
Name: answer_sentiment, dtype: object


In [None]:
# Preview the first rows of both sentiment columns, question first.
for sentiment_column in ('question_sentiment', 'answer_sentiment'):
  print(full_dataset[sentiment_column].head())

0    {'neg': 0.044, 'neu': 0.839, 'pos': 0.117, 'co...
1    {'neg': 0.044, 'neu': 0.839, 'pos': 0.117, 'co...
2    {'neg': 0.249, 'neu': 0.675, 'pos': 0.076, 'co...
3    {'neg': 0.249, 'neu': 0.675, 'pos': 0.076, 'co...
4    {'neg': 0.147, 'neu': 0.823, 'pos': 0.03, 'com...
Name: question_sentiment, dtype: object
0    {'neg': 0.031, 'neu': 0.874, 'pos': 0.095, 'co...
1    {'neg': 0.031, 'neu': 0.874, 'pos': 0.095, 'co...
2    {'neg': 0.048, 'neu': 0.835, 'pos': 0.116, 'co...
3    {'neg': 0.048, 'neu': 0.835, 'pos': 0.116, 'co...
4    {'neg': 0.029, 'neu': 0.885, 'pos': 0.086, 'co...
Name: answer_sentiment, dtype: object
