# Resultados dos experimentos
Giulia Chimini Stefainski, Leonardo Azzi Martins, Matheus de Moraes Costa

---

**Objetivo:** sumarizar os resultados, compará-los e analisá-los.

# Sumarização dos resultados
Elabora gráficos padronizados (boxplots) das métricas de cada modelo, por variante do conjunto de dados.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# BERT metrics for the complete dataset, tagged with its dataset identifier.
bert_full_df = pd.read_csv(
    './bert/covidbr_bert_metrics_kfold.csv',
    index_col=0,
).assign(dataset='covidbr_full')
bert_full_df

In [None]:
# BERT metrics for the "no start URL" variant, tagged with its dataset identifier.
bert_nostarturl_df = pd.read_csv(
    './bert/covidbr_nostarturl_bert_metrics_kfold.csv',
    index_col=0,
).assign(dataset='covidbr_no_starturl')
bert_nostarturl_df

In [None]:
# BERT metrics for the "no URL" variant, tagged with its dataset identifier.
bert_nourl_df = pd.read_csv(
    './bert/covidbr_nourl_bert_metrics_kfold.csv',
    index_col=0,
).assign(dataset='covidbr_no_url')
bert_nourl_df

In [None]:
# Stack the three BERT result tables row-wise; each table keeps its own index.
bert_df = pd.concat(
    objs=[bert_full_df, bert_nostarturl_df, bert_nourl_df],
    axis=0,
)
bert_df

In [None]:
# Qwen metrics for the "no start URL" variant: tag the dataset, then
# lower-case every column name.
qwen_nostarturl_df = (
    pd.read_csv('./qwen/qwen_no_start_url.csv', index_col=0)
    .assign(dataset='covidbr_no_starturl')
    .rename(columns=str.lower)
)
qwen_nostarturl_df

In [None]:
# Qwen metrics for the "no URL" variant: tag the dataset, then
# lower-case every column name.
qwen_nourl_df = (
    pd.read_csv('./qwen/qwen_no_url.csv', index_col=0)
    .assign(dataset="covidbr_no_url")
    .rename(columns=str.lower)
)
qwen_nourl_df

In [None]:
# Stack the two Qwen result tables row-wise; each table keeps its own index.
qwen_df = pd.concat(
    objs=[qwen_nostarturl_df, qwen_nourl_df],
    axis=0,
)
qwen_df

In [None]:
def letter_annotation(ax, xoffset, yoffset, letter):
    """Write ``letter`` on ``ax`` at (xoffset, yoffset), in axes coordinates.

    The text is drawn with font size 12.

    NOTE: a later cell of this notebook defines ``letter_annotation`` again
    (bold serif text, no explicit size); once that cell has run, this
    version is shadowed.
    """
    text_options = {'transform': ax.transAxes, 'size': 12}
    ax.text(xoffset, yoffset, letter, **text_options)

In [None]:
# Publication-ready plotting configuration
import matplotlib as mpl

# Global Matplotlib defaults shared by every figure in this notebook.
# (mpl.rcParams is the same object that pyplot exposes as plt.rcParams.)
mpl.rcParams.update(
    {
        # Fonts and text sizes
        'font.family': 'serif',
        'font.serif': ['Times New Roman', 'Times', 'serif'],
        'font.size': 24,
        'axes.titlesize': 20,
        'axes.labelsize': 24,
        'xtick.labelsize': 20,
        'ytick.labelsize': 22,
        'legend.fontsize': 24,
        'figure.titlesize': 24,
        # Line widths
        'axes.linewidth': 1.2,
        'grid.linewidth': 0.8,
        'lines.linewidth': 1.5,
        'patch.linewidth': 1.2,
        # Tick widths
        'xtick.major.width': 1.2,
        'ytick.major.width': 1.2,
        'xtick.minor.width': 0.8,
        'ytick.minor.width': 0.8,
        # On-screen and saved-figure resolution / cropping
        'figure.dpi': 500,
        'savefig.dpi': 500,
        'savefig.bbox': 'tight',
        'savefig.pad_inches': 0.1,
    }
)

# Define consistent color palette for publication (keyed by display label)
colors = {
    "Complete Dataset": "#66c2a5",
    "No Start URL": "#fc8d62",
    "No URL": "#8da0cb",
}

# Define dataset labels for better presentation (dataset id -> display label)
dataset_labels = {
    'covidbr_full': 'Complete Dataset',
    'covidbr_no_starturl': 'No Start URL',
    'covidbr_no_url': 'No URL',
}


In [None]:
def letter_annotation(ax, xoffset, yoffset, letter):
    """Write ``letter`` in bold serif on ``ax`` at (xoffset, yoffset).

    The position is given in axes coordinates (``ax.transAxes``).
    """
    ax.text(
        xoffset,
        yoffset,
        letter,
        fontfamily='serif',
        weight='bold',
        transform=ax.transAxes,
    )

def format_axis(ax, title, ylabel=None, xlabel=None):
    """Apply the notebook's common look to a single axes object.

    Sets a bold title, optional bold axis labels, a light grid drawn below
    the data, fixed tick sizes and black 1.2-wide spines.

    Args:
        ax: Axes-like object to format.
        title: Title text (bold, padded by 15).
        ylabel: Y-axis label; left untouched when empty or None.
        xlabel: X-axis label; left untouched when empty or None.
    """
    ax.set_title(title, weight='bold', pad=15)

    # Only touch an axis label when a non-empty text was supplied.
    for set_label, text in ((ax.set_ylabel, ylabel), (ax.set_xlabel, xlabel)):
        if text:
            set_label(text, weight='bold')

    ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5)
    ax.set_axisbelow(True)

    # Major ticks are drawn heavier and longer than minor ticks.
    for which, width, length in (('major', 1.2, 4), ('minor', 0.8, 2)):
        ax.tick_params(axis='both', which=which, width=width, length=length)

    # Set spine properties
    for border in ax.spines.values():
        border.set_linewidth(1.2)
        border.set_color('black')

In [None]:
# Define the metrics to plot

def plot_metrics(df: pd.DataFrame, model: str):
    """Draw, save and show one box plot per metric, grouped by dataset.

    For each of 'fpr', 'precision', 'recall', 'f1-macro' and 'f1-micro', the
    matching column of ``df`` is drawn as a box plot with one box per value
    of the 'dataset' column, in the fixed order complete / no start URL /
    no URL. Each figure is written to ``./plots/{model}_{metric}.pdf`` and
    then shown.

    Args:
        df: Table with a 'dataset' column and one column per metric listed
            above.
        model: Label used in the progress message and in the output file
            name.
    """
    import os  # local import: only needed to create the output folder

    metrics = ['fpr', 'precision', 'recall', 'f1-macro', 'f1-micro']

    # Loop-invariant presentation data, computed once.
    dataset_order = ['covidbr_full', 'covidbr_no_starturl', 'covidbr_no_url']
    labels = [dataset_labels[ds] for ds in dataset_order]
    palette = [colors[label] for label in labels]

    # savefig does not create missing folders.
    os.makedirs('./plots', exist_ok=True)

    # Create individual plots for each metric
    for metric in metrics:
        plt.figure(figsize=(8, 7))

        # Create box plot with proper color palette
        ax = sns.boxplot(data=df, x='dataset', y=metric,
                         order=dataset_order,
                         hue='dataset', palette=palette, linewidth=1.5, legend=False)

        # Set all boxplot elements to black. Box patches live in ax.artists
        # on matplotlib < 3.5 and in ax.patches from 3.5 on, so cover both.
        for patch in [*ax.artists, *ax.patches]:
            patch.set_edgecolor('black')
            patch.set_linewidth(1.2)

        # Set whiskers, caps, and medians to black
        for line in ax.lines:
            line.set_color('black')
            line.set_linewidth(1.2)

        # Apply consistent formatting
        format_axis(ax, "", ylabel=metric.capitalize())

        # Update x-axis labels with better names
        ax.set_xticks(range(len(labels)))
        ax.set_xticklabels(labels, rotation=45, ha='right')

        # y-ticks to .3f
        ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.3f}'))

        # remove x-axis label
        ax.set_xlabel('')

        # Set appropriate y-axis limits based on metric type.
        # Compare by value: `is` on a string literal tests object identity.
        if metric == 'fpr':
            ax.set_ylim(0.05, 0.3)
        else:
            ax.set_ylim(0.7, 0.9)

        plt.tight_layout()
        print(f"Plotting {model} - {metric}")
        # Save the plot
        plt.savefig(f'./plots/{model}_{metric}.pdf', dpi=500, bbox_inches='tight')
        plt.show()

In [None]:
# Per-metric box plots for the BERT results
plot_metrics(df=bert_df, model='bert')

In [None]:
# Per-metric box plots for the Qwen results
plot_metrics(df=qwen_df, model='qwen')

In [None]:
def plot_model_comparison(bert_df: pd.DataFrame, qwen_df: pd.DataFrame):
    """Show one BERT-vs-Qwen box plot per metric.

    For each of 'accuracy', 'precision', 'recall', 'f1-macro' and
    'f1-micro', all rows of ``bert_df`` and all rows of ``qwen_df`` are
    drawn as two side-by-side boxes. The input frames are not modified.

    NOTE(review): rows are not filtered by dataset here. To compare the
    models on a single variant (e.g. 'No URL'), pass frames that were
    filtered beforehand.

    Args:
        bert_df: BERT results with one column per metric listed above.
        qwen_df: Qwen results with one column per metric listed above.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1-macro', 'f1-micro']

    # Tag each row with its model on copies, so the callers' frames do not
    # gain a 'model' column as a side effect.
    combined_df = pd.concat(
        [bert_df.assign(model='BERT'), qwen_df.assign(model='Qwen')],
        ignore_index=True,
    )

    # Define colors for models
    model_colors = {
        'BERT': '#2E86AB',
        'Qwen': '#A23B72'
    }
    model_order = ['BERT', 'Qwen']
    palette = [model_colors[name] for name in model_order]

    # Create individual plots for each metric
    for metric in metrics:
        plt.figure(figsize=(8, 7))

        # Create box plot
        ax = sns.boxplot(data=combined_df, x='model', y=metric,
                         order=model_order,
                         hue='model', palette=palette, linewidth=1.5, legend=False)

        # Set all boxplot elements to black. Box patches live in ax.artists
        # on matplotlib < 3.5 and in ax.patches from 3.5 on, so cover both.
        for patch in [*ax.artists, *ax.patches]:
            patch.set_edgecolor('black')
            patch.set_linewidth(1.2)

        # Set whiskers, caps, and medians to black
        for line in ax.lines:
            line.set_color('black')
            line.set_linewidth(1.2)

        # Format y-axis to 3 decimal places
        ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.3f}'))

        # Remove x-axis label
        ax.set_xlabel('')

        # Set appropriate y-axis limits
        ax.set_ylim(0.7, 0.9)

        plt.tight_layout()
        print(f"Plotting Model Comparison - {metric}")
        plt.show()

# BERT vs. Qwen comparison, one box plot per metric
plot_model_comparison(bert_df=bert_df, qwen_df=qwen_df)

# Análise dos resultados

## URLs
Como os tokenizadores dos modelos experimentados segmentam URLs em tokens? Existem diferenças entre eles?

In [None]:
%pip install tiktoken==0.9.0

In [None]:
from transformers import AutoTokenizer  
import tiktoken

# Compare how three tokenizers split the same news URL.
# For the Hugging Face tokenizers, `tokenize()` returns only the text pieces,
# while `encode()` may also add model-specific special tokens, so the ID list
# can be longer than the token list.
url = "https://g1.globo.com/mundo/noticia/2025/06/22/chanceler-do-ira-diz-que-eua-cruzaram-linha-vermelha-com-ataque-a-instalacoes-nucleares.ghtml"

print("URL de teste", url, "\n")

# QWEN3 0.6B tokenizer ==========================================================================================

# trust_remote_code is deliberately not enabled: it would let code shipped in
# the Hub repository run locally. NOTE(review): assumes this tokenizer is
# supported natively by the installed transformers version — confirm.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B')

# Tokenize your URL
tokens = tokenizer.tokenize(url)
token_ids = tokenizer.encode(url)

print("Qwen/Qwen3-0.6B")
print("Tokens:", tokens)
print("Token IDs:", token_ids)
print("Decoded back:", tokenizer.decode(token_ids))

# GPT4o tokenizer ==============================================================================================
enc = tiktoken.get_encoding("o200k_base")

gpt4o_token_ids = enc.encode(url)
# Decode each ID on its own to recover the text of every token.
gpt4o_tokens = [enc.decode([tid]) for tid in gpt4o_token_ids]

print("\nGPT-4o (tiktoken o200k_base)")
print("Tokens:", gpt4o_tokens)
print("Token IDs:", gpt4o_token_ids)
print("Decoded back:", enc.decode(gpt4o_token_ids))

# BERTimbau tokenizer ==========================================================================================
# As above, trust_remote_code is not enabled (same assumption to confirm).
# `tokenizer`, `tokens` and `token_ids` are reused and now refer to BERTimbau.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Tokenize your URL
tokens = tokenizer.tokenize(url)
token_ids = tokenizer.encode(url)

print("\nneuralmind/bert-base-portuguese-cased")
print("Tokens:", tokens)
print("Token IDs:", token_ids)
print("Decoded back:", tokenizer.decode(token_ids))
