# Machine Learning - Plots

### Libraries

In [135]:
from utils import *
from load_data import *
from process_data import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy.stats import linregress
import seaborn as sns

### Load Training Data

In [None]:
# Read the experiment log from the Excel workbook into a DataFrame.
path = "Training.xlsx"
df = pd.read_excel(io=path)
# Number of rows before any filtering (shown as the cell output).
df.shape[0]

In [None]:
# Keep only the runs whose Cross_Validation value is 5 (5-fold CV).
is_five_fold = df["Cross_Validation"].eq(5)
df = df.loc[is_five_fold]
# Remaining row count (cell output).
df.shape[0]

In [None]:
# Drop every run whose Model is "svc".
is_not_svc = df["Model"].ne("svc")
df = df.loc[is_not_svc]
# Remaining row count (cell output).
df.shape[0]

In [None]:
# Drop the runs balanced with "downsampling"; rows with a missing Balance
# compare as "not equal" and are therefore kept.
is_not_downsampled = df["Balance"].ne("downsampling")
df = df.loc[is_not_downsampled]
# Remaining row count (cell output).
df.shape[0]

In [140]:
# Write the filtered experiment log to a single-sheet workbook ("Data"),
# without the DataFrame index column.
df.to_excel(
    'ML Experiments/Training.xlsx',
    sheet_name='Data',
    index=False,
    engine='openpyxl',
)

### Análisis General

In [None]:
# Keep only the "Análisis General" experiments.
# .copy() makes this an independent frame: the following cells add and
# overwrite columns on it, which on a plain boolean-mask slice of `df`
# triggers SettingWithCopyWarning and is not guaranteed to take effect.
ageneral_df = df[df["Type"] == "analisis_general"].copy()
len(ageneral_df)

In [None]:
# Put the "Análisis General" runs in chronological order with a clean 0..n-1 index.
#  - The timestamps are parsed from ageneral_df itself (the original parsed the
#    whole of `df` and relied on index alignment to pick the matching rows).
#  - The chain builds a new frame instead of mutating a slice of `df` in place,
#    so re-running the cell is safe and no SettingWithCopyWarning is raised.
# Later cells use the resulting index both as x position and as row position,
# which is only valid because of the final reset_index(drop=True).
ageneral_df = (
    ageneral_df
    .assign(Timestamp=pd.to_datetime(ageneral_df['Timestamp']))
    .sort_values('Timestamp')
    .reset_index(drop=True)
)

In [None]:
# Use the chronological rank of each run (0, 1, 2, ...) as the regressor.
time_axis = np.arange(len(ageneral_df))
ageneral_df['Time'] = time_axis

# Ordinary least-squares fit of the weighted-average F1 against that rank;
# the fitted values are stored next to the data in the 'Trend' column.
fit = linregress(ageneral_df['Time'], ageneral_df['weighted avg_f1-score'])
slope, intercept, r_value, p_value, std_err = fit
ageneral_df['Trend'] = intercept + slope * ageneral_df['Time']

In [None]:
# Timestamps of the runs that delimit the analysis windows (drawn as vertical lines).
window_boundaries = pd.to_datetime(['2024-02-12 17:56:54', '2024-03-31 11:47:07', '2024-04-13 11:48:49', '2024-04-24 19:00:45', '2024-04-25 22:49:40', '2024-05-03 13:46:23', '2024-05-09 23:11:41', '2024-05-11 01:02:48', '2024-05-11 19:26:39', '2024-07-03 18:19:45','2024-07-08 12:27:11'])

# Rows of ageneral_df whose timestamp is one of the boundaries.
boundary_rows = ageneral_df.loc[ageneral_df['Timestamp'].isin(window_boundaries), 'Timestamp']

# Report boundaries that are not present in the data instead of silently
# ending up with fewer indices than dates.
missing_boundaries = window_boundaries.difference(pd.DatetimeIndex(boundary_rows))
if len(missing_boundaries) > 0:
    print(f"Warning: {len(missing_boundaries)} boundary timestamp(s) not found in ageneral_df: {list(missing_boundaries)}")

# Take BOTH the row indices and the tick dates from the same matched rows.
# The plotting cells zip specific_dates with specific_indices and use the dates
# as tick labels, so the two must have equal length and matching order; deriving
# the dates from the hard-coded list breaks that as soon as a timestamp is
# missing or duplicated in the data.
specific_indices = boundary_rows.index.tolist()
specific_dates = pd.DatetimeIndex(boundary_rows)
specific_indices

In [145]:
# Class name -> plot colour; `labels` and `colors` are the two parallel
# lists (same order) that the plotting cells index into.
class_colors = {
    'Comentario Positivo': 'darkgreen',
    'Comentario Negativo': 'darkred',
}
labels = list(class_colors)
colors = list(class_colors.values())

#### Average Metrics in Windows

In [None]:
# Metrics that are averaged inside every window.
columns_to_mean = ['Accuracy_Global', 'Std_Global', 'Time (s)', 'weighted avg_precision', 'weighted avg_recall', 'weighted avg_f1-score', 'Comentario Positivo_f1-score', 'Comentario Negativo_f1-score']

# A window spans from one boundary run up to (but not including) the next one.
# .iloc is used because .loc[start:end] is end-INCLUSIVE, which counted every
# boundary run in two consecutive windows and disagreed with the half-open
# [start:end) windows used for the per-window trend lines in the plots.
# Positions equal index labels here because ageneral_df was re-indexed 0..n-1.
for i in range(len(specific_indices) - 1):
    start_idx = specific_indices[i]
    end_idx = specific_indices[i + 1]

    window_data = ageneral_df.iloc[start_idx:end_idx]
    means = window_data[columns_to_mean].mean().round(3)
    print(f'W{i + 1} Average metrics:')
    print(means)
    print()

#### F1 Score

In [None]:
from matplotlib.dates import DateFormatter

# Weighted-average F1 of every "Análisis General" run in chronological order,
# with the window boundaries and a global linear trend; saved as PDF and PNG.
plt.figure(figsize=(24, 10))
ax = plt.gca()

# One x tick per window boundary, labelled with the boundary's date.
tick_labels = [date.strftime('%d-%m-%Y') for date in specific_dates]
plt.xticks(ticks=specific_indices, labels=tick_labels, rotation=90, fontsize=22)

# Dashed vertical line at every boundary; the boundaries at rows 235 and 254
# (the "best window" according to the original comment) get a highlight colour.
for date, index in zip(specific_dates, specific_indices):
    boundary_color = '#fd7b6e' if index in [235, 254] else '#1a2e49'
    plt.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

# The measured scores.
run_positions = ageneral_df.index
weighted_f1 = ageneral_df['weighted avg_f1-score']
plt.plot(run_positions, weighted_f1, linewidth=6, linestyle='-', marker='o', markersize=14, color='black', alpha=0.7, label='Weighted Avg. F1-Score')

# Least-squares trend over all runs; R² is reported in the legend.
slope, intercept, r_value, p_value, std_err = linregress(run_positions, weighted_f1)
trend = intercept + slope * run_positions
plt.plot(run_positions, trend, linewidth=2, color='darkred', linestyle='-', alpha=0.5, label=f'Trend Line (R² = {r_value**2:.2f})')

# Legend centred at the top of the axes.
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=2, fontsize=22)

# Title, y axis and horizontal grid.
plt.title('Weighted Avg. F1-Score for "Análisis General"', fontsize=42)
plt.ylabel('F1-Score', fontsize=30)
ax.set_ylim(0.3, 1)
ax.tick_params(axis='y', labelsize=22)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Hide the axes frame.
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

plt.tight_layout()

# Export before show(), while the figure is still current.
plt.savefig("../../../IMAGES/Análisis General/ML/Experiments in _Análisis General_ over time.pdf", format='pdf')
plt.savefig("../../../IMAGES/Análisis General/ML/Experiments in _Análisis General_ over time.png", format='png', transparent=True)

plt.show()


In [None]:
from matplotlib.dates import DateFormatter

# Per-class F1 of every "Análisis General" run in chronological order, with one
# global linear trend per class; saved as PDF and PNG.
plt.figure(figsize=(24, 12))
ax = plt.gca()

# One x tick per window boundary, labelled with the boundary's date.
tick_labels = [date.strftime('%d-%m-%Y') for date in specific_dates]
plt.xticks(ticks=specific_indices, labels=tick_labels, rotation=90, fontsize=22)

# Dashed vertical line at every boundary; rows 235 and 254 get a highlight colour.
for date, index in zip(specific_dates, specific_indices):
    boundary_color = '#fd7b6e' if index in [235, 254] else '#1a2e49'
    plt.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

run_positions = ageneral_df.index
negative_f1 = ageneral_df['Comentario Negativo_f1-score']
positive_f1 = ageneral_df['Comentario Positivo_f1-score']

# Measured scores (negative first, so the legend keeps the original order).
series_style = dict(linewidth=6, linestyle='-', marker='o', markersize=14, alpha=0.5)
plt.plot(run_positions, negative_f1, color=colors[1], label='Comentario Negativo', **series_style)
plt.plot(run_positions, positive_f1, color=colors[0], label='Comentario Positivo', **series_style)

# Least-squares trend of the negative class.
slope_neg, intercept_neg, r_value_neg, p_value_neg, std_err_neg = linregress(run_positions, negative_f1)
trend_neg = intercept_neg + slope_neg * run_positions
plt.plot(run_positions, trend_neg, linewidth=2, color='red', linestyle='-', alpha=0.5, label=f'Trend Line Negativo (R² = {r_value_neg**2:.2f})')

# Least-squares trend of the positive class.
slope_pos, intercept_pos, r_value_pos, p_value_pos, std_err_pos = linregress(run_positions, positive_f1)
trend_pos = intercept_pos + slope_pos * run_positions
plt.plot(run_positions, trend_pos, linewidth=2, color='green', linestyle='-', alpha=0.5, label=f'Trend Line Positivo (R² = {r_value_pos**2:.2f})')

plt.legend(loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=4, fontsize=22)

# Title, y axis and horizontal grid.
plt.title('F1-Score for "Análisis General" per Class', fontsize=42)
plt.ylabel('F1-Score', fontsize=30)
ax.set_ylim(0.25, 1.1)
ax.tick_params(axis='y', labelsize=18)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Hide the axes frame.
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

plt.tight_layout()

# Export before show(), while the figure is still current.
plt.savefig("../../../IMAGES/Análisis General/ML/Experiments in _Análisis General_ over time_classes_trendline2.pdf", format='pdf')
plt.savefig("../../../IMAGES/Análisis General/ML/Experiments in _Análisis General_ over time_classes_trendline2.png", format='png', transparent=True)

plt.show()


In [None]:
# One panel per class: the class F1 over time, the global weighted-average
# trend (same line on every panel) and one linear trend per window.
fig, axes = plt.subplots(len(labels), 1, figsize=(24, len(labels) * 8), sharex=True)

run_positions = ageneral_df.index

# The weighted-average trend does not depend on the class, so it is fitted
# once here instead of once per panel.
overall_fit = linregress(run_positions, ageneral_df['weighted avg_f1-score'])
overall_trend = overall_fit.intercept + overall_fit.slope * run_positions
overall_label = f'Trend Line (R² = {overall_fit.rvalue**2:.2f})'

for i, label in enumerate(labels):
    ax = axes[i]
    class_f1 = ageneral_df[f'{label}_f1-score']

    # Measured F1 of this class.
    ax.plot(run_positions, class_f1,
            linewidth=4, linestyle='-', marker='o', markersize=9, color=colors[i], alpha=0.5)

    # Global weighted-average trend, for reference.
    ax.plot(run_positions, overall_trend, linewidth=2, color='red', linestyle='-', alpha=0.3, label=overall_label)

    # Per-window trend over the half-open window [start_idx, end_idx).
    for j in range(len(specific_indices) - 1):
        start_idx = specific_indices[j]
        end_idx = specific_indices[j + 1]

        # A line needs at least two points; linregress raises otherwise.
        if end_idx - start_idx < 2:
            continue

        window_indices = run_positions[start_idx:end_idx]
        # .iloc makes the positional slicing explicit (positions equal labels
        # because the frame was re-indexed 0..n-1).
        slope, intercept, r_value, p_value, std_err = linregress(window_indices, class_f1.iloc[start_idx:end_idx])
        trend = intercept + slope * window_indices
        # Windows are numbered from 1, matching the "Average Metrics in Windows" output
        # (the original label started at W0).
        ax.plot(window_indices, trend, linewidth=2, color=colors[i], linestyle='-', alpha=0.5, label=f'Trend Line W{j + 1} (R² = {r_value**2:.2f})')

    ax.set_title(label, fontsize=42)
    ax.set_ylabel('F1-Score', fontsize=30)
    ax.set_ylim(0.25, 1)
    ax.tick_params(axis='y', labelsize=22)

    # Horizontal grid only.
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Hide the axes frame.
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)

    # Dashed vertical line at every boundary; rows 235 and 254 get a highlight colour.
    for date, index in zip(specific_dates, specific_indices):
        boundary_color = '#fd7b6e' if index in [235, 254] else '#1a2e49'
        ax.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

# The x axis is shared, so only the bottom panel carries the date labels.
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=22)

# Leave a small margin around the panels.
plt.tight_layout(rect=[0.03, 0.03, 1, 0.97])

# Export before show(), while the figure is still current.
plt.savefig("../../../IMAGES/Análisis General/ML/Experiments in _Análisis General_ over time_classes_trendline.pdf", format='pdf')

plt.show()


#### Balance

In [None]:
# Balancing techniques to show, one panel each (after the "no balancing" panel).
# NOTE(review): rows with Balance == 'downsampling' are removed from `df` in the
# loading section, so that panel is expected to be empty — confirm it should stay.
balances = ['downsampling', 'upsampling', 'smote', 'adasyn']

# (panel title, row mask) for every panel.
# Runs without balancing are stored as NaN, but the "Performance (Time)" cell
# replaces those NaN with the string 'None'; accepting both keeps the first
# panel populated no matter which of the two cells was executed last.
balance_col = ageneral_df['Balance']
no_balance_mask = balance_col.isna() | balance_col.eq('None')
panels = [('F1-Score for None', no_balance_mask)]
panels += [(f'F1-Score for "{balance}"', balance_col.eq(balance)) for balance in balances]

# (column, legend label, line style) of the three series drawn on every panel.
series_specs = [
    ('weighted avg_f1-score', 'Weighted Avg. F1-Score', dict(linewidth=3, markersize=7, color='black')),
    ('Comentario Positivo_f1-score', 'Comentario Positivo', dict(linewidth=4, markersize=9, color=colors[0])),
    ('Comentario Negativo_f1-score', 'Comentario Negativo', dict(linewidth=4, markersize=9, color=colors[1])),
]

# Create subplots (figure height kept as in the original layout).
fig, axes = plt.subplots(len(panels), 1, figsize=(24, len(balances) * 4), sharex=True)

# Handles and labels for the single figure-level legend.
lines = []
labels_legend = []

for panel_no, (ax, (panel_title, row_mask)) in enumerate(zip(axes, panels)):
    panel_runs = ageneral_df[row_mask]

    for column, legend_label, line_style in series_specs:
        line, = ax.plot(panel_runs.index, panel_runs[column],
                        linestyle='-', marker='o', alpha=0.5, label=legend_label, **line_style)
        # The styles are identical on every panel, so the first panel's
        # handles are enough for the shared legend.
        if panel_no == 0:
            lines.append(line)
            labels_legend.append(legend_label)

    ax.set_title(panel_title, fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # Dashed vertical line at every boundary; rows 235 and 254 get a highlight colour.
    for date, index in zip(specific_dates, specific_indices):
        boundary_color = '#fd7b6e' if index in [235, 254] else '#1a2e49'
        ax.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

# The x axis is shared, so only the bottom panel carries the date labels.
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Common x label for the whole figure.
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Single legend for all subplots.
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=3, fontsize=14)

# Leave room for the figure-level legend and x label.
plt.tight_layout(rect=[0.03, 0.03, 1, 0.95])
plt.show()


#### Embedding

In [None]:
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# Embeddings to show, one panel each.
embeddings = ['fasttext', 'word2vec', 'bow', 'tfidf', 'roberta', 'bert-multi', 'beto-cased', 'beto-uncased', 'xlm-roberta-base', 'text-embedding-3-large']

fig, axes = plt.subplots(len(embeddings), 1, figsize=(24, len(embeddings) * 4), sharex=True)

# Handles and labels for the single figure-level legend.
lines = []
labels_legend = []

# Style shared by the three series of every panel.
shared_style = dict(linestyle='-', marker='o', alpha=0.5)

for i, embedding in enumerate(embeddings):
    ax = axes[i]
    # Runs that used this embedding; their index is the chronological position.
    embedding_runs = ageneral_df[ageneral_df['Embedding_Name'] == embedding]
    x_positions = embedding_runs.index

    # Weighted average and per-class F1 of those runs.
    line1, = ax.plot(x_positions, embedding_runs['weighted avg_f1-score'],
                     linewidth=3, markersize=7, color='black', label='Weighted Avg. F1-Score', **shared_style)
    line2, = ax.plot(x_positions, embedding_runs['Comentario Positivo_f1-score'],
                     linewidth=4, markersize=9, color=colors[0], label='Comentario Positivo', **shared_style)
    line3, = ax.plot(x_positions, embedding_runs['Comentario Negativo_f1-score'],
                     linewidth=4, markersize=9, color=colors[1], label='Comentario Negativo', **shared_style)

    ax.set_title(f'F1-Score for "{embedding}"', fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # The first panel's handles are enough for the shared legend.
    if i == 0:
        lines.extend([line1, line2, line3])
        labels_legend.extend(['Weighted Avg. F1-Score', 'Comentario Positivo', 'Comentario Negativo'])

    # Dashed vertical line at every boundary; rows 235 and 254 get a highlight colour.
    for date, index in zip(specific_dates, specific_indices):
        boundary_color = '#fd7b6e' if index in [235, 254] else '#1a2e49'
        ax.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

# The x axis is shared, so only the bottom panel carries the date labels.
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Common x label for the whole figure.
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Single legend for all subplots.
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=3, fontsize=14)

# Leave room for the figure-level legend and x label.
plt.tight_layout(rect=[0.03, 0.03, 1, 0.97])
plt.show()


#### Model

In [None]:
# Models to show, one panel each.
models = ['random_forest', 'logistic_regression', 'xgboost', 'mlp', 'naive_bayes']

fig, axes = plt.subplots(len(models), 1, figsize=(24, len(models) * 4), sharex=True)

# Handles and labels for the single figure-level legend.
lines = []
labels_legend = []

# Style shared by the three series of every panel.
shared_style = dict(linestyle='-', marker='o', alpha=0.5)

for i, model in enumerate(models):
    ax = axes[i]
    # Runs that used this model; their index is the chronological position.
    model_runs = ageneral_df[ageneral_df['Model'] == model]
    x_positions = model_runs.index

    # Weighted average and per-class F1 of those runs.
    line1, = ax.plot(x_positions, model_runs['weighted avg_f1-score'],
                     linewidth=3, markersize=7, color='black', label='Weighted Avg. F1-Score', **shared_style)
    line2, = ax.plot(x_positions, model_runs['Comentario Positivo_f1-score'],
                     linewidth=4, markersize=9, color=colors[0], label='Comentario Positivo', **shared_style)
    line3, = ax.plot(x_positions, model_runs['Comentario Negativo_f1-score'],
                     linewidth=4, markersize=9, color=colors[1], label='Comentario Negativo', **shared_style)

    ax.set_title(f'F1-Score for "{model}"', fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # The first panel's handles are enough for the shared legend.
    if i == 0:
        lines.extend([line1, line2, line3])
        labels_legend.extend(['Weighted Avg. F1-Score', 'Comentario Positivo', 'Comentario Negativo'])

    # Dashed vertical line at every boundary; rows 235 and 254 get a highlight colour.
    for date, index in zip(specific_dates, specific_indices):
        boundary_color = '#fd7b6e' if index in [235, 254] else '#1a2e49'
        ax.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

# The x axis is shared, so only the bottom panel carries the date labels.
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Common x label for the whole figure.
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Single legend for all subplots.
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=3, fontsize=14)

# Leave room for the figure-level legend and x label.
plt.tight_layout(rect=[0.03, 0.03, 1, 0.95])
plt.show()


#### Performance (Time)

In [None]:
from matplotlib.gridspec import GridSpec

# Label the runs without balancing as 'None' so they get their own bar.
# Plain assignment replaces the original `ageneral_df['Balance'].fillna(..., inplace=True)`:
# in-place fillna on a column selection is deprecated in recent pandas and
# does not modify the frame when copy-on-write is enabled.
ageneral_df['Balance'] = ageneral_df['Balance'].fillna('None')

# Set Seaborn style
sns.set(style="whitegrid")

# Common y-axis upper limit for the three panels: mean + 3 standard deviations
# of the run time, so a few very slow runs do not flatten the bars.
y_max = ageneral_df['Time (s)'].mean() + 3 * ageneral_df['Time (s)'].std()

# 2x2 grid: models (top left), balancing (top right), embeddings (full bottom row).
fig = plt.figure(figsize=(24, 16))
gs = GridSpec(2, 2, figure=fig, width_ratios=[1, 1], height_ratios=[1, 1])


def _plot_mean_time(ax, column, title, xlabel, x_rotation=None):
    """Draw the mean 'Time (s)' per category of `column` with a 95% CI on `ax`.

    ax         -- matplotlib Axes to draw on.
    column     -- name of the categorical column of ageneral_df used for the bars.
    title      -- panel title.
    xlabel     -- x-axis label.
    x_rotation -- rotation in degrees of the x tick labels; None leaves the default.
    """
    # NOTE(review): `errwidth` and `palette` without `hue` are reported as
    # deprecated by seaborn 0.13 — confirm against the installed version.
    sns.barplot(ax=ax, x=column, y='Time (s)', data=ageneral_df, palette='crest_r', errorbar=('ci', 95), errwidth=1.5, capsize=0.2, alpha=0.7)
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(xlabel, fontsize=16)
    ax.set_ylabel('Average Time (s)', fontsize=16)
    if x_rotation is None:
        ax.tick_params(axis='x', labelsize=14)
    else:
        ax.tick_params(axis='x', labelsize=14, rotation=x_rotation)
    ax.tick_params(axis='y', labelsize=14)
    ax.set_ylim(0, y_max)
    ax.grid(axis='y', linestyle='--', alpha=0.7)


# Plot for MODELS
ax1 = fig.add_subplot(gs[0, 0])
_plot_mean_time(ax1, 'Model', 'Average Performance Time for Different Models with 95% CI', 'Model')

# Plot for BALANCE
ax2 = fig.add_subplot(gs[0, 1])
_plot_mean_time(ax2, 'Balance', 'Average Performance Time for Different Balancing Techniques with 95% CI', 'Balance')

# Plot for EMBEDDINGS (long names, so the tick labels are rotated)
ax3 = fig.add_subplot(gs[1, :])
_plot_mean_time(ax3, 'Embedding_Name', 'Average Performance Time for Different Embeddings with 95% CI', 'Embedding', x_rotation=45)

# Adjust layout to avoid overlap
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the quality metrics
# and the run time of the "Análisis General" experiments.
metric_columns = [
    'weighted avg_f1-score',
    'Comentario Positivo_f1-score',
    'Comentario Negativo_f1-score',
    'Accuracy_Global',
    'Std_Global',
    'Time (s)',
]
correlation_matrix = ageneral_df[metric_columns].corr()

sns.set(style="whitegrid")

# Annotated heatmap, one cell per metric pair.
plt.figure(figsize=(12, 10))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='crest',
    linewidths=0.5,
    linecolor='black',
)

plt.title('Correlation Matrix of Performance Metrics', fontsize=20)

plt.tight_layout()
plt.show()


#### Confusion Matrix - Best Model

In [None]:
from sklearn.metrics import confusion_matrix


# Confusion-matrix data for the best model '1bd4b955-1c5e-45a9-9259-101c5218e230',
# rebuilt from its counts as two aligned label lists.
class_names = ['Comentario Positivo', 'Comentario Negativo']
positive, negative = class_names

y_true = [positive] * 257 + [negative] * 73
y_pred = (
    [positive] * 247 + [negative] * 10   # predictions for the 257 true positives
    + [positive] * 8 + [negative] * 65   # predictions for the 73 true negatives
)

# Build the confusion matrix (rows: true label, columns: predicted label).
cm = confusion_matrix(y_true, y_pred, labels=class_names)

# Plot the confusion matrix as an annotated heatmap without colour bar.
plt.figure(figsize=(24, 18))

sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='crest',
    xticklabels=class_names,
    yticklabels=class_names,
    annot_kws={"size": 52},
    cbar=False,
)

plt.xlabel('Predicted label', fontsize=30)
plt.ylabel('True label', fontsize=30)
plt.xticks(rotation=0, fontsize=50)
plt.yticks(rotation=0, fontsize=50)

plt.tight_layout()

# Export before show(), while the figure is still current.
plt.savefig("../../../IMAGES/Análisis General/ML/Experiments in _Análisis General_ best_model.pdf", format='pdf')
plt.savefig("../../../IMAGES/Análisis General/ML/Experiments in _Análisis General_ best_model.png", format='png', transparent=True)

plt.show()


### Contenido Negativo

In [None]:
# Keep only the "Contenido Negativo" experiments.
# .copy() makes this an independent frame: the following cells add and
# overwrite columns on it, which on a plain boolean-mask slice of `df`
# triggers SettingWithCopyWarning and is not guaranteed to take effect.
cnegativo_df = df[df["Type"] == "contenido_negativo"].copy()
len(cnegativo_df)

In [None]:
# Put the "Contenido Negativo" runs in chronological order with a clean 0..n-1 index.
#  - The timestamps are parsed from cnegativo_df itself (the original parsed the
#    whole of `df` and relied on index alignment to pick the matching rows).
#  - The chain builds a new frame instead of mutating a slice of `df` in place,
#    so re-running the cell is safe and no SettingWithCopyWarning is raised.
# Later cells use the resulting index both as x position and as row position,
# which is only valid because of the final reset_index(drop=True).
cnegativo_df = (
    cnegativo_df
    .assign(Timestamp=pd.to_datetime(cnegativo_df['Timestamp']))
    .sort_values('Timestamp')
    .reset_index(drop=True)
)

In [None]:
# Use the chronological rank of each run (0, 1, 2, ...) as the regressor.
time_axis = np.arange(len(cnegativo_df))
cnegativo_df['Time'] = time_axis

# Ordinary least-squares fit of the weighted-average F1 against that rank;
# the fitted values are stored next to the data in the 'Trend' column.
fit = linregress(cnegativo_df['Time'], cnegativo_df['weighted avg_f1-score'])
slope, intercept, r_value, p_value, std_err = fit
cnegativo_df['Trend'] = intercept + slope * cnegativo_df['Time']

In [None]:
# Timestamps of the runs that delimit the analysis windows (drawn as vertical lines).
window_boundaries = pd.to_datetime(['2024-02-24 11:30:09', '2024-04-08 22:47:39' , '2024-04-28 01:18:11' , '2024-05-02 23:45:54', '2024-07-04 08:51:46', '2024-07-09 12:06:38'])

# Rows of cnegativo_df whose timestamp is one of the boundaries.
boundary_rows = cnegativo_df.loc[cnegativo_df['Timestamp'].isin(window_boundaries), 'Timestamp']

# Report boundaries that are not present in the data instead of silently
# ending up with fewer indices than dates.
missing_boundaries = window_boundaries.difference(pd.DatetimeIndex(boundary_rows))
if len(missing_boundaries) > 0:
    print(f"Warning: {len(missing_boundaries)} boundary timestamp(s) not found in cnegativo_df: {list(missing_boundaries)}")

# Take BOTH the row indices and the tick dates from the same matched rows.
# The plotting cells zip specific_dates with specific_indices and use the dates
# as tick labels, so the two must have equal length and matching order; deriving
# the dates from the hard-coded list breaks that as soon as a timestamp is
# missing or duplicated in the data.
specific_indices = boundary_rows.index.tolist()
specific_dates = pd.DatetimeIndex(boundary_rows)
specific_indices

In [160]:
# Class name -> plot colour; `labels` and `colors` are the two parallel
# lists (same order) that the plotting cells index into.
class_colors = {
    'Insultos': '#77E4C8',
    'Desprestigiar Acto': '#36C2CE',
    'Desprestigiar Víctima': '#478CCF',
    'Desprestigiar Deportista Autora': '#4535C1',
}
labels = list(class_colors)
colors = list(class_colors.values())

#### Average Metrics in Windows

In [None]:
# Metrics that are averaged inside every window.
columns_to_mean = ['Accuracy_Global', 'Std_Global', 'Time (s)', 'weighted avg_precision', 'weighted avg_recall', 'weighted avg_f1-score', 'Insultos_f1-score', 'Desprestigiar Acto_f1-score', 'Desprestigiar Víctima_f1-score', 'Desprestigiar Deportista Autora_f1-score']

# A window spans from one boundary run up to (but not including) the next one.
# .iloc is used because .loc[start:end] is end-INCLUSIVE, which counted every
# boundary run in two consecutive windows and disagreed with the half-open
# [start:end) windows used for the per-window trend lines in the plots.
# Positions equal index labels here because cnegativo_df was re-indexed 0..n-1.
for i in range(len(specific_indices) - 1):
    start_idx = specific_indices[i]
    end_idx = specific_indices[i + 1]

    window_data = cnegativo_df.iloc[start_idx:end_idx]
    means = window_data[columns_to_mean].mean().round(3)
    print(f'W{i + 1} Average metrics:')
    print(means)
    print()

####  F1-Score

In [None]:
from matplotlib.dates import DateFormatter

# Weighted-average F1 of every "Contenido Negativo" run in chronological order,
# with the window boundaries and a global linear trend; saved as PDF.
plt.figure(figsize=(24, 10))
ax = plt.gca()

# One x tick per window boundary, labelled with the boundary's date.
tick_labels = [date.strftime('%d-%m-%Y') for date in specific_dates]
plt.xticks(ticks=specific_indices, labels=tick_labels, rotation=90, fontsize=22)

# Dashed vertical line at every boundary; rows 133 and 157 get a highlight colour.
# NOTE(review): the other "Contenido Negativo" plots highlight rows [160, 184]
# instead of [133, 157] — confirm which pair delimits the best window.
for date, index in zip(specific_dates, specific_indices):
    boundary_color = '#fd7b6e' if index in [133, 157] else '#1a2e49'
    plt.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

# The measured scores.
run_positions = cnegativo_df.index
weighted_f1 = cnegativo_df['weighted avg_f1-score']
plt.plot(run_positions, weighted_f1, linewidth=6, linestyle='-', marker='o', markersize=14, color='black', alpha=0.7, label='Weighted Avg. F1-Score')

# Least-squares trend over all runs; R² is reported in the legend.
slope, intercept, r_value, p_value, std_err = linregress(run_positions, weighted_f1)
trend = intercept + slope * run_positions
plt.plot(run_positions, trend, linewidth=2, color='darkred', linestyle='-', alpha=0.5, label=f'Trend Line (R² = {r_value**2:.2f})')

# Legend centred at the top of the axes.
plt.legend(loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=2, fontsize=22)

# Title, y axis and horizontal grid.
plt.title('Weighted Avg. F1-Score for "Contenido Negativo"', fontsize=42)
plt.ylabel('F1-Score', fontsize=30)
ax.set_ylim(0.3, 0.7)
ax.tick_params(axis='y', labelsize=22)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Hide the axes frame.
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

plt.tight_layout()

# Export before show(), while the figure is still current.
plt.savefig("../../../IMAGES/Contenido Negativo/ML/Experiments in _ContenidoNegativo_ over time.pdf", format='pdf')

plt.show()


In [None]:
from matplotlib.dates import DateFormatter

# Per-class F1 of every "Contenido Negativo" run in chronological order.
plt.figure(figsize=(24, 8))

# One x tick per window boundary, labelled with the boundary's date.
plt.xticks(ticks=specific_indices, labels=[date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Dashed vertical line at every boundary; rows 160 and 184 get a highlight colour.
# NOTE(review): the weighted-average plot of this section highlights rows
# [133, 157] instead — confirm which pair delimits the best window.
for date, index in zip(specific_dates, specific_indices):
    boundary_color = '#fd7b6e' if index in [160, 184] else '#1a2e49'
    plt.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

# One series per class; class_names[k] is drawn with colors[k].
class_names = ['Insultos', 'Desprestigiar Acto', 'Desprestigiar Víctima', 'Desprestigiar Deportista Autora']
for class_name, class_color in zip(class_names, colors):
    plt.plot(cnegativo_df.index, cnegativo_df[f'{class_name}_f1-score'],
             linewidth=6, linestyle='-', marker='o', markersize=14, color=class_color, alpha=0.5, label=class_name)

plt.legend(loc='lower right', fontsize=18)

# Title and axis labels. Only per-class F1 series are drawn here, so the y axis
# is labelled 'F1-Score' (it was mislabelled 'Weighted Avg. F1-Score').
plt.title('F1-Score for "Contenido Negativo" over time per class', fontsize=30)
plt.xlabel('Date', fontsize=18)
plt.ylabel('F1-Score', fontsize=18)

# Full grid for readability.
plt.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# One panel per class: the class F1 over time, the global weighted-average
# trend (same line on every panel) and one linear trend per window.
fig, axes = plt.subplots(len(labels), 1, figsize=(24, len(labels) * 8), sharex=True)

run_positions = cnegativo_df.index

# The weighted-average trend does not depend on the class, so it is fitted
# once here instead of once per panel.
overall_fit = linregress(run_positions, cnegativo_df['weighted avg_f1-score'])
overall_trend = overall_fit.intercept + overall_fit.slope * run_positions
overall_label = f' W. Avg. Trend Line (R² = {overall_fit.rvalue**2:.2f})'

for i, label in enumerate(labels):
    ax = axes[i]
    class_f1 = cnegativo_df[f'{label}_f1-score']

    # Measured F1 of this class.
    ax.plot(run_positions, class_f1,
            linewidth=6, linestyle='-', marker='o', markersize=14, color=colors[i], alpha=0.5)

    # Global weighted-average trend, for reference.
    ax.plot(run_positions, overall_trend, linewidth=2, color='red', linestyle='-', alpha=0.3, label=overall_label)

    # Per-window trend over the half-open window [start_idx, end_idx).
    for j in range(len(specific_indices) - 1):
        start_idx = specific_indices[j]
        end_idx = specific_indices[j + 1]

        # A line needs at least two points; linregress raises otherwise.
        if end_idx - start_idx < 2:
            continue

        window_indices = run_positions[start_idx:end_idx]
        # .iloc makes the positional slicing explicit (positions equal labels
        # because the frame was re-indexed 0..n-1).
        slope, intercept, r_value, p_value, std_err = linregress(window_indices, class_f1.iloc[start_idx:end_idx])
        trend = intercept + slope * window_indices
        ax.plot(window_indices, trend, linewidth=2, color=colors[i], linestyle='-', alpha=0.5, label=f'Trend Line W{j+1} (R² = {r_value**2:.2f})')

    ax.set_title(label, fontsize=42)
    ax.set_ylabel('F1-Score', fontsize=30)
    ax.tick_params(axis='y', labelsize=22)

    # Horizontal grid only.
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Hide the axes frame.
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)

    # y-axis limits. The original also called set_ylim(0.25, 1) earlier in the
    # loop, but this call overrode it, so only the effective limits are kept.
    ax.set_ylim(-0.05, 0.84)

    # Dashed vertical line at every boundary; rows 160 and 184 get a highlight colour.
    for date, index in zip(specific_dates, specific_indices):
        boundary_color = '#fd7b6e' if index in [160, 184] else '#1a2e49'
        ax.axvline(x=index, color=boundary_color, linestyle='--', linewidth=2)

# The x axis is shared, so only the bottom panel carries the date labels.
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Leave a small margin around the panels.
plt.tight_layout(rect=[0.03, 0.03, 1, 0.97])

# Export before show(), while the figure is still current.
plt.savefig("../../../IMAGES/Contenido Negativo/ML/Experiments in _ContenidoNegativo_ over time_ perclass_ trendline.pdf", format='pdf')
plt.savefig("../../../IMAGES/Contenido Negativo/ML/Experiments in _ContenidoNegativo_ over time_ perclass_ trendline.png", format='png', transparent=True)

plt.show()


#### Balance

In [None]:
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# Balancing strategies to compare; runs with no balancing (NaN in 'Balance') get the first panel
balances = ['downsampling', 'upsampling', 'smote', 'adasyn']

# One panel for "None" plus one per strategy, stacked vertically on a shared x-axis
fig, axes = plt.subplots(len(balances)+1, 1, figsize=(24, len(balances) * 4), sharex=True)

# (title, row mask) of every panel, in drawing order
panels = [('F1-Score for None', pd.isna(cnegativo_df['Balance']))]
panels += [(f'F1-Score for "{balance}"', cnegativo_df['Balance'] == balance) for balance in balances]

# Handles and labels for the single figure-level legend
lines = []
labels_legend = []

for ax, (panel_title, row_mask) in zip(axes, panels):
    # Filter once per panel instead of once per curve
    subset = cnegativo_df[row_mask]

    # Weighted-average curve first, then the four per-class curves.
    # line1..line5 stay module-level names so other cells that read them keep working.
    line1, = ax.plot(subset.index, subset['weighted avg_f1-score'],
                     linewidth=3, linestyle='-', marker='o', markersize=7, color='black', alpha=0.5,
                     label='Weighted Avg. F1-Score')
    line2, line3, line4, line5 = [
        ax.plot(subset.index, subset[f'{labels[k]}_f1-score'],
                linewidth=4, linestyle='-', marker='o', markersize=9, color=colors[k], alpha=0.5,
                label=labels[k])[0]
        for k in range(4)
    ]

    # The legend entries are taken from the first panel only
    if not lines:
        lines.extend([line1, line2, line3, line4, line5])
        labels_legend.extend(['Weighted Avg. F1-Score', labels[0], labels[1], labels[2], labels[3]])

    ax.set_title(panel_title, fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # One dashed separator per boundary index, drawn a single time; the best window is highlighted.
    # NOTE(review): the per-class trend figure for this same data highlights [160, 184] instead of
    # [133, 157] - confirm which pair marks the best window.
    for _, index in zip(specific_dates, specific_indices):
        separator_color = '#fd7b6e' if index in [133, 157] else '#1a2e49'
        ax.axvline(x=index, color=separator_color, linestyle='--', linewidth=2)

# Set x-ticks for the last subplot (the x-axis is shared, so only the bottom panel is labelled)
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Set common labels
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Add a single legend for all subplots
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=5, fontsize=14)

# Adjust layout, leaving room for the legend at the top and the 'Date' label at the bottom
plt.tight_layout(rect=[0.03, 0.03, 1, 0.95])
plt.show()


#### Embedding

In [None]:
# Embeddings to compare, one panel each
embeddings = ['fasttext', 'word2vec', 'bow', 'tfidf', 'roberta', 'bert-multi', 'beto-cased', 'beto-uncased', 'xlm-roberta-base', 'text-embedding-3-large']

# Stacked panels on a shared x-axis
fig, axes = plt.subplots(len(embeddings), 1, figsize=(24, len(embeddings) * 4), sharex=True)

# Handles and labels for the single figure-level legend
lines = []
labels_legend = []

for ax, embedding in zip(axes, embeddings):
    # Filter once per panel instead of once per curve
    subset = cnegativo_df[cnegativo_df['Embedding_Name'] == embedding]

    # Weighted-average curve first, then the four per-class curves.
    # line1..line5 stay module-level names so other cells that read them keep working.
    line1, = ax.plot(subset.index, subset['weighted avg_f1-score'],
                     linewidth=3, linestyle='-', marker='o', markersize=7, color='black', alpha=0.5,
                     label='Weighted Avg. F1-Score')
    line2, line3, line4, line5 = [
        ax.plot(subset.index, subset[f'{labels[k]}_f1-score'],
                linewidth=4, linestyle='-', marker='o', markersize=9, color=colors[k], alpha=0.5,
                label=labels[k])[0]
        for k in range(4)
    ]

    # The legend entries are taken from the first panel only
    if not lines:
        lines.extend([line1, line2, line3, line4, line5])
        labels_legend.extend(['Weighted Avg. F1-Score', labels[0], labels[1], labels[2], labels[3]])

    ax.set_title(f'F1-Score for "{embedding}"', fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # One dashed separator per boundary index, drawn a single time; the best window is highlighted
    for _, index in zip(specific_dates, specific_indices):
        separator_color = '#fd7b6e' if index in [133, 157] else '#1a2e49'
        ax.axvline(x=index, color=separator_color, linestyle='--', linewidth=2)

# Set x-ticks for the last subplot (the x-axis is shared, so only the bottom panel is labelled)
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Set common labels
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Add a single legend for all subplots
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=5, fontsize=14)

# Adjust layout, leaving room for the legend at the top and the 'Date' label at the bottom
plt.tight_layout(rect=[0.03, 0.03, 1, 0.97])
plt.show()


#### Model

In [None]:
# Classifiers to compare, one panel each
models = ['random_forest', 'logistic_regression', 'xgboost', 'mlp', 'naive_bayes']

# Stacked panels on a shared x-axis
fig, axes = plt.subplots(len(models), 1, figsize=(24, len(models) * 4), sharex=True)

# Handles and labels for the single figure-level legend
lines = []
labels_legend = []

for ax, model in zip(axes, models):
    # Filter once per panel instead of once per curve
    subset = cnegativo_df[cnegativo_df['Model'] == model]

    # Weighted-average curve first, then the four per-class curves.
    # line1..line5 stay module-level names so other cells that read them keep working.
    line1, = ax.plot(subset.index, subset['weighted avg_f1-score'],
                     linewidth=3, linestyle='-', marker='o', markersize=7, color='black', alpha=0.5,
                     label='Weighted Avg. F1-Score')
    line2, line3, line4, line5 = [
        ax.plot(subset.index, subset[f'{labels[k]}_f1-score'],
                linewidth=4, linestyle='-', marker='o', markersize=9, color=colors[k], alpha=0.5,
                label=labels[k])[0]
        for k in range(4)
    ]

    # The legend entries are taken from the first panel only
    if not lines:
        lines.extend([line1, line2, line3, line4, line5])
        labels_legend.extend(['Weighted Avg. F1-Score', labels[0], labels[1], labels[2], labels[3]])

    ax.set_title(f'F1-Score for "{model}"', fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # One dashed separator per boundary index, drawn a single time; the best window is highlighted
    for _, index in zip(specific_dates, specific_indices):
        separator_color = '#fd7b6e' if index in [133, 157] else '#1a2e49'
        ax.axvline(x=index, color=separator_color, linestyle='--', linewidth=2)

# Set x-ticks for the last subplot (the x-axis is shared, so only the bottom panel is labelled)
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Set common labels
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Add a single legend for all subplots
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=5, fontsize=14)

# Adjust layout, leaving room for the legend at the top and the 'Date' label at the bottom
plt.tight_layout(rect=[0.03, 0.03, 1, 0.95])
plt.show()

#### Performance (Time)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.gridspec import GridSpec

# Label runs without balancing as 'None' so they show up as their own bar.
# Assigning the filled column back (instead of calling fillna(inplace=True) on the column
# selection) also takes effect when pandas copy-on-write is enabled.
# Note: this changes cnegativo_df for every cell executed afterwards; cells that select the
# unbalanced runs with pd.isna(cnegativo_df['Balance']) will find none once this has run.
cnegativo_df['Balance'] = cnegativo_df['Balance'].fillna('None')

# Set Seaborn style
sns.set(style="whitegrid")

# Common y-axis upper limit: mean + 3 standard deviations of the run time
y_max = cnegativo_df['Time (s)'].mean() + 3 * cnegativo_df['Time (s)'].std()

# 2x2 grid: models and balancing techniques on top, embeddings across the whole bottom row
fig = plt.figure(figsize=(24, 16))
gs = GridSpec(2, 2, figure=fig, width_ratios=[1, 1], height_ratios=[1, 1])

# (grid cell, grouping column, x-axis label, title, extra x-tick options) for each bar chart
time_panels = [
    (gs[0, 0], 'Model', 'Model',
     'Average Performance Time for Different Models with 95% CI', {}),
    (gs[0, 1], 'Balance', 'Balance',
     'Average Performance Time for Different Balancing Techniques with 95% CI', {}),
    (gs[1, :], 'Embedding_Name', 'Embedding',
     'Average Performance Time for Different Embeddings with 95% CI', {'rotation': 45}),
]

time_axes = []
for grid_cell, group_column, x_label, panel_title, x_tick_extra in time_panels:
    panel_ax = fig.add_subplot(grid_cell)
    # NOTE(review): seaborn 0.13 deprecates `errwidth` (in favour of `err_kws`) and `palette`
    # without `hue`; left unchanged here to stay compatible with the installed version - confirm.
    sns.barplot(ax=panel_ax, x=group_column, y='Time (s)', data=cnegativo_df, palette='Blues',
                errorbar=('ci', 95), errwidth=1.5, capsize=0.2, alpha=0.7)
    panel_ax.set_title(panel_title, fontsize=20)
    panel_ax.set_xlabel(x_label, fontsize=16)
    panel_ax.set_ylabel('Average Time (s)', fontsize=16)
    panel_ax.tick_params(axis='x', labelsize=14, **x_tick_extra)
    panel_ax.tick_params(axis='y', labelsize=14)
    panel_ax.set_ylim(0, y_max)
    panel_ax.grid(axis='y', linestyle='--', alpha=0.7)
    time_axes.append(panel_ax)

# Keep the individual axes available under their original names
ax1, ax2, ax3 = time_axes

# Adjust layout to avoid overlap
plt.tight_layout()
plt.show()


In [None]:
# Metrics to correlate: weighted F1, the four per-class F1 scores, global accuracy/std and run time
metric_columns = (
    ['weighted avg_f1-score']
    + [f'{labels[k]}_f1-score' for k in range(4)]
    + ['Accuracy_Global', 'Std_Global', 'Time (s)']
)
correlation_matrix = cnegativo_df[metric_columns].corr()

# Set Seaborn style
sns.set(style="whitegrid")

# Annotated heatmap of the correlation matrix on a fresh figure
plt.figure(figsize=(12, 10))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='Blues',
    fmt='.2f',
    linewidths=0.5,
    linecolor='black',
)

# Title of the figure
plt.gca().set_title('Correlation Matrix of Performance Metrics', fontsize=20)

# Fit everything inside the canvas and render
plt.tight_layout()
plt.show()


#### Confusion Matrix - Best Model

In [None]:
from sklearn.metrics import confusion_matrix

# Class order used for the matrix rows/columns and for the tick labels
class_names = ['Desprestigiar Víctima', 'Desprestigiar Acto', 'Insultos', 'Desprestigiar Deportista Autora']

# Confusion-matrix data for model 'e05b4d04-f884-4688-aa9b-6f097725c4c9':
# number of true samples per class, and for each true class the number of predictions per class
true_counts = [14, 15, 12, 30]
predicted_counts = [
    [10, 1, 1, 2],
    [2, 7, 3, 3],
    [3, 2, 5, 2],
    [2, 4, 1, 23],
]

# Expand the counts into parallel label lists, one entry per sample
y_true = [name for name, total in zip(class_names, true_counts) for _ in range(total)]
y_pred = [
    name
    for row in predicted_counts
    for name, count in zip(class_names, row)
    for _ in range(count)
]

# Build the confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=class_names)

# Plot the confusion matrix
plt.figure(figsize=(32, 30))

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names,
            annot_kws={"size": 52}, cbar=False)

plt.xlabel('Predicted label', fontsize=30)
plt.ylabel('True label', fontsize=30)
plt.xticks(rotation=45, fontsize=46)
plt.yticks(rotation=0, fontsize=46)

plt.tight_layout()

# Save the figure as PDF and as a transparent PNG
plt.savefig("../../../IMAGES/Contenido Negativo/ML/Experiments in _ContenidoNegativo_ best_model.pdf", format='pdf')
plt.savefig("../../../IMAGES/Contenido Negativo/ML/Experiments in _ContenidoNegativo_ best_model.png", format='png', transparent=True)

plt.show()


### Insultos

In [None]:
# Keep only the "Insultos" experiments.
# The explicit copy gives insultos_df its own data, so the column assignments made on it in the
# following cells do not operate on a slice of df (which triggers SettingWithCopyWarning).
insultos_df = df[df["Type"] == "insultos"].copy()
len(insultos_df)

In [None]:
# Parse the frame's own 'Timestamp' column, order the runs chronologically and renumber the rows
# 0..n-1. The conversion reads insultos_df rather than df: once the index has been reset, aligning
# against df['Timestamp'] by index would pick up timestamps of unrelated rows on a re-run.
# Building a new frame (instead of in-place calls) keeps the cell safe to execute more than once.
insultos_df = (
    insultos_df
    .assign(Timestamp=pd.to_datetime(insultos_df['Timestamp']))
    .sort_values('Timestamp')
    .reset_index(drop=True)
)

In [None]:
# Sequential run number (0, 1, 2, ...) used as the regressor
insultos_df['Time'] = np.arange(len(insultos_df))

# Least-squares line of the weighted F1-score against the run number
slope, intercept, r_value, p_value, std_err = linregress(
    x=insultos_df['Time'],
    y=insultos_df['weighted avg_f1-score'],
)

# Fitted value of that line for every run
insultos_df['Trend'] = insultos_df['Time'] * slope + intercept

In [None]:
# Timestamps of the runs used as vertical-line positions (window boundaries) in the plots
boundary_timestamps = [
    '2023-12-13 16:16:37', '2024-03-09 16:39:43', '2024-03-10 11:23:40', '2024-03-12 17:23:08',
    '2024-04-07 20:15:24', '2024-04-29 20:58:01', '2024-04-30 23:00:19', '2024-05-06 09:34:07',
    '2024-05-06 21:51:38', '2024-05-07 21:51:00', '2024-07-04 21:32:50', '2024-07-09 21:33:27',
]
specific_dates = pd.to_datetime(boundary_timestamps)
print(f"{specific_dates=}")

# Row labels of the runs whose timestamp is one of the boundary dates
is_boundary_run = insultos_df['Timestamp'].isin(specific_dates)
specific_indices = insultos_df.loc[is_boundary_run].index.tolist()
print(f"{specific_indices=}")

In [175]:
# Class name -> plot colour for the "Insultos" task; labels and colors are parallel lists
class_palette = {
    'Sexistas/misóginos': '#FFAF45',
    'Genéricos': '#FB6D48',
    'Deseo de Dañar': '#D74B76',
}
labels = list(class_palette.keys())
colors = list(class_palette.values())

#### Average Metrics in Windows

In [None]:
# Metrics averaged inside every window
columns_to_mean = [
    'Accuracy_Global', 'Std_Global', 'Time (s)',
    'weighted avg_precision', 'weighted avg_recall', 'weighted avg_f1-score',
    'Sexistas/misóginos_f1-score', 'Genéricos_f1-score', 'Deseo de Dañar_f1-score',
]

# Walk over consecutive pairs of boundary indices and report the mean of each metric
for i, (start_idx, end_idx) in enumerate(zip(specific_indices, specific_indices[1:])):
    # NOTE(review): .loc slicing is label-based and includes end_idx, so the boundary run is
    # counted in two consecutive windows - confirm this is intended.
    means = insultos_df.loc[start_idx:end_idx, columns_to_mean].mean().round(3)
    print(f'W{i + 1} Average metrics:')
    print(means)
    print()

####  F1-Score

In [None]:
plt.figure(figsize=(24, 12))
ax = plt.gca()

# Tick the x-axis at the boundary runs and label each tick with the run's date
tick_labels = [date.strftime('%d-%m-%Y') for date in specific_dates]
plt.xticks(ticks=specific_indices, labels=tick_labels, rotation=90, fontsize=18)

# Dashed separator at every boundary; the two limits of the best window use the highlight colour
for _, index in zip(specific_dates, specific_indices):
    separator_color = '#fd7b6e' if index in [688, 712] else '#1a2e49'
    ax.axvline(x=index, color=separator_color, linestyle='--', linewidth=2)

# Weighted-average F1-score of every run
ax.plot(insultos_df.index, insultos_df['weighted avg_f1-score'],
        linewidth=4, linestyle='-', marker='o', markersize=10, color='black', alpha=0.7,
        label='Weighted Avg. F1-Score')

# Least-squares trend over all runs, with R² shown in the legend
slope, intercept, r_value, p_value, std_err = linregress(insultos_df.index, insultos_df['weighted avg_f1-score'])
trend = intercept + slope * insultos_df.index
ax.plot(insultos_df.index, trend,
        linewidth=2, color='darkred', linestyle='-', alpha=0.5,
        label=f'Trend Line (R² = {r_value**2:.2f})')

# Legend centred at the top of the axes
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=2, fontsize=22)

# Title, y-axis label, range and tick size (no x-axis label on this figure)
ax.set_title('Weighted Avg. F1-Score for "Insultos"', fontsize=42)
ax.set_ylabel('F1-Score', fontsize=30)
ax.set_ylim(-0.01, 0.85)
ax.tick_params(axis='y', labelsize=22)

# Horizontal grid lines only
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Hide the frame around the plot
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

plt.tight_layout()

# Save the plot as a PDF file
plt.savefig("../../../IMAGES/Insultos/ML/Experiments in _Insultos_ over time.pdf", format='pdf')

plt.show()


In [None]:
plt.figure(figsize=(24, 14))
ax = plt.gca()

# Tick the x-axis at the boundary runs and label each tick with the run's date
tick_labels = [date.strftime('%d-%m-%Y') for date in specific_dates]
plt.xticks(ticks=specific_indices, labels=tick_labels, rotation=90, fontsize=18)

# One curve per class (the three "Insultos" classes), each in its own colour
for k in range(3):
    ax.plot(insultos_df.index, insultos_df[f'{labels[k]}_f1-score'],
            linewidth=4, linestyle='-', marker='o', markersize=10, color=colors[k], alpha=0.5,
            label=labels[k])

ax.legend(loc='lower right', fontsize=18)

# Title and axis labels
# NOTE(review): the curves are per-class F1-scores, yet the y-axis says 'Weighted Avg. F1-Score' -
# confirm whether the label should read 'F1-Score'.
ax.set_title('F1-Score for "Insultos" over time per class', fontsize=30)
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel('Weighted Avg. F1-Score', fontsize=18)

# Dashed separator at every boundary index (no best-window highlight on this figure)
for _, index in zip(specific_dates, specific_indices):
    ax.axvline(x=index, color='#1a2e49', linestyle='--', linewidth=2)

# Full grid for readability
ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# One tall panel per class, stacked on a shared x-axis
fig, axes = plt.subplots(len(labels), 1, figsize=(24, len(labels) * 10), sharex=True)

# Trend of the weighted-average F1 over all runs. It is the same on every panel,
# so it is fitted once here instead of once per class.
slope, intercept, r_value, p_value, std_err = linregress(insultos_df.index, insultos_df['weighted avg_f1-score'])
overall_trend = intercept + slope * insultos_df.index

# Plot each class in a separate subplot
for i, label in enumerate(labels):
    ax = axes[i]

    # F1-score of this class for every run
    class_scores = insultos_df[f'{label}_f1-score']
    ax.plot(insultos_df.index, class_scores,
            linewidth=4, linestyle='-', marker='o', markersize=10, color=colors[i], alpha=0.5)

    # Overall weighted-average trend as a faint reference line
    ax.plot(insultos_df.index, overall_trend, linewidth=2, color='red', linestyle='-', alpha=0.3)

    # Per-window trend of this class between consecutive boundary indices
    for start_idx, end_idx in zip(specific_indices, specific_indices[1:]):
        # Positional slice: start_idx included, end_idx excluded
        window_indices = insultos_df.index[start_idx:end_idx]
        if len(window_indices) < 2:
            # A line cannot be fitted through fewer than two points; linregress would raise
            continue
        window_scores = class_scores.iloc[start_idx:end_idx]
        slope, intercept, r_value, p_value, std_err = linregress(window_indices, window_scores)
        trend = intercept + slope * window_indices
        ax.plot(window_indices, trend, linewidth=2, color=colors[i], linestyle='-', alpha=0.5)

    ax.set_title(label, fontsize=42)
    ax.set_ylabel('F1-Score', fontsize=30)
    ax.set_ylim(-0.05, 0.87)
    ax.tick_params(axis='y', labelsize=22)

    # Add a grid to improve readability
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Remove spines for a clean look
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)

    # Dashed separator at every boundary index
    for _, index in zip(specific_dates, specific_indices):
        ax.axvline(x=index, color='#1a2e49', linestyle='--', linewidth=2)

# Set x-ticks for the last subplot (the x-axis is shared, so only the bottom panel is labelled)
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=22)

# Adjust layout
plt.tight_layout(rect=[0.03, 0.03, 1, 0.97])

# Save the plot as a PDF file
plt.savefig("../../../IMAGES/Insultos/ML/Experiments in _Insultos_ over time_classes_trendline.pdf", format='pdf')

# Save the plot as a PNG file
plt.savefig("../../../IMAGES/Insultos/ML/Experiments in _Insultos_ over time_classes_trendline.png", format='png', transparent=True)

plt.show()

#### Balance

In [None]:
# Balancing strategies to compare; runs with no balancing (NaN in 'Balance') get the first panel.
# NOTE(review): the loading cells at the top of the notebook drop rows with
# Balance == 'downsampling', so that panel is expected to be empty - confirm it should be listed.
balances = ['downsampling', 'upsampling', 'smote', 'adasyn']

# One panel for "None" plus one per strategy, stacked vertically on a shared x-axis
fig, axes = plt.subplots(len(balances)+1, 1, figsize=(24, len(balances) * 4), sharex=True)

# (title, row mask) of every panel, in drawing order
panels = [('F1-Score for None', pd.isna(insultos_df['Balance']))]
panels += [(f'F1-Score for "{balance}"', insultos_df['Balance'] == balance) for balance in balances]

# Handles and labels for the single figure-level legend
lines = []
labels_legend = []

for ax, (panel_title, row_mask) in zip(axes, panels):
    # Filter once per panel instead of once per curve
    subset = insultos_df[row_mask]

    # Weighted-average curve first, then the three per-class curves
    line1, = ax.plot(subset.index, subset['weighted avg_f1-score'],
                     linewidth=3, linestyle='-', marker='o', markersize=7, color='black', alpha=0.5,
                     label='Weighted Avg. F1-Score')
    line2, line3, line4 = [
        ax.plot(subset.index, subset[f'{labels[k]}_f1-score'],
                linewidth=2, linestyle='-', marker='o', markersize=9, color=colors[k], alpha=0.5,
                label=labels[k])[0]
        for k in range(3)
    ]

    # The legend entries are taken from the first panel only. Exactly four handles for four
    # labels: this task has three classes, so there is no fifth line to register.
    if not lines:
        lines.extend([line1, line2, line3, line4])
        labels_legend.extend(['Weighted Avg. F1-Score', labels[0], labels[1], labels[2]])

    ax.set_title(panel_title, fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # One dashed separator per boundary index, drawn a single time; the best window is highlighted
    for _, index in zip(specific_dates, specific_indices):
        separator_color = '#fd7b6e' if index in [688, 712] else '#1a2e49'
        ax.axvline(x=index, color=separator_color, linestyle='--', linewidth=2)

# Set x-ticks for the last subplot (the x-axis is shared, so only the bottom panel is labelled)
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Set common labels
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Add a single legend for all subplots
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=5, fontsize=14)

# Adjust layout, leaving room for the legend at the top and the 'Date' label at the bottom
plt.tight_layout(rect=[0.03, 0.03, 1, 0.95])
plt.show()


#### Embedding

In [None]:
# Embeddings to compare; each one gets its own subplot
embeddings = ['fasttext', 'word2vec', 'bow', 'tfidf', 'roberta', 'bert-multi', 'beto-cased', 'beto-uncased', 'xlm-roberta-base', 'text-embedding-3-large']

# Marker indices that are highlighted as the best window
best_window_indices = [688, 712]

# Create one row per embedding, sharing the x-axis
fig, axes = plt.subplots(len(embeddings), 1, figsize=(24, len(embeddings) * 4), sharex=True)

# Handles and labels for the single figure-level legend
lines = []
labels_legend = []

# Plot each embedding in a separate subplot
for i, embedding in enumerate(embeddings):
    ax = axes[i]

    # Filter the experiments of this embedding once instead of once per plotted series
    subset = insultos_df[insultos_df['Embedding_Name'] == embedding]

    # Plot Weighted F1-Scores
    line1, = ax.plot(subset.index,
                     subset['weighted avg_f1-score'],
                     linewidth=3, linestyle='-', marker='o', markersize=7, color='black', alpha=0.5, label='Weighted Avg. F1-Score')

    # Plot the per-class F1-Scores (labels[0], labels[1], labels[2])
    class_lines = []
    for k in range(3):
        class_line, = ax.plot(subset.index,
                              subset[f'{labels[k]}_f1-score'],
                              linewidth=2, linestyle='-', marker='o', markersize=9, color=colors[k], alpha=0.5, label=labels[k])
        class_lines.append(class_line)

    # Collect legend entries only once. The original also appended an undefined
    # `line5`, which raised NameError on a fresh kernel and had no matching label.
    if i == 0:
        lines.extend([line1] + class_lines)
        labels_legend.extend(['Weighted Avg. F1-Score'] + [labels[k] for k in range(3)])

    ax.set_title(f'F1-Score for "{embedding}"', fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # Draw each vertical marker exactly once: best-window markers in red, the rest in dark blue
    for date, index in zip(specific_dates, specific_indices):
        marker_color = '#fd7b6e' if index in best_window_indices else '#1a2e49'
        ax.axvline(x=index, color=marker_color, linestyle='--', linewidth=2)

# Set x-ticks for the last subplot
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Set common labels
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Add a single legend for all subplots
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=5, fontsize=14)

# Adjust layout
plt.tight_layout(rect=[0.03, 0.03, 1, 0.97])
plt.show()


#### Model

In [None]:
# Models to compare; each one gets its own subplot
models = ['random_forest', 'logistic_regression', 'xgboost', 'mlp', 'naive_bayes']

# Marker indices that are highlighted as the best window
best_window_indices = [688, 712]

# Create one row per model, sharing the x-axis
fig, axes = plt.subplots(len(models), 1, figsize=(24, len(models) * 4), sharex=True)

# Handles and labels for the single figure-level legend
lines = []
labels_legend = []

# Plot each model in a separate subplot
for i, model in enumerate(models):
    ax = axes[i]

    # Filter the experiments of this model once instead of once per plotted series
    subset = insultos_df[insultos_df['Model'] == model]

    # Plot Weighted F1-Scores
    line1, = ax.plot(subset.index,
                     subset['weighted avg_f1-score'],
                     linewidth=3, linestyle='-', marker='o', markersize=7, color='black', alpha=0.5, label='Weighted Avg. F1-Score')

    # Plot the per-class F1-Scores (labels[0], labels[1], labels[2])
    class_lines = []
    for k in range(3):
        class_line, = ax.plot(subset.index,
                              subset[f'{labels[k]}_f1-score'],
                              linewidth=2, linestyle='-', marker='o', markersize=9, color=colors[k], alpha=0.5, label=labels[k])
        class_lines.append(class_line)

    # Collect legend entries only once. The original also appended an undefined
    # `line5`, which raised NameError on a fresh kernel and had no matching label.
    if i == 0:
        lines.extend([line1] + class_lines)
        labels_legend.extend(['Weighted Avg. F1-Score'] + [labels[k] for k in range(3)])

    ax.set_title(f'F1-Score for "{model}"', fontsize=20)
    ax.set_ylabel('Weighted Avg. F1-Score', fontsize=14)
    ax.grid(True)

    # Draw each vertical marker exactly once: best-window markers in red, the rest in dark blue
    for date, index in zip(specific_dates, specific_indices):
        marker_color = '#fd7b6e' if index in best_window_indices else '#1a2e49'
        ax.axvline(x=index, color=marker_color, linestyle='--', linewidth=2)

# Set x-ticks for the last subplot
axes[-1].set_xticks(ticks=specific_indices)
axes[-1].set_xticklabels([date.strftime('%d-%m-%Y') for date in specific_dates], rotation=90, fontsize=18)

# Set common labels
fig.text(0.5, 0.04, 'Date', ha='center', fontsize=18)

# Add a single legend for all subplots
fig.legend(lines, labels_legend, loc='upper center', bbox_to_anchor=(0.5, 0.99), ncol=5, fontsize=14)

# Adjust layout
plt.tight_layout(rect=[0.03, 0.03, 1, 0.95])
plt.show()

#### Performance (Time)

In [None]:
def _plot_time_bars(ax, data, column, title, xlabel, y_max, rotation=None):
    """Draw the mean 'Time (s)' per category of `column` with a 95% CI on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame holding `column` and 'Time (s)'.
    column : name of the categorical column used for the x-axis.
    title : subplot title.
    xlabel : x-axis label.
    y_max : upper limit of the y-axis (the lower limit is 0).
    rotation : optional rotation, in degrees, of the x tick labels.
    """
    sns.barplot(ax=ax, x=column, y='Time (s)', data=data, palette='flare', errorbar=('ci', 95), errwidth=1.5, capsize=0.2, alpha=0.7)
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(xlabel, fontsize=16)
    ax.set_ylabel('Average Time (s)', fontsize=16)
    # Only pass `rotation` when requested so the other panels keep matplotlib's default
    x_tick_kwargs = {'labelsize': 14}
    if rotation is not None:
        x_tick_kwargs['rotation'] = rotation
    ax.tick_params(axis='x', **x_tick_kwargs)
    ax.tick_params(axis='y', labelsize=14)
    ax.set_ylim(0, y_max)
    ax.grid(axis='y', linestyle='--', alpha=0.7)


# Replace NaN values in the 'Balance' column with 'None'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form triggers a pandas warning and
# is not guaranteed to update the DataFrame.
insultos_df['Balance'] = insultos_df['Balance'].fillna('None')

# Set Seaborn style
sns.set(style="whitegrid")

# Upper limit for the y-axis: mean plus three standard deviations of the run time
y_max = insultos_df['Time (s)'].mean() + 3 * insultos_df['Time (s)'].std()

# Create a figure with a 2x2 grid. The grid is built from the figure itself so
# the cell does not depend on `GridSpec`, which the notebook's import cell
# does not import explicitly.
fig = plt.figure(figsize=(24, 16))
gs = fig.add_gridspec(2, 2, width_ratios=[1, 1], height_ratios=[1, 1])

# Plot for MODELS (top-left)
ax1 = fig.add_subplot(gs[0, 0])
_plot_time_bars(ax1, insultos_df, 'Model',
                'Average Performance Time for Different Models with 95% CI',
                'Model', y_max)

# Plot for BALANCE (top-right)
ax2 = fig.add_subplot(gs[0, 1])
_plot_time_bars(ax2, insultos_df, 'Balance',
                'Average Performance Time for Different Balancing Techniques with 95% CI',
                'Balance', y_max)

# Plot for EMBEDDINGS (full bottom row, rotated tick labels)
ax3 = fig.add_subplot(gs[1, :])
_plot_time_bars(ax3, insultos_df, 'Embedding_Name',
                'Average Performance Time for Different Embeddings with 95% CI',
                'Embedding', y_max, rotation=45)

# Adjust layout to avoid overlap
plt.tight_layout()
plt.show()


In [None]:
# Use Seaborn's whitegrid style for the figure
sns.set(style="whitegrid")

# Metrics whose pairwise correlations are displayed: the weighted F1, the
# three per-class F1 scores, global accuracy/std and the run time
metric_columns = (
    ['weighted avg_f1-score']
    + [f'{labels[k]}_f1-score' for k in range(3)]
    + ['Accuracy_Global', 'Std_Global', 'Time (s)']
)
correlation_matrix = insultos_df[metric_columns].corr()

# Render the correlation matrix as an annotated heatmap
plt.figure(figsize=(12, 10))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='flare',
    fmt='.2f',
    linewidths=0.5,
    linecolor='black',
)

# Title of the figure
plt.title('Correlation Matrix of Performance Metrics', fontsize=20)

# Fit the layout and display
plt.tight_layout()
plt.show()


#### Confusion Matrix - Best Model

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion-matrix data for model 'e7acae62-a0ce-4b64-b110-ff294df62157'.
# Each entry is (true label, predicted counts in the order given by `predicted_order`).
predicted_order = ['Sexistas/misóginos', 'Genéricos', 'Deseo de Dañar']
counts_per_true_label = [
    ('Sexistas/misóginos', [8, 2, 0]),
    ('Genéricos', [0, 16, 4]),
    ('Deseo de Dañar', [0, 5, 7]),
]

# Expand the counts into parallel true/predicted label lists
y_true = []
y_pred = []
for true_label, row_counts in counts_per_true_label:
    for predicted_label, n_samples in zip(predicted_order, row_counts):
        y_true.extend([true_label] * n_samples)
        y_pred.extend([predicted_label] * n_samples)

# Build the confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=labels)

# Plot the confusion matrix
plt.figure(figsize=(24, 22))

sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='flare',
    xticklabels=labels,
    yticklabels=labels,
    annot_kws={"size": 52},
    cbar=False,
)

plt.xlabel('Predicted label', fontsize=30)
plt.ylabel('True label', fontsize=30)
plt.xticks(rotation=45, fontsize=46)
plt.yticks(rotation=0, fontsize=46)
# plt.title('Confusion Matrix for Best Model of ML Experiments')

plt.tight_layout()

# Export the figure as PDF
plt.savefig("../../../IMAGES/Insultos/ML/Experiments in _Insultos_ best_model.pdf", format='pdf')

# Export the figure as PNG with a transparent background
plt.savefig("../../../IMAGES/Insultos/ML/Experiments in _Insultos_ best_model.png", format='png', transparent=True)

plt.show()



### Performance

In [None]:
# Mapping from the raw 'Type' values to display names
task_name_mapping = {
    "analisis_general": "Análisis General", 
    "contenido_negativo": "Contenido Negativo", 
    "insultos": "Insultos"
}

# Group `df` by 'Type' and compute the total, the mean and the number of
# non-null 'Time (s)' values in a single aggregation pass (instead of three
# separate groupby calls merged back together)
df_grouped = (
    df.groupby('Type')['Time (s)']
    .agg(['sum', 'mean', 'count'])
    .reset_index()
)

# Rename the columns for clarity
df_grouped.columns = ['Task', 'Total Time (s)', 'Mean Time (s)', 'Number of Experiments']

# Convert seconds to hours and round to 3 decimal places
df_grouped['Total Time (h)'] = (df_grouped['Total Time (s)'] / 3600).round(3)
df_grouped['Mean Time (h)'] = (df_grouped['Mean Time (s)'] / 3600).round(3)

# Apply the mapping to change the task names
df_grouped['Task'] = df_grouped['Task'].replace(task_name_mapping)

# Keep only the columns expressed in hours plus the experiment count
df_grouped = df_grouped[['Task', 'Total Time (h)', 'Mean Time (h)', 'Number of Experiments']]

df_grouped


In [None]:
# Use Seaborn's whitegrid style
sns.set(style="whitegrid")

# Fixed upper y-axis limit; per the original note it is the maximum across
# the ML, DL and GenAI experiments
y_max = 750

# Single-axes figure
fig, ax = plt.subplots(figsize=(24, 8))

# Bar plot of the number of experiments per task, using the 'bone' palette
sns.barplot(
    data=df_grouped,
    x='Task',
    y='Number of Experiments',
    palette='bone',
    alpha=0.9,
    ax=ax,
)

# Title, axis labels and tick sizes
ax.set_title('Total Number of Experiments per Classification Task in ML Experiments', fontsize=30)
ax.set_xlabel('', fontsize=18)
ax.set_ylabel('Num. of Experiments', fontsize=18)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=18)
ax.set_ylim(0, y_max)

# Grid for better readability
ax.grid(True)

# Fit the layout and display
plt.tight_layout()
plt.show()

In [None]:
# Use Seaborn's whitegrid style
sns.set(style="whitegrid")

# Fixed upper y-axis limit; per the original note it is the maximum across
# the ML, DL and GenAI experiments
y_max = 75

# Bar color assigned to each known task
task_colors = {
    'Análisis General': '#8E809E',
    'Contenido Negativo': '#6AA6D4',
    'Insultos': '#E38A83'
}

# One color per row of df_grouped; tasks without an entry fall back to light grey
bar_colors = df_grouped['Task'].map(lambda task: task_colors.get(task, '#CCCCCC')).tolist()

# Single-axes figure
fig, ax = plt.subplots(figsize=(24, 8))

# Bar plot of the total time per task using the per-task colors
sns.barplot(
    data=df_grouped,
    x='Task',
    y='Total Time (h)',
    palette=bar_colors,
    alpha=0.9,
    ax=ax,
)

# Axis label, tick sizes and y-range
# ax.set_title('Total Performance Time per Classification Task in ML Experiments', fontsize=40)
# ax.set_xlabel('', fontsize=20)
ax.set_ylabel('Time (h)', fontsize=30)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=34)
ax.set_ylim(0, y_max)

# Write each bar's height just above it, in the bar's own color
for patch, text_color in zip(ax.patches, bar_colors):
    height = patch.get_height()
    x_center = patch.get_x() + patch.get_width() / 2
    ax.text(
        x_center, height + 1, f'{height:.3f}',
        ha='center', va='bottom', fontsize=34, fontweight='bold',
        color=text_color
    )

# Dashed horizontal grid lines to improve readability
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Hide all four spines for a clean look
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(False)

# Adjust layout to avoid clipping of labels and titles
plt.tight_layout()

# Export the figure as PDF
plt.savefig("../../../IMAGES/ML_total_performance_time.pdf", format='pdf')

# Export the figure as PNG with a transparent background
plt.savefig("../../../IMAGES/ML_total_performance_time.png", format='png', transparent=True)

plt.show()

In [None]:
# Use Seaborn's whitegrid style
sns.set(style="whitegrid")

# Fixed upper y-axis limit; per the original note it is the maximum across
# the ML, DL and GenAI experiments
y_max = 6

# Single-axes figure
fig, ax = plt.subplots(figsize=(24, 8))

# Bar plot of the mean time per task, using the 'bone' palette
sns.barplot(
    data=df_grouped,
    x='Task',
    y='Mean Time (h)',
    palette='bone',
    alpha=0.9,
    ax=ax,
)

# Title, axis labels and tick sizes
ax.set_title('Mean Performance Time per Classification Task in ML Experiments', fontsize=30)
ax.set_xlabel('', fontsize=18)
ax.set_ylabel('Time (h)', fontsize=18)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=18)
ax.set_ylim(0, y_max)

# Grid for better readability
ax.grid(True)

# Fit the layout and display
plt.tight_layout()
plt.show()