<a href="https://colab.research.google.com/github/kobrue02/evaluating-llm-generated-nlu-data/blob/main/bin/notebooks/train_eval_nlu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository and switch the working directory into it,
# presumably so the `bin.*` imports in the next cell resolve.
!git clone https://github.com/kobrue02/evaluating-llm-generated-nlu-data/
%cd evaluating-llm-generated-nlu-data

In [2]:
from bin.framework.nlu_model import IntentClassifier
from bin.utils.methods import *
from sklearn.neural_network import MLPClassifier

In [25]:
# Classifier wrapper used for every synthetic-data experiment below.
# random_state is fixed so the MLP's weight initialisation and batch sampling
# are repeatable across "Restart & Run All"; 42 matches the seed already used
# for sampling elsewhere in this notebook.
model = IntentClassifier(model=MLPClassifier(random_state=42))

# Names of the generated datasets to evaluate; each one is handed to load_df.
# (The names suggest one dataset per prompting strategy.)
datasets = [
    "zero_shot_simple_data",
    "one_shot_simple_data",
    "few_shot_simple_data",
    "chain_of_thought_simple_data",
    "persona_based_prompt_s1_data"
]

In [None]:
# Train and evaluate the classifier once per dataset, storing each evaluation
# report under the dataset's name.
reports = {}
for fname in datasets:
    # load_df is not defined in this notebook; presumably it comes from the
    # star import of bin.utils.methods.
    df = load_df(fname)
    train_df, test_df = model.split_dataset(df)
    model.fit(train_df)
    report = model.evaluate(test_df)
    reports[fname] = report
    # The same model object is reused for every dataset, so it is reset
    # between runs (presumably discarding the fitted state - confirm in
    # IntentClassifier.reset).
    model.reset()
# NOTE: df, train_df, test_df and report stay defined after the loop and hold
# the values from the LAST dataset in `datasets`.

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
def stacked_bar_chart(reports, output_dir='output'):
    """Save one stacked bar chart of per-intent metrics for each dataset.

    For every dataset a PNG named ``<dataset_name>.png`` is written to
    ``output_dir``. Each bar stacks the precision, recall and F1 score of one
    intent, and a horizontal line marks the dataset's weighted-average F1.

    Parameters
    ----------
    reports : dict
        Maps a dataset name to its report. A report maps each label to a dict
        with the keys ``'precision'``, ``'recall'`` and ``'f1-score'``.
        Summary entries (``'accuracy'``, ``'micro avg'``, ``'macro avg'``,
        ``'weighted avg'``) are not plotted as intents.
    output_dir : str, optional
        Directory the images are written to; created if it does not exist.
        Defaults to ``'output'``, the location used before this parameter
        existed.

    Returns
    -------
    None
    """
    # Local import: this function is the only place in the notebook needing os.
    import os

    summary_rows = {'accuracy', 'micro avg', 'macro avg', 'weighted avg'}
    metric_columns = ['precision', 'recall', 'f1-score']

    # One flat record per (dataset, intent) pair, skipping the summary rows.
    records = [
        {
            'dataset': dataset_name,
            'intent': intent,
            'precision': metrics['precision'],
            'recall': metrics['recall'],
            'f1-score': metrics['f1-score'],
        }
        for dataset_name, report in reports.items()
        for intent, metrics in report.items()
        if intent not in summary_rows
    ]
    # Explicit columns keep the frame well-formed even when there are no records.
    df = pd.DataFrame(records, columns=['dataset', 'intent'] + metric_columns)

    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    for dataset_name, report in reports.items():
        dataset_df = (
            df.loc[df['dataset'] == dataset_name]
            .set_index('intent')[metric_columns]
        )
        if dataset_df.empty:
            # Plotting an empty frame raises, so skip datasets without intents.
            print(f'No per-intent metrics for {dataset_name}; skipping plot.')
            continue

        # Draw on an explicitly created axes. Without ax=..., DataFrame.plot
        # opens its own figure, leaving the sized one empty and never closed.
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            dataset_df.plot(kind='bar', stacked=True, colormap="Set3", ax=ax)

            ax.set_title(f'Metrics per intent for {dataset_name}')
            ax.set_ylabel('Score')
            ax.legend(loc='upper left', bbox_to_anchor=(1, 1))

            # Prefer the support-weighted F1 the report already carries; fall
            # back to the plain mean over intents when it is not available.
            weighted_row = report.get('weighted avg')
            if isinstance(weighted_row, dict) and 'f1-score' in weighted_row:
                weighted_avg_f1 = weighted_row['f1-score']
            else:
                weighted_avg_f1 = dataset_df['f1-score'].mean()
            ax.axhline(y=weighted_avg_f1, color='b', linestyle='-')

            # Intent names are hidden on the x axis.
            ax.set_xticks([])

            fig.savefig(
                os.path.join(output_dir, f'{dataset_name}.png'),
                bbox_inches='tight',
            )
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

    print("Plots saved successfully!")

In [29]:
# Write one per-intent metrics chart (PNG) for every dataset in `reports`.
stacked_bar_chart(reports)

Plots saved successfully!


<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [9]:
def heatmap(reports):
    """Display an annotated heatmap for each report in ``reports``.

    ``reports`` maps a dataset name to a report dict. Each report is turned
    into a DataFrame and transposed so that the report's top-level keys become
    rows. One figure is shown per dataset.
    """
    for name, report_dict in reports.items():
        report_frame = pd.DataFrame(report_dict).T

        # Positional slice: every row except the last, first three columns.
        # NOTE(review): for sklearn-style report dicts the dropped row would be
        # 'weighted avg' and the kept columns precision/recall/f1-score -
        # confirm that is the intended selection.
        cells = report_frame.iloc[:-1, :3]

        plt.figure(figsize=(8, 6))
        sns.heatmap(cells, annot=True, cmap='Blues', fmt='.2f')
        plt.title(f'{name.title()} Classification Report Heatmap')
        plt.show()

In [None]:
# Show one classification-report heatmap per dataset in `reports`.
heatmap(reports)

In [None]:
from bin.utils.read_datasets import read_sipgate_dataset
from bin.utils.clean_sipgate_dataset import clean_sipgate_dataset

# Read and clean the sipgate data, then keep a fixed-seed sample of
# 25 rows for each intent.
golden_df = (
    clean_sipgate_dataset(read_sipgate_dataset())
    .groupby('intent')
    .sample(n=25, random_state=42)
)
golden_df.head()

In [None]:
# Train and evaluate a fresh classifier on the sipgate golden subset.
# random_state is fixed so the MLP's initialisation is repeatable.
model = IntentClassifier(model=MLPClassifier(random_state=42))
# Split golden_df, not `df`: `df` still holds the last synthetic dataset
# loaded by the per-dataset training loop earlier in the notebook.
train_df, test_df = model.split_dataset(golden_df)
model.fit(train_df)
report = model.classification_report(test_df)
print(report)

In [34]:
# Evaluate on the held-out split and chart the per-intent metrics under the
# "sipgate" label; stacked_bar_chart writes the image to output/sipgate.png.
report_dict = model.evaluate(test_df)
stacked_bar_chart({
    "sipgate": report_dict
})
# Displays any figure that is still open after stacked_bar_chart returns.
plt.show()

Plots saved successfully!


<Figure size 1200x600 with 0 Axes>