# Common sense baselines Quiz!

Idea and functionality taken from the [einops](https://github.com/arogozhnikov/einops/blob/main/docs/2-einops-for-deep-learning.ipynb) tutorials.

In [None]:
from IPython.display import display_html

# Inline CSS prepended to every rendered answer: the answer text is transparent
# on a padded light-gray box and only becomes visible (fuchsia) on hover.
_style_inline = """<style>
.conv-answer {
    color: transparent;
    padding: 15px;
    background-color: lightgray;
}
.conv-answer:hover { color: fuchsia; }
</style>
"""


def guess(x):
    """Render the hover-to-reveal baseline answer for a generated problem.

    Args:
        x: A 3-tuple ``(is_classification, is_balanced, data)``.
            * Regression (``is_classification`` falsy): the other two items
              are ignored.
            * Balanced classification: ``data`` is the number of classes.
            * Imbalanced classification: ``data`` is a 3-tuple
              ``(samples_per_class, most_frequent_class, n_most_frequent)``.

    Returns:
        None. The answer is emitted as raw HTML through ``display_html``.
    """
    is_classification, is_balanced, data = x

    # Every answer shares the same opening markup; only the balanced case uses
    # a shorter closing (no line breaks before the hover hint).
    opening = "<pre> Answer:<br><br><span class='conv-answer'>"
    closing = "</span> <br><br> (hover to see)</pre>"

    if not is_classification:
        hidden = "The baseline depends on the problem, but often we use the *mean* of the data."
    elif is_balanced:
        n_classes = data
        chance = 1 / n_classes
        hidden = f"A random guess: 1/{n_classes} (= {chance:.3f}, or a {chance * 100:.0f}% accuracy baseline)"
        closing = "</span> (hover to see)</pre>"
    else:
        counts, top_class, top_count = data
        total = sum(counts)
        n_classes = len(counts)
        majority = top_count / total
        chance = 1 / n_classes
        hidden = (
            f"Predicting class {top_class} (with the most samples) yields {top_count}/{total} "
            f"(= {majority:.3f}, or a {majority * 100:.0f}% accuracy baseline).\n"
            f"This is better than a random guess: 1/{n_classes} "
            f"(= {chance:.3f}, or a {chance * 100:.0f}% accuracy baseline)"
        )

    display_html(_style_inline + opening + hidden + closing, raw=True)

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def random_dataset():
    """Print a randomly generated learning problem and return its description.

    A coin flip picks regression or classification. For classification, the
    number of classes is drawn from 3..19 and a second coin flip decides
    whether the dataset is balanced. Imbalanced datasets are shown as a bar
    chart of per-class sample counts (each count drawn from 100..999).

    Returns:
        A 3-tuple ``(is_classification, is_balanced, data)``:
            * regression: ``(False, None, None)``
            * balanced classification: ``(True, True, n_classes)``
            * imbalanced classification:
              ``(True, False, (counts, argmax, counts[argmax]))`` where
              ``counts`` is the list of per-class sample counts and ``argmax``
              is the index of the largest one.
    """
    is_classification = bool(random.randint(0, 1))
    if not is_classification:
        print("We are faced with a regression problem.")
        print("What is the common sense baseline?")
        print()
        return False, None, None

    print("We are faced with a classification problem.")
    n_classes = random.choice(range(3, 20))
    balanced = bool(random.randint(0, 1))

    if balanced:
        per_class = np.random.randint(100, 1000)
        print(f"The dataset is balanced, with {n_classes} classes, and {per_class} samples per class.")
        print("What is the common sense baseline?")
        print()
        return True, True, n_classes

    counts = [np.random.randint(100, 1000) for _ in range(n_classes)]
    largest = np.argmax(counts)
    print("The dataset is imbalanced, with the following sample distribution:")

    class_ids = range(n_classes)
    bars = plt.bar(class_ids, counts)

    # Write each bar's sample count just above it, horizontally centred.
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2.0
        plt.text(x_center, height, f'{height}', ha='center', va='bottom')

    # One tick per class, labelled with the class index.
    plt.xticks(ticks=class_ids, labels=list(map(str, class_ids)))

    plt.xlabel("Classes")
    plt.ylabel("Number of Samples")
    plt.title(f"Dataset (total samples: {sum(counts)})")
    plt.show()

    print("What is the common sense baseline?")
    print()
    return (True, False, (counts, largest, counts[largest]))

## The Quiz

Rerun the cell below for a new set-up, then hover over the hidden answer to reveal it.

Remember, it's not the actual calculation that's important (use Python for that), but an understanding of the choice of the baseline!

In [None]:
# Generate and print a random problem, then render its hover-to-reveal answer.
guess(random_dataset())