## Generating Baseline Predictions
This notebook generates baseline predictions for the test data using the HuggingFace model `Birds-Classifier-EfficientNetB2`. It uses the `fuzzywuzzy` module to compute similarity scores between each predicted species name and the species names in our dataset, so that every prediction can be linked to the matching dataset label. The output is stored in `baseline.csv` in the `/predictions` directory.

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline
from fuzzywuzzy import fuzz
import pickle

In [None]:
# The .npy file holds a single pickled Python object; .item() unwraps it from
# the array returned by np.load. It is used further down as a mapping from
# species name to numeric label.
label_translations = np.load(
    "../src/class_names.npy",
    allow_pickle=True,
).item()

In [None]:
# Load the pickled test images. The `with` block already closes the file when
# it exits, so no explicit close() call is needed afterwards.
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only
# point this at a test_images.pkl that was produced by this project.
with open('../pickles/test_images.pkl', 'rb') as f:
    images = pickle.load(f)

In [None]:
# Image-classification pipeline backed by the pretrained bird classifier from
# the HuggingFace hub; it is called once per test image below.
pipe = pipeline(
    task="image-classification",
    model="dennisjooo/Birds-Classifier-EfficientNetB2",
)

In [None]:
# Baseline inference: classify every test image with `pipe`, then link the
# predicted species name to one of the dataset's own label names by fuzzy
# string similarity and record that label's numeric id in `predicted`.
predicted = []
random_guess_counter = 0
print('Predicting classes...')

# Minimum similarity (exclusive) a dataset label needs to be considered.
# Threshold currently set to zero for maximised accuracy (despite accuracy
# being low overall).
score_threshold = 0

# Normalise each dataset label once up front rather than once per image.
normalised_keys = {key: key.lower().title() for key in label_translations}
label_names = list(label_translations)

total = len(images)
last_percent = None
for i, img in enumerate(images):
    # Report progress only when the whole-number percentage changes, and count
    # the current image as done so the final line reads 100%.
    percent = round(((i + 1) / total) * 100)
    if percent != last_percent:
        # Built outside the f-string: reusing the same quote type inside a
        # replacement field is a syntax error before Python 3.12.
        suffix = '!' if i + 1 == total else ''
        print(f'{percent}% complete{suffix}', end="\r")
        last_percent = percent

    # Only the top-ranked prediction of the pipeline is used.
    result = pipe(img)[0]
    reformatted = result['label'].lower().title().replace(' ', '_')

    # Score every dataset label against the predicted species name.
    feasible_labels = {}
    for key, normalised in normalised_keys.items():
        score = fuzz.partial_ratio(normalised, reformatted)
        if score > score_threshold:
            feasible_labels[key] = score

    if feasible_labels:
        # Ties resolve to the first label in the mapping's iteration order.
        most_likely_label = max(feasible_labels, key=feasible_labels.get)
    else:
        # The species identified by the baseline matched none of the labels,
        # so predict uniformly at random over all of them. With the threshold
        # at zero this branch is rarely, if ever, taken.
        most_likely_label = label_names[np.random.randint(len(label_names))]
        random_guess_counter += 1

    predicted.append(label_translations[most_likely_label])
print(f'Number of random guesses: {random_guess_counter}')

In [None]:
# Output table: one row per prediction, with 1-based ids assigned in the same
# order as the entries of `predicted`.
output_df = pd.DataFrame(
    {
        'id': list(range(1, len(predicted) + 1)),
        'label': predicted,
    }
)

In [None]:
# Write the predictions to disk, leaving out the DataFrame index column.
output_df.to_csv(
    '../predictions/baseline.csv',
    index=False,
)