In [None]:
#pip install sdgclassification-benchmark

In [1]:
from openai import OpenAI
import numpy as np
from numpy.linalg import norm
import pickle
from tqdm import tqdm

# This is a very simple wrapper to store text embeddings
class VectorDB:
    """Minimal in-memory cache of text embeddings, optionally persisted with pickle.

    ``self.db`` maps each text to the embedding returned for it by the OpenAI
    ``text-embedding-3-small`` model (or to whatever a loaded pickle contained).
    """

    def __init__(self, key, file_name = False):
        """Create the OpenAI client and, if a file is named, load a saved cache.

        Args:
            key: value forwarded to ``OpenAI(api_key=...)``.
            file_name: base name (without the ``.p`` suffix) of a pickle written
                by ``save``. Any falsy value starts with an empty cache.
        """
        self.client = OpenAI(api_key=key)

        if file_name:
            # NOTE: unpickling can execute arbitrary code -- only load trusted files.
            # The context manager closes the handle even if loading fails.
            with open(file_name + '.p', 'rb') as cache_file:
                self.db = pickle.load(cache_file)
        else:
            self.db = {}

    def load_vectors(self, texts):
        """Request an embedding for every text that is not in the cache yet.

        Args:
            texts: iterable of strings; texts already present are skipped, so
                only missing entries trigger an API call.
        """
        for text in tqdm(texts):
            if text in self.db:
                continue
            response = self.client.embeddings.create(input=text, model="text-embedding-3-small")
            self.db[text] = response.data[0].embedding

    def get_sim(self, text1, text2):
        """Return the cosine similarity between the cached vectors of two texts.

        Both texts must already be present in the cache.
        """
        return self.cossim(self.db[text1], self.db[text2])

    def cossim(self, a, b):
        """Return the cosine similarity ``a.b / (|a| * |b|)`` of two vectors.

        There is no guard against zero-length vectors.
        """
        return np.dot(a, b)/(norm(a)*norm(b))

    def save(self, file_name):
        """Pickle the cache to ``file_name + '.p'``.

        The file is closed on exit so the data is flushed to disk before
        this method returns.
        """
        with open(file_name + '.p', 'wb') as cache_file:
            pickle.dump(self.db, cache_file)

In [2]:
from sdgclassification.benchmark import Benchmark

# Load the SDG targets, with dates stripped because dates seem irrelevant to the classification task.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The context manager closes the file handle once loading is done.
with open('targets.p', 'rb') as targets_file:
    targets = pickle.load(targets_file)

# Load the cached text embeddings for the targets and the texts from the benchmark.
# False is passed as the API key; the cells visible in this section only read
# cached vectors via DB.get_sim (assumes every needed text is already in db_vectors.p).
DB = VectorDB(False, 'db_vectors')

The main issue with using similarity between text embeddings is that there is no natural threshold.

Should a text be considered related to an SDG if the similarity is above 0.5? 0.1? 0.9?

The thresholds below are selected to maximize accuracy, so they probably overestimate it.

A proper approach would be to use one part of the dataset to determine the cutoffs and then compute accuracy on another part. However, this particular benchmark is too small for that. Alternatively, the entire precision/recall curves could be compared between two models.

In [3]:
# Similarity cutoff per SDG; entry i (0-based) is used for SDG i + 1.
thresholds = [
    0.4244574244574244,   # SDG 1
    0.35448535448535445,  # SDG 2
    0.3784393784393784,   # SDG 3
    0.3233653233653233,   # SDG 4
    0.3743923743923744,   # SDG 5
    0.403058403058403,    # SDG 6
    0.39357439357439356,  # SDG 7
    0.36266536266536264,  # SDG 8
    0.39408739408739407,  # SDG 9
    0.36786736786736784,  # SDG 10
    0.41178041178041175,  # SDG 11
    0.4087314087314087,   # SDG 12
    0.4266624266624266,   # SDG 13
]

To predict the SDGs for a text, we first determine, for each SDG, the target with the highest similarity to the text.

The idea is that being related to a single target is sufficient for a text to be related to that target's SDG.

One could try using different thresholds for different targets, but that increases the risk of overfitting.

In [5]:
def predict_sdgs(text):
    """Return the SDG numbers (1-13) predicted for ``text``, in ascending order.

    An SDG is predicted when the highest cosine similarity between ``text``
    and any of that SDG's targets is strictly above the SDG's threshold.
    ``text`` and every target must already have a vector stored in ``DB``.
    """
    def best_similarity(goal):
        # Closest target decides: one related target is enough for the SDG.
        return np.max([DB.get_sim(text, candidate) for candidate in targets[goal]])

    return [
        goal
        for goal in range(1, 14)
        if best_similarity(goal) > thresholds[goal - 1]
    ]

In [6]:
# Hand the prediction function to the benchmark harness and run it.
# The run prints a per-SDG table (accuracy, precision, recall, F1, TP/FP/TN/FN),
# as seen in this cell's output.
benchmark = Benchmark(predict_sdgs)
benchmark.run()

################################################################################
Running benchmark
Results:
+---------+------+----------------+-----------------+--------------+------------+------+------+------+------+
| SDG     |    n |   Accuracy (%) |   Precision (%) |   Recall (%) |   F1 Score |   TP |   FP |   TN |   FN |
|---------+------+----------------+-----------------+--------------+------------+------+------+------+------|
| Average | 74.2 |           87.5 |            85.1 |         90.5 |       0.88 |   33 |  5.8 | 31.8 |  3.5 |
| 1       |   77 |           92.2 |            86.2 |         92.6 |       0.89 |   25 |    4 |   46 |    2 |
| 2       |   69 |           85.5 |            85.7 |         93.3 |       0.89 |   42 |    7 |   17 |    3 |
| 3       |   76 |           89.5 |            85.7 |         85.7 |       0.86 |   24 |    4 |   44 |    4 |
| 4       |   82 |           86.6 |            82.0 |         95.3 |       0.88 |   41 |    9 |   30 |    2 |
| 5       | 