In [2]:
import json
from sklearn.model_selection import train_test_split

from SGDClassifier import (
    NGO_SGDClassifier,
    set_up_training_data,
    save_classifier,
    load_classifier,
)

In [3]:
# Load the raw dataset. JSON is UTF-8 by specification, so the encoding is
# passed explicitly instead of relying on the platform's locale default.
with open("data.json", "r", encoding="utf-8") as in_file:
    dataset = json.load(in_file)

# The file is fully parsed at this point, so the split and the formatting run
# after the handle has been closed.
# Hold out 20% of the records for evaluation; the fixed random_state keeps the
# split reproducible across kernel restarts.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# NOTE(review): the second argument looks like an output path for the formatted
# training data -- confirm against set_up_training_data in SGDClassifier.py.
formatted_train_data = set_up_training_data(train_data, "formatted_train.json")

In [5]:
# Create a new classifier and fit it on the formatted training split produced
# from data.json in the previous cell.
classifier = NGO_SGDClassifier()
classifier.fit(formatted_train_data)

# Left as the last expression so the notebook displays the target map
# (theme name -> integer index in the output below).
classifier.get_target_map()



{'Animals': 11,
 'Arts and Culture': 14,
 'Children': 6,
 'Climate Change': 5,
 'Democracy and Governance': 15,
 'Disaster Recovery': 12,
 'Economic Development': 0,
 'Education': 1,
 'Environment': 7,
 'Health': 8,
 'Human Rights': 4,
 'Humanitarian Assistance': 9,
 'Hunger': 16,
 'LGBTQAI+': 17,
 'Microfinance': 2,
 'Sport': 13,
 'Technology': 10,
 'Women and Girls': 3}

In [6]:
# We can save and reload the classifier that is already fitted to the data.
# The reloaded copy (classifier2) is the one used for prediction and scoring
# in the cells below.
# NOTE(review): the on-disk format of "SGDClassifier.obj" is not visible from
# this notebook; if load_classifier unpickles it, only load trusted files -- confirm.
save_classifier(classifier, "SGDClassifier.obj")
classifier2 = load_classifier("SGDClassifier.obj")

In [7]:
# Run the reloaded classifier on the held-out split. predict() hands back two
# arrays: per-theme scores and the corresponding 0/1 theme assignments.
# NOTE(review): test_data is passed without going through set_up_training_data;
# predict() appears to accept the raw records -- confirm.
probabilities, predictions = classifier2.predict(test_data)

# Print both arrays, one after the other, each on its own line block.
print(probabilities, predictions, sep="\n")

[[0.24228852 0.93219118 0.05950462 ... 0.11416817 0.0642635  0.00777012]
 [0.65180474 0.76397265 0.20796356 ... 0.08220288 0.09921484 0.00577589]
 [0.29016208 0.65464337 0.08853504 ... 0.03731894 0.10973523 0.00698353]
 ...
 [0.32458861 0.24691815 0.06184393 ... 0.0353329  0.08893793 0.00889758]
 [0.79330378 0.7981592  0.24537278 ... 0.220265   0.12993512 0.00504916]
 [0.57190135 0.76971584 0.19705805 ... 0.11781661 0.13216392 0.00541462]]
[[0 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]]


In [8]:
# Get the f1 scores: a single mean F1 across documents plus a dict mapping each
# theme name to its own F1 score.
# NOTE(review): get_f1_scores() takes no arguments, so it presumably scores the
# most recent predict() call made on classifier2 -- confirm; if so, this cell
# depends on the prediction cell having been run first.
mean_document_f1_score, category_f1_scores = classifier2.get_f1_scores()
print("The mean f1 score among documents is:", mean_document_f1_score)
print(category_f1_scores)

The mean f1 score among documents is: 0.5762209499520222
{'Economic Development': 0.545054945054945, 'Education': 0.8351254480286738, 'Microfinance': 0.019047619047619046, 'Women and Girls': 0.7230392156862745, 'Human Rights': 0.40125391849529785, 'Climate Change': 0.18000000000000002, 'Children': 0.7444326617179214, 'Environment': 0.49808429118773945, 'Health': 0.7313997477931905, 'Humanitarian Assistance': 0.08695652173913045, 'Technology': 0.04040404040404041, 'Animals': 0.3793103448275862, 'Disaster Recovery': 0.045454545454545456, 'Sport': 0.03389830508474576, 'Arts and Culture': 0, 'Democracy and Governance': 0.04347826086956522, 'Hunger': 0.048192771084337345, 'LGBTQAI+': 0}
