In [10]:
import sys
import json
import re
import numpy as np

from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer,
    TfidfTransformer,
)
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

sys.path.append("..")

from utils.dataset_db import dynamo_db

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

"""
LemmaTokenizer is not used in the optimal solution; using it also makes a run take hours.
"""


class LemmaTokenizer(object):
    """Callable that splits a document into tokens and lemmatizes each one.

    Tokenization is done with NLTK's ``word_tokenize`` and every token is
    passed through a ``WordNetLemmatizer`` held on the instance.
    """

    def __init__(self):
        # One lemmatizer per tokenizer instance, reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``, in token order."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas


def get_words(text):
    """Normalize raw text into a space-separated string of kept tokens.

    The text is lower-cased and split on whitespace. A token is dropped when
    it consists only of digits or when its first character is not a word
    character; every other token is kept unchanged.

    Args:
        text: the raw document string.

    Returns:
        The kept tokens joined by single spaces ("" when nothing is kept).
    """
    kept_tokens = [
        token
        for token in text.lower().split()
        # re.match anchors at the start, so only the first character is tested.
        if not (token.isdigit() or re.match(r"[^\w]", token))
    ]
    return " ".join(kept_tokens)


"""
Despite its name, train() doesn't fit a model; it just writes the training data to a JSON file in the format we want.
"""


def train(dataset, matching_dataset):
    """Write the training data to ``trained1.json`` and return the theme index.

    No model is fitted here. Each project with non-empty text contributes its
    cleaned text, its URL and the list of integer indices of its themes.
    Theme ids are assigned consecutive indices in the order they are first
    seen.

    Args:
        dataset: iterable of project dicts; "url" and "text" are read, and
            "themes" is read when no entry in ``matching_dataset`` shares the
            project's URL.
        matching_dataset: iterable of project dicts with "url" and "themes".
            When one shares a URL with a project in ``dataset``, its themes
            are used instead of the project's own (first match wins).

    Returns:
        dict mapping theme id -> integer index.

    Side effects:
        Overwrites ``trained1.json`` in the current working directory with
        the keys "themes", "targets", "urls" and "text"; the last three are
        parallel lists.
    """
    # Index the matching projects by URL once instead of rescanning the whole
    # list for every project. setdefault keeps the first entry per URL, which
    # is what the original scan-and-break selected.
    first_match_by_url = {}
    for matching_project in matching_dataset:
        first_match_by_url.setdefault(matching_project["url"], matching_project)

    themes = {}  # theme id -> index
    targets = []  # theme indices per kept project, parallel to `text`
    text = []
    urls = []

    for project in dataset:
        # Projects without text carry nothing to learn from and are skipped.
        if len(project["text"]) == 0:
            continue

        match = first_match_by_url.get(project["url"])
        m_themes = project["themes"] if match is None else match["themes"]

        # Build this project's label list directly. Indexing `targets` with a
        # counter that also advanced for skipped projects ran past the end of
        # the list as soon as one project had empty text.
        project_targets = []
        for theme in m_themes:
            # len(themes) is evaluated before insertion, so a new theme gets
            # the next free index.
            project_targets.append(themes.setdefault(theme["id"], len(themes)))

        urls.append(project["url"])
        text.append(get_words(project["text"]))
        targets.append(project_targets)

    data = {
        "themes": themes,
        "targets": targets,
        "urls": urls,
        "text": text,
    }
    with open("trained1.json", "w") as output_file:
        json.dump(data, output_file)

    return themes


"""
Vectorizes both training and testing data, then classifies
"""


def classify(testing_data, testing_targets):
    """Fit the text pipeline on ``trained1.json`` and predict themes for test data.

    The training text and targets are read from ``trained1.json`` (as written
    by ``train``). The pipeline is a bigram count vectorizer, a TF-IDF
    transform and a one-vs-rest ``SGDClassifier`` with a logistic loss.

    Args:
        testing_data: iterable of project dicts; only "text" is read.
            Projects whose text is empty are skipped, so the returned arrays
            have one row per project with non-empty text, in input order.
        testing_targets: not used by this function; kept so existing callers
            keep working.

    Returns:
        Tuple ``(predicted, log_predicted)``: the ``predict_proba`` output and
        the ``predict`` output of the fitted pipeline for the kept projects.
    """
    with open("trained1.json", "r") as input_file:
        training_data = json.load(input_file)

    # Only the cleaned text is needed for prediction. The earlier per-project
    # `targets`/`urls` lists were never used, and their `targets[i]` indexing
    # raised IndexError once any project with empty text had been skipped.
    test_words = [
        get_words(project["text"])
        for project in testing_data
        if len(project["text"]) != 0
    ]

    # Commented-out alternatives previously kept here: a plain
    # SGDClassifier(random_state=42, max_iter=50, class_weight="balanced")
    # and DecisionTreeClassifier(random_state=42).
    text_clf = Pipeline(
        [
            ("vect", CountVectorizer(ngram_range=(1, 2), max_df=0.6)),
            ("tfidf", TfidfTransformer()),
            # NOTE(review): newer scikit-learn releases spell this loss
            # "log_loss"; confirm against the installed version before upgrading.
            ("clf", OneVsRestClassifier(SGDClassifier(random_state=42, loss="log"))),
        ]
    )

    # Turn the per-document lists of theme indices into a binary indicator matrix.
    y = MultiLabelBinarizer().fit_transform(training_data["targets"])

    text_clf.fit(training_data["text"], y)
    predicted = text_clf.predict_proba(test_words)
    log_predicted = text_clf.predict(test_words)

    return predicted, log_predicted


def get_targets(data, data_with_targets, themes):
    """Return the list of theme indices for every project in ``data``.

    Args:
        data: iterable of project dicts; "url" is read, and "themes" is read
            when no entry in ``data_with_targets`` shares the project's URL.
        data_with_targets: iterable of project dicts with "url" and "themes".
            When one shares a URL with a project in ``data``, its themes are
            used instead of the project's own (first match wins).
        themes: dict mapping theme id -> integer index.

    Returns:
        A list parallel to ``data``; each element is the list of indices of
        that project's themes, with -1 standing in for any theme id that is
        not a key of ``themes``.
    """
    # Index the reference projects by URL once instead of rescanning the whole
    # list for every project. setdefault keeps the first entry per URL, which
    # is what the original scan-and-break selected.
    first_match_by_url = {}
    for matching_project in data_with_targets:
        first_match_by_url.setdefault(matching_project["url"], matching_project)

    targets = []
    for project in data:
        match = first_match_by_url.get(project["url"])
        m_themes = project["themes"] if match is None else match["themes"]
        # Unknown themes become -1. Previously the -1 append fell through to
        # themes[theme["id"]] and raised KeyError for exactly those themes.
        targets.append([themes.get(theme["id"], -1) for theme in m_themes])

    return targets

In [None]:
# Load the text dataset and hold out 20% of it as the test split. The fixed
# random_state keeps the split the same from run to run.
dataset = dynamo_db.get_dataset("organizations_text")
print(len(dataset))
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
print(len(train_data), len(test_data))

In [None]:
# Load the "organizations" dataset. train() and get_targets() look projects up
# in it by URL and take the themes from the matching entry when one exists.
matching_dataset = dynamo_db.get_dataset("organizations")
print(len(matching_dataset))

In [11]:
# Write the training data to trained1.json and keep the theme-id -> index
# mapping that train() returns.
themes = train(train_data, matching_dataset)
print(themes)

{'ecdev': 0, 'edu': 1, 'finance': 2, 'gender': 3, 'rights': 4, 'climate': 5, 'children': 6, 'env': 7, 'health': 8, 'human': 9, 'tech': 10, 'animals': 11, 'disaster': 12, 'sport': 13, 'art': 14, 'democ': 15, 'hunger': 16, 'lgbtq': 17}


In [12]:
# Theme indices for every project in the test split, expressed with the
# mapping built from the training split.
testing_targets = get_targets(test_data, matching_dataset, themes)
print(testing_targets)

[[6, 1, 9], [6, 0, 1, 3, 8, 9, 4, 10, 16], [3], [6, 0, 1, 8, 13], [0, 1, 2, 3], [6, 5, 15, 12, 0, 1, 7, 2, 3, 8, 9, 4, 10], [6, 1, 3, 9, 4, 10], [6, 5, 0, 1, 7, 3, 8, 4], [6, 1], [6, 5, 12, 0, 1, 7, 3, 8, 9, 4, 16, 14], [3], [6, 1, 3, 9, 4], [8], [11], [5, 0, 1, 7, 3, 10], [6, 5, 0, 1, 7, 3, 8, 9, 4], [6, 1], [6, 1, 3, 13], [6, 0, 1, 3, 8, 4], [1], [6, 1, 3, 9, 4, 16], [6, 1, 2, 3, 4], [6, 12, 1, 3, 9, 16], [6, 15, 0, 1, 7, 3, 8, 4, 10], [6, 0, 1, 7, 3, 8, 9, 16], [6, 5, 1, 7, 10], [6, 5, 12, 0, 1, 7, 3, 8, 9, 10], [0, 1, 8, 9], [6, 1, 8], [6, 0, 1, 2, 3, 8, 9, 4, 13, 14], [6, 0, 7, 3, 8, 9, 10], [6, 1, 3, 8, 9, 13], [1], [6, 1, 2, 3, 8, 4], [0, 1, 3], [0, 1, 7, 2, 3, 8, 9, 4], [6, 0, 1, 2, 3], [11, 6, 5, 15, 0, 1, 7, 2, 3, 8, 4, 10, 14], [6, 12, 0, 1, 7, 3, 8, 9], [1, 2], [6, 1], [6, 1, 3, 8, 4, 13], [6, 0, 1, 3, 8, 16], [6, 1, 4, 13], [5, 0, 1, 7, 3, 8, 4, 10], [6, 0, 1, 2, 3, 8, 13], [6, 0, 1, 2, 3, 8, 4, 16], [0, 7, 4], [12, 0, 1, 3, 14], [12, 1], [11, 0, 1, 7], [6, 1, 3, 10], [6, 

In [13]:
# Fit on the saved training data and predict for the test split. classify()
# returns the predict_proba output first and the predict output second; it
# does not read its second argument.
probabilities, predictions = classify(test_data, testing_targets)
print(probabilities)
print(predictions)
print(len(testing_targets))
# print(set(testing_targets))
# print(np.mean(predictions == testing_targets))
# print(metrics.confusion_matrix(testing_targets, predictions))



[[0.24138187 0.93163593 0.05905105 ... 0.11105406 0.06367558 0.00766928]
 [0.65527132 0.76905202 0.20598563 ... 0.08486023 0.09889555 0.00608746]
 [0.29054517 0.65003444 0.08946829 ... 0.03780805 0.11125251 0.00717774]
 ...
 [0.32604578 0.25301279 0.06052227 ... 0.03427857 0.08634965 0.00870725]
 [0.79367794 0.79417985 0.24737021 ... 0.21563612 0.13775254 0.00531084]
 [0.56765046 0.75959952 0.19742883 ... 0.1176932  0.13259303 0.00546089]]
[[0 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]]
684


In [14]:
# Inspect the first row: per-theme probabilities, the 0/1 predictions, and the
# true theme indices in ascending order.
# NOTE(review): classify() skips projects with empty text while get_targets()
# does not, so row 0 of both only refers to the same project when no test
# project had empty text; confirm.
print(probabilities[0])
print(predictions[0])
# sorted() returns a new list, so the shared testing_targets is not mutated by
# displaying it.
sorted(testing_targets[0])

[0.24138187 0.93163593 0.05905105 0.4260415  0.21547512 0.06678046
 0.67277869 0.13913992 0.34276627 0.19255453 0.18798987 0.04413205
 0.07105612 0.08532089 0.07787218 0.11105406 0.06367558 0.00766928]
[0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


[1, 6, 9]

In [15]:
# Inspect the second row: per-theme probabilities, the 0/1 predictions, and the
# true theme indices in ascending order.
# NOTE(review): classify() skips projects with empty text while get_targets()
# does not, so row 1 of both only refers to the same project when no test
# project had empty text; confirm.
print(probabilities[1])
print(predictions[1])
# sorted() returns a new list, so the shared testing_targets is not mutated by
# displaying it.
sorted(testing_targets[1])

[0.65527132 0.76905202 0.20598563 0.83068604 0.26963452 0.16457732
 0.55105735 0.45727434 0.59985664 0.2202207  0.119725   0.04965593
 0.14895674 0.04327176 0.04688075 0.08486023 0.09889555 0.00608746]
[1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0]


[0, 1, 3, 4, 6, 8, 9, 10, 16]

In [21]:
# Per-theme evaluation: for each theme, count how the 0/1 predictions compare
# with the true labels over all rows, then print accuracy, precision, recall,
# F1 and the 2x2 table (rows = predicted T/F, columns = actual T/F).
for theme_name, theme_number in themes.items():
    tp = tn = fp = fn = 0
    for i, predicted_row in enumerate(predictions):
        flag = predicted_row[theme_number]
        if flag == 1:
            if theme_number in testing_targets[i]:
                tp += 1
            else:
                fp += 1
        elif flag == 0:
            if theme_number in testing_targets[i]:
                fn += 1
            else:
                tn += 1

    predicted_positive = tp + fp
    actual_positive = tp + fn

    accuracy = (tp + tn) / len(predictions)
    # Each ratio falls back to 0 when its denominator is empty.
    precision = tp / predicted_positive if predicted_positive != 0 else 0
    recall = tp / actual_positive if actual_positive != 0 else 0
    if (precision + recall) != 0:
        f1 = 2 * ((precision * recall) / (precision + recall))
    else:
        f1 = 0

    print(theme_name + ":")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1:", f1)
    print("    T  F")
    print(f"T  {tp}, {fp}")
    print(f"F  {fn}, {tn}")
    print()

ecdev:
Accuracy: 0.7002923976608187
Precision: 0.6813186813186813
Recall: 0.4575645756457565
F1: 0.5474613686534217
    T  F
T  124, 58
F  147, 355

edu:
Accuracy: 0.7353801169590644
Precision: 0.7342767295597484
Recall: 0.9749478079331941
F1: 0.8376681614349777
    T  F
T  467, 169
F  12, 36

finance:
Accuracy: 0.847953216374269
Precision: 0.5
Recall: 0.009615384615384616
F1: 0.01886792452830189
    T  F
T  1, 1
F  103, 579

gender:
Accuracy: 0.6681286549707602
Precision: 0.6607538802660754
Recall: 0.8010752688172043
F1: 0.7241798298906439
    T  F
T  298, 153
F  74, 159

rights:
Accuracy: 0.7207602339181286
Precision: 0.6923076923076923
Recall: 0.27876106194690264
F1: 0.39747634069400634
    T  F
T  63, 28
F  163, 430

climate:
Accuracy: 0.8801169590643275
Precision: 0.9
Recall: 0.1
F1: 0.18000000000000002
    T  F
T  9, 1
F  81, 593

children:
Accuracy: 0.6476608187134503
Precision: 0.6481481481481481
Recall: 0.8728179551122195
F1: 0.7438894792773645
    T  F
T  350, 190
F  51, 93



In [22]:
# Per-document evaluation: build the confusion counts over all themes for each
# row of predictions, then derive that document's F1 score and accuracy.
f1_scores = []
accuracies = []
for i, predicted_row in enumerate(predictions):
    true_labels = testing_targets[i]
    tp = fp = fn = tn = 0
    for j, flag in enumerate(predicted_row):
        is_true_label = j in true_labels
        if flag == 1:
            if is_true_label:
                tp += 1
            else:
                fp += 1
        elif is_true_label:
            fn += 1
        else:
            tn += 1

    # Each ratio falls back to 0 when its denominator is empty.
    precision = tp / (tp + fp) if tp + fp != 0 else 0
    recall = tp / (tp + fn) if tp + fn != 0 else 0
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)

    f1_scores.append(f1)
    accuracies.append((tp + tn) / len(predicted_row))

print(np.array(f1_scores).mean())
print(f1_scores)
print(np.array(accuracies).mean())
print(accuracies)

0.5767068503605232
[0.8, 0.7142857142857143, 0.4, 0.6666666666666665, 0.6666666666666665, 0.631578947368421, 0.6, 0.7692307692307693, 0.8, 0.7368421052631579, 0.5, 0.6666666666666665, 0.6666666666666666, 0, 0.6666666666666666, 0.6153846153846153, 0.5714285714285715, 0.6666666666666666, 0.8, 0.33333333333333337, 0.5454545454545454, 0.6666666666666665, 0.4444444444444444, 0.5, 0.6666666666666666, 0.5714285714285715, 0.7499999999999999, 0.6666666666666665, 0.8, 0.33333333333333337, 0.4, 0.8, 0.4, 0.8, 0.8571428571428571, 0.36363636363636365, 0.8000000000000002, 0.7000000000000001, 0.7692307692307693, 0.28571428571428575, 1.0, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6153846153846154, 0.7692307692307692, 0.7692307692307693, 0.8, 0.5714285714285715, 0.4, 0.5, 0.4, 0.8571428571428571, 0.4, 0.888888888888889, 0.5, 1.0, 1.0, 0, 0.8571428571428571, 0.7499999999999999, 0.888888888888889, 0.2857142857142857, 0.28571428571428575, 0.75, 0.4444444444444444, 0.6666666666666666, 0