In [70]:
import sys
import json
import re
import numpy as np

from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    CountVectorizer,
    TfidfTransformer,
)
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

sys.path.append("..")

from utils.dataset_db import dynamo_db

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

"""
Not used in optimal sol'n - also makes it take hours to run
"""


class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


def get_words(text):
    text = text.lower()
    wordlist = text.split()
    clean_list = []
    for word in wordlist:
        # only get words (no digits)
        if not word.isdigit() and not re.match(r"[^\w]", word):
            clean_list.append(word)

    return " ".join(clean_list)


"""
Doesn't actually train, just generates a JSON in the format we want of training data
"""


def train(dataset, matching_dataset):
    next_index = 0
    themes = {}  # themes to indices
    targets = []  # indices of themes, parallel to text array
    text = []
    urls = []

    i = 0
    for project in dataset:
        m_themes = project["themes"]
        for matching_project in matching_dataset:
            if matching_project["url"] == project["url"]:
                m_themes = matching_project["themes"]
                break
        if len(project["text"]) != 0:
            words = get_words(project["text"])
            urls.append(project["url"])
            text.append(words)
            targets.append([])
            for theme in m_themes:
                if theme["id"] not in themes:
                    themes[theme["id"]] = next_index
                    next_index += 1
                targets[i].append(themes[theme["id"]])
        i += 1

    data = {}
    data["themes"] = themes
    data["targets"] = targets
    data["urls"] = urls
    data["text"] = text
    with open("trained1.json", "w") as output_file:  # trained.json
        json.dump(data, output_file)

    return themes


"""
Vectorizes both training and testing data, then classifies
"""


def classify(testing_data, testing_targets):
    with open("trained1.json", "r") as input_file:  # trained.json
        training_data = json.load(input_file)

    urls = []
    text = []
    targets = []
    i = 0
    for project in testing_data:
        if len(project["text"]) != 0:
            text.append(get_words(project["text"]))
            urls.append(project["url"])
            targets.append([])
            for theme in project["themes"]:
                if theme["id"] not in training_data["themes"]:
                    continue
                targets[i].append(training_data["themes"][theme["id"]])
        i += 1

    text_clf = Pipeline(
        [
            ("vect", CountVectorizer(ngram_range=(1, 2), max_df=0.6)),
            ("tfidf", TfidfTransformer()),
            # (
            #     "clf",
            #     SGDClassifier(random_state=42, max_iter=50, class_weight="balanced"),
            # ),
#             ("clf", OneVsRestClassifier(SGDClassifier(random_state=42, loss="log")))
            ("clf", DecisionTreeClassifier(random_state=42))
        ]
    )

    y = training_data["targets"]
    y = MultiLabelBinarizer().fit_transform(y)
    
    test_words = text
    text_clf.fit(training_data["text"], y)
    predicted = text_clf.predict_proba(test_words)
    log_predicted = text_clf.predict(test_words)

#     with open("predictions1.json", "w") as output_file:  # predictions.json
#         json.dump(
#             predicted.tolist(),
#             output_file,
#             separators=(",", ":"),
#             sort_keys=True,
#             indent=4,
#         )

    return predicted, log_predicted


def get_targets(data, data_with_targets, themes):
    targets = []
    i = 0
    for project in data:
        m_themes = project["themes"]
        for matching_project in data_with_targets:
            if matching_project["url"] == project["url"]:
                m_themes = matching_project["themes"]
                break
        targets.append([])
        for theme in m_themes:
            if theme["id"] not in themes:
                targets[i].append(-1)
            targets[i].append(themes[theme["id"]])
        i += 1

    return targets

In [9]:
dataset = dynamo_db.get_dataset("organizations_text")
print(len(dataset))
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)
print(len(train_data), len(test_data))

3419
2735 684


In [14]:
matching_dataset = dynamo_db.get_dataset("organizations")
print(len(matching_dataset))

5692


In [28]:
themes = train(train_data, matching_dataset)
print(themes)

{'ecdev': 0, 'edu': 1, 'finance': 2, 'gender': 3, 'rights': 4, 'climate': 5, 'children': 6, 'env': 7, 'health': 8, 'human': 9, 'tech': 10, 'animals': 11, 'disaster': 12, 'sport': 13, 'art': 14, 'democ': 15, 'hunger': 16, 'lgbtq': 17}


In [37]:
testing_targets = get_targets(test_data, matching_dataset, themes)
print(testing_targets)

[[6, 1, 9], [6, 0, 1, 3, 8, 9, 4, 10, 16], [3], [6, 0, 1, 8, 13], [0, 1, 2, 3], [6, 5, 15, 12, 0, 1, 7, 2, 3, 8, 9, 4, 10], [6, 1, 3, 9, 4, 10], [6, 5, 0, 1, 7, 3, 8, 4], [6, 1], [6, 5, 12, 0, 1, 7, 3, 8, 9, 4, 16, 14], [3], [6, 1, 3, 9, 4], [8], [11], [5, 0, 1, 7, 3, 10], [6, 5, 0, 1, 7, 3, 8, 9, 4], [6, 1], [6, 1, 3, 13], [6, 0, 1, 3, 8, 4], [1], [6, 1, 3, 9, 4, 16], [6, 1, 2, 3, 4], [6, 12, 1, 3, 9, 16], [6, 15, 0, 1, 7, 3, 8, 4, 10], [6, 0, 1, 7, 3, 8, 9, 16], [6, 5, 1, 7, 10], [6, 5, 12, 0, 1, 7, 3, 8, 9, 10], [0, 1, 8, 9], [6, 1, 8], [6, 0, 1, 2, 3, 8, 9, 4, 13, 14], [6, 0, 7, 3, 8, 9, 10], [6, 1, 3, 8, 9, 13], [1], [6, 1, 2, 3, 8, 4], [0, 1, 3], [0, 1, 7, 2, 3, 8, 9, 4], [6, 0, 1, 2, 3], [11, 6, 5, 15, 0, 1, 7, 2, 3, 8, 4, 10, 14], [6, 12, 0, 1, 7, 3, 8, 9], [1, 2], [6, 1], [6, 1, 3, 8, 4, 13], [6, 0, 1, 3, 8, 16], [6, 1, 4, 13], [5, 0, 1, 7, 3, 8, 4, 10], [6, 0, 1, 2, 3, 8, 13], [6, 0, 1, 2, 3, 8, 4, 16], [0, 7, 4], [12, 0, 1, 3, 14], [12, 1], [11, 0, 1, 7], [6, 1, 3, 10], [6, 

In [None]:
probabilities, predictions = classify(test_data, testing_targets)
print(probabilities)
print(predictions)
print(len(testing_targets))
# print(set(testing_targets))
# print(np.mean(predictions == testing_targets))
# print(metrics.confusion_matrix(testing_targets, predictions))

In [69]:
print(probabilities[0])
print(predictions[0])
testing_targets[0].sort()
testing_targets[0]

[0.24138187 0.93163593 0.05905105 0.4260415  0.21547512 0.06678046
 0.67277869 0.13913992 0.34276627 0.19255453 0.18798987 0.04413205
 0.07105612 0.08532089 0.07787218 0.11105406 0.06367558 0.00766928]
[0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


[1, 6, 9]

In [68]:
print(probabilities[1])
print(predictions[1])
testing_targets[1].sort()
testing_targets[1]

[0.65527132 0.76905202 0.20598563 0.83068604 0.26963452 0.16457732
 0.55105735 0.45727434 0.59985664 0.2202207  0.119725   0.04965593
 0.14895674 0.04327176 0.04688075 0.08486023 0.09889555 0.00608746]
[1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0]


[0, 1, 3, 4, 6, 8, 9, 10, 16]