In [8]:
from io import open
from conllu import parse_incr

data_file = open("../data/processed/dev_sexism_dataset_conllu.conllu", "r")

X = []
y = []
for tokenlist in parse_incr(data_file):
    y.append(tokenlist.metadata["label_sexist"])
    tokens = [token["form"] for token in tokenlist]
    X.append(tokens)

In [12]:
max_len = 0
for x in X:
    if len(x) > max_len:
        max_len = len(x)

max_len

49

In [10]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X, y)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2000,) + inhomogeneous part.

In [11]:
from collections import Counter, defaultdict

class SimpleNBClassifier():
    def __init__(self):
        self.word_count = Counter()
        self.count_by_class = defaultdict(Counter)
        self.labels = set()

    def count_words(self, docs):
        for words, label in docs:
            self.labels.add(label)
            for word in set(words):
                self.word_count[word] += 1
                self.count_by_class[label][word] += 1

    def calculate_weights(self):
        self.weights = {
            word: {
                label: log((self.count_by_class[label][word] + 1) / (count + len(self.word_count)))
                for label in self.labels}
            for word, count in self.word_count.items()}

    def get_doc_weights(self, doc):
        return {
            label: sum(
                self.weights[word][label] if word in self.weights else log(1)
                for word in doc)
            for label in self.labels}

    def predict_label(self, doc):
        doc_weights = self.get_doc_weights(doc)
        return sorted(doc_weights.items(), key=lambda x: -x[1])[0][0]