In [1]:
import yaml
import logging
from module import Preprocessor, Predictor

FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger('global_logger')
# Without an explicit level this logger inherits the root threshold
# (WARNING), so logger.info(...) calls made with it would be dropped.
logger.setLevel(logging.INFO)

# Only the YAML parsing sits inside the try block: it is the only statement
# that can raise yaml.YAMLError, and the file handle is not needed afterwards.
with open('./config/config.yaml', 'r') as config_file:
    try:
        config = yaml.safe_load(config_file)
    except yaml.YAMLError as err:
        logger.warning('Config file err: {}'.format(err))
        # Re-raise so the failure surfaces in this cell instead of as a
        # NameError on `config` / `preprocessor` in a later cell.
        raise

# Build the preprocessor from the 'preprocessing' section of the config and
# unpack the seven arrays returned by process().
preprocessor = Preprocessor(config['preprocessing'], logger)
data_x, data_y, train_x, train_y, validate_x, validate_y, test_x = preprocessor.process()

In [11]:
# Display column 0 of train_y (all rows) as the cell output.
train_y[:, 0]

array([0, 0, 0, ..., 0, 0, 0])

In [45]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
import numpy as np

class NaiveBayer(object):
    """Collection of independent ``MultinomialNB`` models, one per class name.

    Column ``idx`` of the label matrix passed to :meth:`fit` is the target for
    the model stored under ``self.classes[idx]``. Predictions and probabilities
    are returned as ``(n_samples, len(classes))`` float arrays in the same
    column order.
    """

    def __init__(self, classes):
        """Create one unfitted ``MultinomialNB`` per entry of ``classes``.

        Args:
            classes: Ordered iterable of hashable class names; the order fixes
                the column order of every array this object consumes or returns.
        """
        self.models = {}
        self.classes = classes
        for cls in self.classes:
            self.models[cls] = MultinomialNB()

    def fit(self, train_x, train_y):
        """Fit every per-class model on ``train_x``.

        Args:
            train_x: Feature matrix handed unchanged to each model's ``fit``.
            train_y: 2-D label array; only the first ``len(self.classes)``
                columns are read, column ``idx`` for ``self.classes[idx]``.
        """
        for idx, cls in enumerate(self.classes):
            class_labels = train_y[:, idx]
            self.models[cls].fit(train_x, class_labels)

    def predict(self, test_x):
        """Return hard predictions, one column per class.

        Args:
            test_x: Feature matrix exposing ``shape[0]`` (number of samples).

        Returns:
            ``numpy`` float array of shape ``(test_x.shape[0], len(classes))``.
        """
        predictions = np.zeros((test_x.shape[0], len(self.classes)))
        for idx, cls in enumerate(self.classes):
            predictions[:, idx] = self.models[cls].predict(test_x)
        return predictions

    def predict_prob(self, test_x):
        """Return, per class, the probability of the second (positive) label.

        For a model that saw two or more label values during ``fit`` this is
        column 1 of ``predict_proba``, exactly as before. A model that saw a
        single label value only has one probability column, so indexing
        column 1 would raise ``IndexError``; in that case the column is used
        if that single label equals 1, otherwise the probability stays 0.

        Args:
            test_x: Feature matrix exposing ``shape[0]`` (number of samples).

        Returns:
            ``numpy`` float array of shape ``(test_x.shape[0], len(classes))``.
        """
        probs = np.zeros((test_x.shape[0], len(self.classes)))
        for idx, cls in enumerate(self.classes):
            model = self.models[cls]
            class_probs = model.predict_proba(test_x)
            if class_probs.shape[1] > 1:
                probs[:, idx] = class_probs[:, 1]
            elif model.classes_[0] == 1:
                # Only the positive label was ever seen for this class.
                probs[:, idx] = class_probs[:, 0]
            # Otherwise only a non-positive label was seen: keep the zeros.
        return probs


class Trainer(object):
    """Builds the model named in the config and drives fitting/validation."""

    def __init__(self, config, logger, classes):
        """Store the collaborators and build the configured model.

        Args:
            config: Mapping that must contain the key ``'model_name'``.
            logger: Object exposing ``warning`` and ``info``.
            classes: Class names forwarded to the model constructor.
        """
        self.config, self.logger, self.classes = config, logger, classes
        self._create_model(classes)

    def _create_model(self, classes):
        """Set ``self.model`` for a supported ``model_name``.

        For any other name a warning is logged and ``self.model`` is left
        unset.
        """
        if self.config['model_name'] != 'naivebayse':
            self.logger.warning("Model Type:{} is not support yet".format(self.config['model_name']))
            return
        self.model = NaiveBayer(classes)

    def fit_and_validate(self, train_x, train_y, validate_x, validate_y):
        """Fit on the training data, then predict on the validation features.

        ``validate_y`` is accepted but not used here; ``metrics`` is not
        invoked from this method.

        Returns:
            Tuple ``(model, predictions)``.
        """
        estimator = self.model
        estimator.fit(train_x, train_y)
        return estimator, estimator.predict(validate_x)

    def metrics(self, predictions, labels):
        """Log the accuracy and the classification report at INFO level."""
        self.logger.info("Validate Accuracy:{}".format(accuracy_score(labels, predictions)))
        self.logger.info("{}".format(classification_report(labels, predictions)))

    def fit(self, train_x, train_y):
        """Fit the model on the supplied data and return it."""
        estimator = self.model
        estimator.fit(train_x, train_y)
        return estimator

# Build the trainer from the 'training' config section; the class names come
# from the 'preprocessing' section's 'classes' entry.
trainer = Trainer(config['training'], logger, config['preprocessing']['classes'])

In [46]:
# Fit on the training split and predict on the validation features.
# validate_y is passed along but fit_and_validate does not read it.
model, predictions = trainer.fit_and_validate(train_x, train_y, validate_x, validate_y)

In [61]:
# Remove the last column (index -1 along axis 1) of validate_y before scoring.
# NOTE(review): presumably that column is not one of the trained classes and
# dropping it aligns the labels with `predictions` — confirm what it holds.
validate_labels = np.delete(validate_y, -1, 1)
print(validate_labels.shape)
# The accuracy is stored in `accuracy` but not displayed by this cell.
accuracy = accuracy_score(validate_labels, predictions)

(47872, 6)


In [71]:
# Print the per-class precision/recall/F1 table for the validation predictions.
# zero_division=1 sets the value reported when a metric's denominator is zero
# (see the scikit-learn classification_report documentation).
print(classification_report(validate_labels, predictions,zero_division=1))

              precision    recall  f1-score   support

           0       0.79      0.60      0.68      4591
           1       0.42      0.41      0.42       485
           2       0.74      0.59      0.65      2527
           3       0.14      0.03      0.05       131
           4       0.68      0.53      0.59      2362
           5       0.31      0.10      0.15       430

   micro avg       0.72      0.55      0.62     10526
   macro avg       0.51      0.38      0.43     10526
weighted avg       0.71      0.55      0.61     10526
 samples avg       0.98      0.95      0.93     10526



In [48]:
# Refit the trainer's model on data_x / data_y and rebind `model` to it.
# trainer.fit returns trainer.model, i.e. the same object fitted earlier.
model = trainer.fit(data_x, data_y)

In [54]:
import csv

class Predictor(object):
    """Thin wrapper around a fitted model that also writes results to CSV.

    Note: defining this class rebinds the name ``Predictor`` that the notebook
    also imports from ``module`` in its first cell.
    """

    def __init__(self, config, logger, model):
        """Store the collaborators.

        Args:
            config: Mapping; ``save_result`` reads its ``'output_path'`` key.
            logger: Logger object (stored, not used by the methods below).
            model: Object exposing ``predict`` and ``predict_prob``.
        """
        self.config = config
        self.logger = logger
        self.model = model

    def predict(self, test_x):
        """Return ``self.model.predict(test_x)``."""
        return self.model.predict(test_x)

    def predict_prob(self, test_x):
        """Return ``self.model.predict_prob(test_x)``."""
        return self.model.predict_prob(test_x)

    def save_result(self, test_ids, probs):
        """Write one CSV row per test id to ``config['output_path']``.

        Args:
            test_ids: Iterable of ids, paired positionally with rows of ``probs``.
            probs: Array exposing ``tolist()`` that yields one list of values
                per row; the values are written after the id.
        """
        header = ['id','toxic','severe_toxic','obscene','threat','insult','identity_hate']
        # newline='' is what the csv module asks for: the writer emits its own
        # line terminator, and text-mode newline translation would otherwise
        # double it on platforms that translate '\n'.
        with open(self.config['output_path'], 'w', newline='') as output_csv_file:
            writer = csv.writer(output_csv_file)
            writer.writerow(header)
            for test_id, prob in zip(test_ids, probs.tolist()):
                writer.writerow([test_id] + prob)


In [55]:
# Wrap the fitted model, compute per-class probabilities for the test
# features, and write them with the test ids to config['predict']['output_path'].
predictor = Predictor(config['predict'], logger, model)
probs = predictor.predict_prob(test_x)
predictor.save_result(preprocessor.test_ids, probs)

In [14]:
# NOTE(review): the Predictor.save_result defined in this notebook requires
# (test_ids, probs), so this zero-argument call would raise TypeError against
# that class. The recorded execution count (14) is lower than that of the cell
# defining Predictor (54), so this was presumably run against a different
# `predictor` object — confirm, then update or remove this cell.
predictor.save_result()

0


In [2]:
from tensorflow import keras
from keras import layers