In [1]:
import numpy as np
from model.helper import calculate_metrics
from sklearn.linear_model import LogisticRegression
from data.preprocess import load_from_file
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
name_format = "fcv-%s"

data = []
for i in range(10):
    data.append(load_from_file(name_format % i))

In [3]:
def evaluate(pred_scores, target):
    pred = np.argmax(pred_scores, axis=1)
    print(pred.shape)
    precision, recall, f1, accuracy = calculate_metrics(target, pred, classificationReport=True, num_classes=3)
    return precision ,recall, f1

In [4]:
def lr_with_freq(data):
    text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(2,5), analyzer="char")),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LogisticRegression(multi_class="multinomial", solver="lbfgs"))])    
    text_clf.fit(data["x_train"], data["y_train"])
    print("train")
    evaluate(text_clf.predict_proba(data["x_train"]), data["y_train"])
    print("test")
    evaluate(text_clf.predict_proba(data["x_test"]), data["y_test"])
    
    return text_clf

## Char-ngram Logistic Regression

In [5]:
for i in range(10):
    print("\n**cv-%s**" % i)
    lr_with_freq(data[i])


**cv-0**
train
(16515,)
             precision    recall  f1-score   support

       none     0.8740    0.9742    0.9214     11185
     racism     0.9075    0.7253    0.8062      1853
     sexism     0.9303    0.6868    0.7902      3477

avg / total     0.8896    0.8857    0.8808     16515

test
(1836,)
             precision    recall  f1-score   support

       none     0.8357    0.9453    0.8871      1243
     racism     0.8151    0.5777    0.6761       206
     sexism     0.8380    0.6150    0.7094       387

avg / total     0.8339    0.8344    0.8260      1836


**cv-1**
train
(16515,)
             precision    recall  f1-score   support

       none     0.8757    0.9750    0.9227     11185
     racism     0.9076    0.7264    0.8070      1853
     sexism     0.9325    0.6917    0.7943      3477

avg / total     0.8912    0.8874    0.8826     16515

test
(1836,)
             precision    recall  f1-score   support

       none     0.8266    0.9397    0.8795      1243
     racism  

### SVM

In [6]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


In [15]:
def svm(data):
    text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(2,5), analyzer="char")),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='log', penalty='l2'))])    
    text_clf.fit(data["x_train"], data["y_train"])
    print("train")
    evaluate(text_clf.predict_proba(data["x_train"]), data["y_train"])
    print("test")
    evaluate(text_clf.predict_proba(data["x_test"]), data["y_test"])
    
    return text_clf

In [16]:
for i in range(10):
    print("\n**cv-%s**" % i)
    svm(data[i])


**cv-0**
train
(16515,)
             precision    recall  f1-score   support

       none     0.8167    0.9705    0.8870     11185
     racism     0.8689    0.5936    0.7054      1853
     sexism     0.9055    0.5096    0.6522      3477

avg / total     0.8412    0.8312    0.8172     16515

test
(1836,)
             precision    recall  f1-score   support

       none     0.8061    0.9630    0.8776      1243
     racism     0.8306    0.5000    0.6242       206
     sexism     0.8811    0.5168    0.6515       387

avg / total     0.8246    0.8170    0.8015      1836


**cv-1**
train
(16515,)
             precision    recall  f1-score   support

       none     0.8196    0.9709    0.8889     11185
     racism     0.8722    0.5931    0.7061      1853
     sexism     0.9082    0.5234    0.6641      3477

avg / total     0.8441    0.8343    0.8210     16515

test
(1836,)
             precision    recall  f1-score   support

       none     0.8027    0.9461    0.8685      1243
     racism  