In [1]:
import numpy as np
from model.helper import calculate_metrics
from sklearn.linear_model import LinearRegression
from data.preprocess import load_from_file
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

Using TensorFlow backend.


In [2]:
sexism_data = load_from_file("sexism_final_binary")

# Print the shape of every split, in the order train inputs, train labels,
# test inputs, test labels.
for split in ("x_train", "y_train", "x_test", "y_test"):
    print(sexism_data[split].shape)

(13033,)
(13033,)
(3259,)
(3259,)


In [3]:
racism_data = load_from_file("racism_final_binary")

# Print the shape of every split, in the order train inputs, train labels,
# test inputs, test labels.
for split in ("x_train", "y_train", "x_test", "y_test"):
    print(racism_data[split].shape)

(11589,)
(11589,)
(2898,)
(2898,)


In [4]:
abusive_data = load_from_file("abusive_final_binary")

# Print the shape of every split, in the order train inputs, train labels,
# test inputs, test labels.
for split in ("x_train", "y_train", "x_test", "y_test"):
    print(abusive_data[split].shape)

(14558,)
(14558,)
(3642,)
(3642,)


In [5]:
def evaluate(pred_scores, target, label="Training", threshold=0.5):
    """Binarize real-valued scores, print precision/recall/F1, and return F1.

    Args:
        pred_scores: iterable of numeric scores, one per example.
        target: ground-truth labels, passed through unchanged to
            ``calculate_metrics`` as its first argument.
        label: prefix of the printed line. Defaults to "Training" so existing
            callers print exactly what they printed before; pass e.g. "Test"
            when scoring held-out data so the output is not mislabeled.
        threshold: scores ``>= threshold`` become class 1, all others class 0.

    Returns:
        The third value returned by ``calculate_metrics`` (unpacked here as F1).
    """
    # Kept as a plain list of Python ints, as before, because the input type
    # expected by calculate_metrics is not visible from this notebook.
    pred = [1 if score >= threshold else 0 for score in pred_scores]
    precision, recall, f1 = calculate_metrics(target, pred)
    print("%s: Precision=%.2f Recall=%.2f, F1=%.2f" % (label, precision, recall, f1))
    return f1

In [8]:
def lr_with_occur(data):
    """Fit a linear regression on character n-gram counts and score both splits.

    ``data`` is indexed with the keys "x_train", "y_train", "x_test" and
    "y_test". The vectorizer vocabulary is learned from the training inputs
    only and then reused for the test inputs.

    Prints the training feature-matrix shape, then calls ``evaluate`` first on
    the training predictions and second on the test predictions.

    Returns:
        (fitted model, predicted scores for the test split)
    """
    # Character n-grams of length 2 through 5.
    ngram_counter = CountVectorizer(analyzer="char", ngram_range=(2,5))
    train_matrix = ngram_counter.fit_transform(data["x_train"])
    print(train_matrix.shape)

    model = LinearRegression().fit(train_matrix, data["y_train"])
    train_scores = model.predict(train_matrix)
    evaluate(train_scores, data["y_train"])

    # Only transform (never refit) on the held-out inputs.
    test_matrix = ngram_counter.transform(data["x_test"])
    test_scores = model.predict(test_matrix)
    evaluate(test_scores, data["y_test"])
    return model, test_scores

In [9]:
def lr_with_freq(data):
    """Fit a linear regression on TF-IDF weighted character n-grams and score both splits.

    ``data`` is indexed with the keys "x_train", "y_train", "x_test" and
    "y_test". Both the n-gram vocabulary and the TF-IDF weighting are fitted
    on the training inputs only and then applied to the test inputs.

    Prints the shape of the raw training count matrix, then calls ``evaluate``
    first on the training predictions and second on the test predictions.

    Returns:
        (fitted model, predicted scores for the test split)
    """
    # Character n-grams of length 2 through 5.
    ngram_counter = CountVectorizer(analyzer="char", ngram_range=(2,5))
    train_counts = ngram_counter.fit_transform(data["x_train"])
    print(train_counts.shape)

    weighter = TfidfTransformer()
    train_weighted = weighter.fit_transform(train_counts)

    model = LinearRegression().fit(train_weighted, data["y_train"])
    train_scores = model.predict(train_weighted)
    evaluate(train_scores, data["y_train"])

    # Held-out inputs go through the already-fitted vectorizer and weighting.
    test_counts = ngram_counter.transform(data["x_test"])
    test_weighted = weighter.transform(test_counts)
    test_scores = model.predict(test_weighted)
    evaluate(test_scores, data["y_test"])
    return model, test_scores

## Char-ngram Linear Regression (scores thresholded at 0.5)

### Sexism

In [10]:
# Raw char n-gram counts on the sexism split. Output: feature-matrix shape, then train metrics,
# then test metrics (both metric lines are printed with the "Training:" prefix).
lr_with_occur(sexism_data)

(13033, 209248)
Training: Precision=0.99 Recall=0.98, F1=0.98
Training: Precision=0.68 Recall=0.65, F1=0.66


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([-0.26742584,  0.08863401, -0.36218095, ...,  0.99999999,
         0.6603459 ,  0.54225431]))

In [11]:
# TF-IDF weighted char n-grams on the sexism split. Output: count-matrix shape, then train metrics,
# then test metrics (both metric lines are printed with the "Training:" prefix).
lr_with_freq(sexism_data)

(13033, 209248)
Training: Precision=0.98 Recall=0.98, F1=0.98
Training: Precision=0.74 Recall=0.65, F1=0.69


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([-0.19096738,  0.33023047,  0.10044193, ...,  0.99999996,
         0.56496723,  0.29963298]))

### Racism

In [12]:
# Raw char n-gram counts on the racism split. Output: feature-matrix shape, then train metrics,
# then test metrics (both metric lines are printed with the "Training:" prefix).
lr_with_occur(racism_data)

(11589, 190590)
Training: Precision=1.00 Recall=1.00, F1=1.00
Training: Precision=0.70 Recall=0.63, F1=0.66


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([ 0.31047233,  0.11712028, -0.05471729, ...,  0.5017374 ,
         0.585294  ,  0.9479977 ]))

In [13]:
# TF-IDF weighted char n-grams on the racism split. Output: count-matrix shape, then train metrics,
# then test metrics (both metric lines are printed with the "Training:" prefix).
lr_with_freq(racism_data)

(11589, 190590)
Training: Precision=1.00 Recall=1.00, F1=1.00
Training: Precision=0.74 Recall=0.64, F1=0.69


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([ 0.37266129,  0.03311203,  0.02875123, ...,  0.49127743,
         0.71657175,  0.8544037 ]))

### Abusive (combined)

In [11]:
# Raw char n-gram counts on the combined abusive split. Output: feature-matrix shape, then train
# metrics, then test metrics (both metric lines are printed with the "Training:" prefix).
lr_with_occur(abusive_data)

(14558, 217026)
Training: Precision=0.99 Recall=0.99, F1=0.99
Training: Precision=0.63 Recall=0.63, F1=0.63


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([ 0.07642635,  0.35287011,  0.96663645, ...,  0.2848371 ,
         0.89408181,  1.31403683]))

In [14]:
# TF-IDF weighted char n-grams on the combined abusive split. Output: count-matrix shape, then train
# metrics, then test metrics (both metric lines are printed with the "Training:" prefix).
lr_with_freq(abusive_data)

(14558, 217026)
Training: Precision=0.99 Recall=0.99, F1=0.99
Training: Precision=0.66 Recall=0.64, F1=0.65


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 array([-0.01216694,  0.6522212 ,  0.66981284, ...,  0.22203411,
         0.84068746,  0.98236891]))