# Citation: Code inspired by https://www.kaggle.com/code/jhoward/nb-svm-strong-linear-baseline
# Data source: https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge


In [1]:
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re, string


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label].max(axis=1)
train.describe()


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()


In [4]:
n = train.shape[0]
vec = TfidfVectorizer(tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode')
trn_term_doc = vec.fit_transform(train['comment_text'])
test_term_doc = vec.transform(test['comment_text'])


In [5]:
def pr(y_i, y): # Basic Naive Bayes feature equation
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

def get_mdl(y): #Fit a model each labels
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r


In [8]:
from sklearn.linear_model import LogisticRegression

x = trn_term_doc
test_x = test_term_doc
preds = np.zeros((len(test), len(label)))
array = []
for i, j in enumerate(label):
    print('fit', j) # Some labels reaches max iterations 
    m,r = get_mdl(train[j])
    array.append([m,r])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]


fit toxic


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fit severe_toxic


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fit obscene


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fit threat
fit insult


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


fit identity_hate


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
def predict_from_txt(txt): # The function use to predict. 
    # Return the highest probability
    vtxt = vec.transform([txt])
    likely_label = np.zeros((len(txt), len(label)))

    for i, j in enumerate(label):
        m,r = array[i]
        likely_label[:,i] = m.predict_proba(vtxt.multiply(r))[:,1] 
    return likely_label[0]

def get_label(predsx,threshold=0.1): #Get the label. 
    #Threshold can be adjusted to change the sensitivity
    out = predsx.tolist()
    min = 0
    lab = "not toxic"
    for i in range(len(out)):
        if out[i] > min and out[i]>threshold:
            min = out[i]
            lab = label[i]
    print(lab,min)

def classify(txt):
    predsx = predict_from_txt(txt)
    get_label(predsx)


In [11]:
text = "I hate you"
classify(text)

toxic 0.9846344956744573
