In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

# The six label columns of the dataset; the training loop later in this cell
# iterates over them and fits one model per label.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Used as a plain string prefix in the f-strings below, so the trailing slash
# is significant.
PATH = 'data/toxic/'

# Read both splits and replace every missing value with a single space.
train = pd.read_csv(f'{PATH}train.csv').fillna(' ')
test = pd.read_csv(f'{PATH}test.csv').fillna(' ')

# Comment text per split, plus the two series concatenated (train first).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])
import re, string

# Character class of every ASCII punctuation mark plus a few typographic
# symbols, captured so the substitution below can re-emit the match.
# string.punctuation is passed through re.escape: interpolated raw, its
# backslash-bracket pair is parsed inside the class as an escaped "]", which
# silently drops the backslash itself from the set of recognised punctuation.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into whitespace-separated tokens with punctuation isolated.

    Every character matched by ``re_tok`` is surrounded with spaces before the
    string is split on whitespace, so each punctuation mark becomes a token of
    its own.

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings; empty for an empty or all-whitespace input.
    """
    return re_tok.sub(r' \1 ', s).split()

# Word-level TF-IDF over unigrams, using the notebook's own `tokenize`
# function instead of the default token pattern. Terms must appear in at
# least 3 documents and in at most 90% of them; the vocabulary is capped at
# 20000 entries.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    ngram_range=(1, 1),
    max_features=20000,
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    # Boolean options are passed as real booleans rather than the int 1:
    # recent scikit-learn releases validate parameter types and no longer
    # accept integers for boolean parameters.
    use_idf=True,
    smooth_idf=True)

# The vocabulary and IDF weights are learned from train and test text
# together; each split is then transformed separately.
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# Character-level TF-IDF over n-grams of 1 to 4 characters, capped at 20000
# features. Fitted on train and test text together.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    max_features=20000,
    strip_accents='unicode',
    sublinear_tf=True,
)
char_vectorizer.fit(all_text)
train_char_features, test_char_features = (
    char_vectorizer.transform(texts) for texts in (train_text, test_text)
)

# Final design matrices: character features first, word features second,
# concatenated column-wise.
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

# For every label: cross-validate a logistic regression and an SVM, refit both
# on the full training matrix, and store the average of their class-1
# probabilities for the test set in `submission`.
scores_logit = []  # mean 3-fold ROC AUC of the logistic regression, per label
scores_svm = []    # mean 3-fold ROC AUC of the SVM, per label
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier1 = LogisticRegression(solver='sag')
    # probability=True is required: the blend below calls predict_proba, which
    # an SVC built with the default probability=False does not provide.
    # NOTE(review): enabling it makes every SVC fit noticeably more expensive.
    classifier2 = SVC(probability=True)

    cv_score1 = np.mean(cross_val_score(classifier1, train_features, train_target, cv=3, scoring='roc_auc'))
    cv_score2 = np.mean(cross_val_score(classifier2, train_features, train_target, cv=3, scoring='roc_auc'))

    scores_logit.append(cv_score1)
    scores_svm.append(cv_score2)
    print('CV score for logit class {} is {}'.format(class_name, cv_score1))
    print('CV score for svm class {} is {}'.format(class_name, cv_score2))

    # Refit on all training rows before predicting the test set.
    classifier1.fit(train_features, train_target)
    classifier2.fit(train_features, train_target)

    # Unweighted average of the two models' second predict_proba column.
    logit_proba = classifier1.predict_proba(test_features)[:, 1]
    svm_proba = classifier2.predict_proba(test_features)[:, 1]
    submission[class_name] = (logit_proba + svm_proba) / 2

print('Total CV score for logit is {}'.format(np.mean(scores_logit)))
print('Total CV score for svm is {}'.format(np.mean(scores_svm)))

# The submission frame is written to disk by the next cell.

In [None]:
# Write the `submission` frame built by the previous cell (averaged logistic
# regression / SVM probabilities) to CSV; index=False omits the row index.
submission.to_csv('submission_logisticplusSVM.csv', index=False)

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

# Label columns used as the multi-label target in the cells that follow.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# String prefix for the data files; the trailing slash is significant because
# the paths below are built by plain interpolation.
PATH = 'data/toxic/'

# Load both splits, filling every missing value with a single space.
train = pd.read_csv(f'{PATH}train.csv').fillna(' ')
test = pd.read_csv(f'{PATH}test.csv').fillna(' ')

# Comment text for each split and for both splits combined (train first).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

# Word-level TF-IDF over unigrams, where a token is any run of one or more
# word characters; the vocabulary is capped at 5000 features and is learned
# from train and test text together.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=5000,
    strip_accents='unicode',
    sublinear_tf=True,
)
word_vectorizer.fit(all_text)

# Transform each split separately with the shared vocabulary.
train_word_features, test_word_features = (
    word_vectorizer.transform(texts) for texts in (train_text, test_text)
)

In [12]:
# NOTE(review): the value of this first expression is discarded. A notebook
# cell only echoes its last expression, and the recorded output below shows
# just the label table.
train_word_features[1]
# First rows of the six label columns.
train[class_names].head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Initialize a binary relevance multi-label classifier
# with a Gaussian naive Bayes base classifier.
classifier = BinaryRelevance(GaussianNB())

# Train on the word TF-IDF features against all six label columns.
classifier.fit(train_word_features, train[class_names])

# Predict on the same rows the model was fitted on.
predictions = classifier.predict(train_word_features)

# This is a training-set score, not a held-out estimate: fit and predict use
# the same matrix.
# NOTE(review): for multi-label targets scikit-learn documents accuracy_score
# as exact-match (subset) accuracy; confirm that is the intended metric.
accuracy_score(train[class_names],predictions)

In [None]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB

# Initialize a classifier chains multi-label classifier
# with a Gaussian naive Bayes base classifier.
classifier = ClassifierChain(GaussianNB())

# Train on the word TF-IDF features against all six label columns.
classifier.fit(train_word_features, train[class_names])

# Predict on the same rows the model was fitted on; the following cell scores
# these predictions.
predictions = classifier.predict(train_word_features)



In [4]:
from sklearn.metrics import accuracy_score
# Scores whatever `predictions` currently holds; in notebook order that is the
# output of the classifier-chain cell directly above. Those predictions were
# made on the training rows, so this is a training-set score.
accuracy_score(train[class_names],predictions)

0.5203263750932187

In [5]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB

# Initialize a Label Powerset multi-label classifier
# with a Gaussian naive Bayes base classifier.
classifier = LabelPowerset(GaussianNB())

# Train on the word TF-IDF features against all six label columns.
classifier.fit(train_word_features, train[class_names])

# Predict on the same rows the model was fitted on.
predictions = classifier.predict(train_word_features)

# Training-set score. accuracy_score is not imported in this cell, so it
# relies on an earlier cell having imported it.
accuracy_score(train[class_names],predictions)

0.37495534902958555

In [None]:
from skmultilearn.adapt import MLkNN

# Multi-label k-nearest-neighbours classifier with k=6.
# NOTE(review): this cell shows no execution count or output in the notebook;
# confirm it runs with a sparse feature matrix and a DataFrame target.
classifier = MLkNN(k=6)

# Train on the word TF-IDF features against all six label columns.
classifier.fit(train_word_features, train[class_names])

# Predict on the same rows the model was fitted on.
predictions = classifier.predict(train_word_features)

# Training-set score. accuracy_score is not imported in this cell, so it
# relies on an earlier cell having imported it.
accuracy_score(train[class_names],predictions)

In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

# Label columns; the loop at the end of this cell fits one model per entry.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# String prefix for the data files (interpolated directly, so the trailing
# slash is required).
PATH = 'data/toxic/'

# Load both splits, filling every missing value with a single space.
train = pd.read_csv(f'{PATH}train.csv').fillna(' ')
test = pd.read_csv(f'{PATH}test.csv').fillna(' ')

# Comment text for each split and for both splits combined (train first).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

# Word-level TF-IDF over unigrams; a token is any run of one or more word
# characters and the vocabulary is capped at 5000 features. Fitted on train
# and test text together.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=5000,
    strip_accents='unicode',
    sublinear_tf=True,
)
word_vectorizer.fit(all_text)

# Transform each split separately with the shared vocabulary.
train_word_features, test_word_features = (
    word_vectorizer.transform(texts) for texts in (train_text, test_text)
)

# Character-level TF-IDF over n-grams of 1 to 4 characters, capped at 5000
# features. Fitted on train and test text together.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    max_features=5000,
    strip_accents='unicode',
    sublinear_tf=True,
)
char_vectorizer.fit(all_text)
train_char_features, test_char_features = (
    char_vectorizer.transform(texts) for texts in (train_text, test_text)
)

# Final design matrices: character features first, word features second,
# concatenated column-wise.
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

# For every label: cross-validate a logistic regression, refit it on the full
# training matrix, and store its test-set probabilities in `submission`.
scores = []  # mean 3-fold ROC AUC, one entry per label
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    fold_aucs = cross_val_score(
        classifier,
        train_features,
        train_target,
        cv=3,
        scoring='roc_auc',
    )
    cv_score = np.mean(fold_aucs)
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on all training rows, then keep the second predict_proba column.
    classifier.fit(train_features, train_target)
    test_proba = classifier.predict_proba(test_features)
    submission[class_name] = test_proba[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

# index=False keeps the row index out of the CSV.
submission.to_csv('submission_logistic.csv', index=False)