In [1]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input/jigsaw-toxic-comment-classification-challenge/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip


In [3]:
train_data = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip")
test_data = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip")

In [4]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [6]:
print(train_data.shape)
print(test_data.shape)

(159571, 8)
(153164, 2)


In [7]:
train_data.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [9]:
train_data.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

# Text cleaning 

1. Removing punctuation 
2. Removing Stop words 
3. Stemming the words 

In [10]:
import string
import re
import nltk
stopwords = nltk.corpus.stopwords.words("english")
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# Tokenize 

In [11]:
def remove_punc(text):
    word = "".join([char.lower() for char in text if char not in string.punctuation])
    return word

train_data["removed_punch"] = train_data['comment_text'].apply(lambda x : remove_punc(x))
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,removed_punch
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour im seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really not trying to edit war its j...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,\nmore\ni cant make any real suggestions on im...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...


In [12]:
def token(text):
    word = re.split("\W+",text)
    return word
train_data["token_word"] = train_data['removed_punch'].apply(lambda x : token(x))
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,removed_punch,token_word
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...,"[explanation, why, the, edits, made, under, my..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour im seem...,"[daww, he, matches, this, background, colour, ..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really not trying to edit war its j...,"[hey, man, im, really, not, trying, to, edit, ..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,\nmore\ni cant make any real suggestions on im...,"[, more, i, cant, make, any, real, suggestions..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...,"[you, sir, are, my, hero, any, chance, you, re..."


# Text Vectorization 

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
word_vector = TfidfVectorizer(tokenizer=token,analyzer='word',max_features=1000)

In [15]:
train_vectorization = word_vector.fit_transform(train_data['comment_text'])
test_vectorization = word_vector.fit_transform(test_data['comment_text'])

In [16]:
train_vectorization.shape

(159571, 1000)

In [17]:
test_vectorization.shape

(153164, 1000)

In [18]:
# Creating DataFrame 
train_vectorization_df = pd.DataFrame(train_vectorization.toarray(), columns=word_vector.get_feature_names())
test_vectorization_df = pd.DataFrame(test_vectorization.toarray(), columns=word_vector.get_feature_names())

In [19]:
y_train = train_data[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
X_train = train_vectorization_df
X_test = test_vectorization_df

# Machine Learning model 

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
target_label = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

In [22]:
predicted = np.zeros((X_test.shape[0],y_train.shape[1]))
predicted

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [23]:
for i,label in enumerate(target_label):
    lr = LogisticRegression(C=2,random_state = i,class_weight = 'balanced')
    print('Building {} model for column:{''}'.format(i,label)) 
    lr.fit(X_train,y_train[label])

Building 0 model for column:toxic


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Building 1 model for column:severe_toxic


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Building 2 model for column:obscene


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Building 3 model for column:threat


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Building 4 model for column:insult


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Building 5 model for column:identity_hate


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [24]:
from sklearn.metrics import classification_report
label = 'insult'
y_pred = lr.predict(X_train)
print(classification_report(y_train[label],y_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96    151694
           1       0.38      0.65      0.48      7877

    accuracy                           0.93    159571
   macro avg       0.68      0.80      0.72    159571
weighted avg       0.95      0.93      0.94    159571



In [25]:
for i in target_label:
    print(" Lable ",i,classification_report(y_train[i],y_pred))

 Lable  toxic               precision    recall  f1-score   support

           0       0.95      0.96      0.95    144277
           1       0.56      0.49      0.53     15294

    accuracy                           0.91    159571
   macro avg       0.76      0.73      0.74    159571
weighted avg       0.91      0.91      0.91    159571

 Lable  severe_toxic               precision    recall  f1-score   support

           0       1.00      0.92      0.96    157976
           1       0.10      0.86      0.18      1595

    accuracy                           0.92    159571
   macro avg       0.55      0.89      0.57    159571
weighted avg       0.99      0.92      0.95    159571

 Lable  obscene               precision    recall  f1-score   support

           0       0.98      0.95      0.96    151122
           1       0.40      0.63      0.49      8449

    accuracy                           0.93    159571
   macro avg       0.69      0.79      0.72    159571
weighted avg       0.95

In [26]:
y_predicted_labels = lr.predict_proba(X_train)[:,1]
y_predicted_labels

array([0.0013808 , 0.00279463, 0.00117447, ..., 0.00273129, 0.00025304,
       0.00053853])

# ROC 

In [27]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_train['insult'], y_predicted_labels)
metrics.auc(fpr, tpr)

0.905579234074054

# Submission

In [28]:
test_predicted = pd.DataFrame(predicted,columns=y_train.columns)
submission = pd.concat([test_data['id'],test_predicted],axis=1)
submission.to_csv('submit.csv',index=False)