# Jigsaw Rate Severity of Toxic Comments

In [18]:
import re
import numpy as np
import pandas as pd
from string import printable, punctuation
from itertools import groupby 

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [19]:
# The training data comes from a different competition than the validation/test data,
# so its column names differ. Rename them to match and avoid confusion.
train_df = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/train.csv'
).rename(columns = {
    'id': 'comment_id',
    'comment_text': 'text'
})

# Original competition data
val_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
test_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

# Always shuffle to avoid unnecessary patterns in the data.
# A fixed random_state keeps the shuffle identical on every "Restart & Run All".
train_df = train_df.sample(frac = 1, random_state = 42).reset_index(drop = True)
val_df = val_df.sample(frac = 1, random_state = 42).reset_index(drop = True)
test_df = test_df.sample(frac = 1, random_state = 42).reset_index(drop = True)

train_df.head(5)

### EDA: Love at First Sight 👉👈
I must feel comfortable with the data I will be working on in order to operate well. So let's do a quick EDA on our data from the [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data) competition. Starting by plotting the count of each toxicity type:

In [20]:
toxicity_types = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Count by comparison instead of indexing value_counts(): indexing raises a KeyError
# when a column happens to contain no 0s (or no 1s), whereas a comparison yields 0.
zeros = [(train_df[toxic_type] == 0).sum() for toxic_type in toxicity_types]
ones = [(train_df[toxic_type] == 1).sum() for toxic_type in toxicity_types]

plt.figure(figsize = (20, 5))
plt.title('Toxicity Types count')
# The label gives plt.legend() an artist to show; without one it only emits a warning.
plt.bar(toxicity_types, ones, label = 'Comments flagged with this type')
plt.ylabel('Number of comments')
plt.legend()
plt.show()

Looking at the columns, we have the comment text (the *comment_text* column, renamed to *text* above) and its corresponding toxicity types. Plotting the count of each toxicity type shows that the counts differ widely from one type to another. We can combine all toxicity types into one new feature called **toxicity** by taking a weighted sum of their values (so-called *feature engineering*).

**NOTE: The coefficient assigned to each toxicity type is extremely important!**

In [21]:
# Weight applied to each toxicity flag when building the combined score.
toxicity_coefs = {
    'toxic': 1,
    'severe_toxic': 1,
    'obscene': 1,
    'threat': 1,
    'insult': 1,
    'identity_hate': 1
}

# Combined score: weighted sum of the per-type flag columns.
train_df['toxicity'] = sum(
    train_df[column] * weight
    for column, weight in toxicity_coefs.items()
)

In [22]:
# Number of comments at each combined toxicity level
toxicity_values = train_df['toxicity'].value_counts()

levels = toxicity_values.index
counts = toxicity_values.to_numpy()

plt.figure(figsize = (20, 5))
plt.bar(levels, counts, color = 'g')
plt.title('Toxicity Level Distribution')
plt.show()

### Downsampling: Balancing our data
Imbalanced data can be very problematic. Yes, we can do a few tricks, but the problem pretty much stands. The best way to reduce this effect is to balance our data. (The *accuracy* metric is not recommended for imbalanced data.)

In [23]:
# Keep at most one fifth of the comments whose combined toxicity score is 0.
cutoff = toxicity_values[0] // 5

# Downsample ONLY the zero-toxicity rows. Truncating the whole shuffled frame would
# throw away toxic comments in the same proportion and leave the class ratio unchanged.
is_clean = train_df['toxicity'] == 0
clean_sample = train_df.loc[is_clean].sample(
    n = min(cutoff, int(is_clean.sum())),   # never ask for more rows than exist (safe on re-run)
    random_state = 42
)

# Recombine with every toxic row and shuffle (seeded) to avoid any unnecessary patterns
train_df = (
    pd.concat([clean_sample, train_df.loc[~is_clean]])
    .sample(frac = 1, random_state = 42)
    .reset_index(drop = True)
)

### Text Cleaning
As language models improve, text-cleaning is becoming less necessary, but that's not the case for all models. My strategy to overcome this is to start simple and test some cleaning methods to see if they help the model or not.

In [24]:
# Lazily matches "<", then any characters except a newline, up to the nearest ">".
HTML_TAG_PATTERN = r"<.*?>"
# Matches "local-part@domain" or "local-part@[ip-literal]" e-mail addresses.
# NOTE: the character classes list only lowercase letters, so the pattern must be
# applied with re.IGNORECASE to match addresses that contain uppercase letters.
EMAIL_PATTERN = r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'
# Matches URLs that start with "http://", "https://" or "www.", followed by a host name,
# a dot, and at least two non-whitespace characters.
URL_PATTERN = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
# Lowercase contraction (with or without apostrophe) -> expanded form.
# Keys are looked up against whole, lowercased, whitespace-separated words.
ABBR_VERB_DICT = {
    "aren't": "are not",
    "arent": "are not",
    "can't": "cannot",
    "cant": "cannot",
    "couldn't": "could not",
    "couldnt": "could not",
    "didn't": "did not",
    "didnt": "did not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "havent": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "i'd" was listed twice ("I would" / "I had"); a dict keeps only the last entry.
    # The contraction is ambiguous, so use "would" like every other "'d" entry here.
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mightnt": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "shouldnt": "should not",
    "shld": "should",
    "that's": "that is",
    "thats": "that is",
    "there's": "there is",
    "theres": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "theyre": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "wasn't": "was not",
    # Previously mapped to " will", which silently dropped the pronoun.
    "we'll": "we will",
}

def remove_html_tags(string: str, replace_with: str = '') -> str:
    """Replace every HTML_TAG_PATTERN match in `string` with `replace_with`.

    Args:
        string: Text to process.
        replace_with: Replacement for each matched tag (default: remove it).

    Returns:
        The text with all matches substituted.
    """
    tag_regex = re.compile(HTML_TAG_PATTERN)
    return tag_regex.sub(replace_with, string)

def fix_verb_abbr(string: str) -> str:
    """Expand contractions listed in ABBR_VERB_DICT.

    The text is split on whitespace and each word is looked up case-insensitively;
    words without an entry are kept unchanged. The words are re-joined with single
    spaces, so the original whitespace is not preserved.
    """
    expanded = []
    for word in string.split():
        expanded.append(ABBR_VERB_DICT.get(word.lower(), word))
    return ' '.join(expanded)

def remove_special_characters(string: str) -> str:
    """Drop every character of `string` that is not in `string.printable`."""
    kept = [char for char in string if char in printable]
    return ''.join(kept)

def remove_urls(string: str, replace_with: str = '') -> str:
    """Replace every URL_PATTERN match in `string` with `replace_with`.

    Args:
        string: Text to process.
        replace_with: Replacement for each matched URL (default: remove it).

    Returns:
        The text with all matches substituted.
    """
    url_regex = re.compile(URL_PATTERN)
    return url_regex.sub(replace_with, string)

def remove_emails(string: str, replace_with: str = '') -> str:
    """Replace every e-mail address in `string` with `replace_with`.

    Args:
        string: Text to process.
        replace_with: Replacement for each matched address (default: remove it).

    Returns:
        The text with all EMAIL_PATTERN matches substituted.
    """
    # EMAIL_PATTERN's character classes list only lowercase letters, so match
    # case-insensitively; otherwise addresses such as "John@Example.COM" are left in.
    return re.sub(EMAIL_PATTERN, replace_with, string, flags = re.IGNORECASE)

def remove_punctuation(string: str, punctuations: str) -> str:
    """Delete from `string` every character that appears in `punctuations`."""
    # A translation table mapping a code point to None deletes that character.
    drop_table = {ord(symbol): None for symbol in punctuations}
    return string.translate(drop_table)

def remove_repeated_punctuations(string: str) -> str:
    """Collapse each run of two or more punctuation/space characters.

    A run is replaced by its first character, followed by a single space if the
    run contained a space. A run that starts with a space collapses to exactly
    one space.

    Args:
        string: Text to process.

    Returns:
        The text with every such run collapsed.
    """
    def replacement(match):
        run = match.group()
        first = run[0]
        # Previously a run starting with a space produced " " + " ", so repeated
        # spaces were never reduced to one.
        if first == " ":
            return " "
        return first + (" " if " " in run else "")
    return re.sub(r'[!\"#$%&\'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~ ]{2,}', replacement, string)


def clean_text(text):
    """Run the text-cleaning pipeline on a single comment.

    The input is converted with `str()`, so non-string values are accepted.
    The optional steps (`fix_verb_abbr`, stripping leading/trailing punctuation,
    and `remove_punctuation` for '*~') are currently disabled.

    Returns:
        The cleaned text.
    """
    cleaned = str(text)

    # Collapse doubled double-quotes into a single one
    cleaned = cleaned.replace('""', '"')

    # Turn new lines into '. '; the repeated-punctuation pass below tidies up the result
    cleaned = cleaned.replace('\n', '. ')

    # The repeated-punctuation pass runs last: it would otherwise collapse
    # sequences such as '://' that the URL pattern relies on.
    pipeline = (
        remove_html_tags,               # Remove HTML tags
        remove_emails,                  # Remove email addresses
        remove_urls,                    # Remove URLs
        remove_special_characters,      # Remove special characters
        remove_repeated_punctuations,   # Remove consecutive repeated punctuations
    )
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned

In [25]:
# for df in [train_df, test_df]:
#     df['text'] = df['text'].apply(clean_text)

In [26]:
# print(train_df['text'].values[:50])

### Validation function
We need some sort of validation for our model to monitor its behavior. One way to accomplish this is to use the data from *validation_data.csv*. We can use our model to score both the *less_toxic* and *more_toxic* columns and compare the results: a pair counts as correct when the *more_toxic* comment receives the higher score.

In [27]:
# Defining the validation function
def validate_model(model, vectorizer, less_toxic, more_toxic):
    """Return the fraction of pairs the model ranks in the expected order.

    Args:
        model: Fitted classifier exposing `predict_proba`.
        vectorizer: Fitted vectorizer exposing `transform`.
        less_toxic: Texts expected to score lower.
        more_toxic: Texts expected to score higher, paired by position with `less_toxic`.

    Returns:
        Mean of `score(less_toxic) < score(more_toxic)`, where the score is
        column 1 of `predict_proba`.
    """
    # Vectorize both sides of each pair
    less_matrix, more_matrix = (
        vectorizer.transform(texts) for texts in (less_toxic, more_toxic)
    )

    # Column 1 of predict_proba is used as the toxicity score
    less_scores = model.predict_proba(less_matrix)[:, 1]
    more_scores = model.predict_proba(more_matrix)[:, 1]

    return (less_scores < more_scores).mean()

### Balancing the Data

In [28]:
# Binary target: 1 if the comment has any toxicity at all, else 0 (vectorized comparison)
train_df['smoothed_toxicity'] = (train_df['toxicity'] > 0).astype(int)

# Size of the smaller class; each class is downsampled to at most this many rows
cutoff = train_df['smoothed_toxicity'].value_counts().min()

label_0 = train_df.loc[train_df['smoothed_toxicity'] == 0]
label_1 = train_df.loc[train_df['smoothed_toxicity'] == 1]

# Seeded sampling keeps the balanced set identical across runs. Capping n at the
# group size makes this work whichever class is the majority.
label_0 = label_0.sample(n = min(cutoff, len(label_0)), random_state = 42)
label_1 = label_1.sample(n = min(cutoff, len(label_1)), random_state = 42)

train_df = pd.concat([label_0, label_1])

### Vectorizing Data

In [29]:
# Binary labels for the classifier
y_train = train_df['smoothed_toxicity']

# TF-IDF features; the vocabulary is learned from the training text only
vectorizer = TfidfVectorizer(stop_words = 'english')
train_text = train_df['text']
X_train = vectorizer.fit_transform(train_text)

In [30]:
# Build and fit the model (fit returns the fitted estimator itself)
model = MultinomialNB().fit(X_train, y_train)

# Validate: fraction of validation pairs ranked in the expected order
validate_model(
    model,
    vectorizer,
    less_toxic = val_df['less_toxic'],
    more_toxic = val_df['more_toxic'],
)

### Creating Submission

In [31]:
# Score the test comments; column 1 of predict_proba is used as the toxicity score
X_test = vectorizer.transform(test_df['text'])
y_pred = model.predict_proba(X_test)

# Build the frame first and write it in a separate statement: to_csv() returns None
# when given a path, so chaining it would leave `submission_df` bound to None.
submission_df = pd.DataFrame(data = {
    'comment_id': test_df['comment_id'],
    'score': y_pred[:, 1]
})
submission_df.to_csv('submission.csv', index = False)