# Jigsaw Rate Severity of Toxic Comments

In [21]:
import re
import gc
import numpy as np
import pandas as pd
from copy import deepcopy
from string import printable
import scipy

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import Ridge, ElasticNet

In [22]:
# Notebook-wide constants.
# Seed handed to every `random_state` argument in this notebook.
RANDOM_STATE = 201
# Re-bind the imported stop-word collection as a plain set.
STOPWORDS = {*STOPWORDS}

## Strategy ⁉
Three datasets are introduced on the competition page. I will be using all three to build an ensemble model. Datasets used in the notebook:
- [jigsaw-toxic-comment-classification-challenge](https://www.kaggle.com/julian3833/jigsaw-toxic-comment-classification-challenge)
- [jigsaw-unintended-bias-in-toxicity-classification](https://www.kaggle.com/julian3833/jigsaw-unintended-bias-in-toxicity-classification)

## Weights
Here I have defined a dictionary that will map toxicity types to their corresponding weights. These weights are one of the most important parameters in the entire notebook.

In [23]:
# Per-label multipliers; the label columns are later folded into a single
# `toxicity` score using these weights.
toxicity_weights = dict(
    toxic = 1,
    severe_toxic = 2,
    obscene = 1,
    threat = 1,
    insult = 1,
    identity_hate = 2,
    sexual_explicit = 1,
)

# Label names, in the same order as the weights above.
toxicity_types = [*toxicity_weights]

## DownSampling
A function to downsample a given dataframe by a *threshold*.

In [24]:
def downsample(df, threshold, col = 'toxicity', cutoff_weight = 1.5):
    """Shrink the low-score side of `df` relative to the high-score side.

    Every row with `df[col] > threshold` is kept. From the rows with
    `df[col] <= threshold`, a random sample (seeded with RANDOM_STATE) of at
    most `cutoff_weight` times the number of kept rows is drawn.

    Args:
        df: DataFrame to downsample.
        threshold: Rows with `df[col]` strictly above this value are kept.
        col: Name of the column compared against `threshold`.
        cutoff_weight: Ratio of sampled low rows to kept high rows.

    Returns:
        A new DataFrame: the sampled low rows followed by all high rows.
        Index labels are carried over from `df` (no reset).
    """
    high_df = df[df[col] > threshold]
    low_df = df[df[col] <= threshold]

    # Create cutoff
    cutoff = int(len(high_df) * cutoff_weight)

    # `DataFrame.sample` (without replacement) raises ValueError when asked for
    # more rows than exist, so never request more than the low side holds.
    cutoff = min(cutoff, len(low_df))

    # Create downsampled df
    downsampled_df = low_df.sample(cutoff, random_state = RANDOM_STATE)

    # Concatenate and return the two dataframes
    return pd.concat([downsampled_df, high_df])

### Text Cleaning
As newer language models and techniques come into play, text-cleaning is becoming less and less necessary and more like an option to include in our process. But let's not forget that text-cleaning can still be of great importance in many models and scenarios. I have defined a number of functions that will help clean parts of our texts and have later on used a few I believed to be the most helpful of all.

In [25]:
# Non-greedy match from "<" to the next ">" (does not cross newlines).
HTML_TAG_PATTERN = r"<.*?>"
# E-mail address pattern. It only lists lowercase letters, so it must be applied
# case-insensitively to catch addresses containing capitals.
EMAIL_PATTERN = r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'
# http(s):// and bare www. links, up to the next whitespace character.
URL_PATTERN = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"

# Set form of `string.printable` for constant-time membership tests.
_PRINTABLE_CHARS = frozenset(printable)

def remove_html_tags(string: str, replace_with: str = '') -> str:
    """Replace every HTML_TAG_PATTERN match in `string` with `replace_with`."""
    return re.sub(pattern = HTML_TAG_PATTERN, repl = replace_with, string = string)

def remove_special_characters(string: str) -> str:
    """Drop every character that is not in `string.printable`."""
    return ''.join(char for char in string if char in _PRINTABLE_CHARS)

def remove_urls(string: str, replace_with: str = '') -> str:
    """Replace every URL_PATTERN match in `string` with `replace_with`."""
    return re.sub(pattern = URL_PATTERN, repl = replace_with, string = string)

def remove_emails(string: str, replace_with: str = '') -> str:
    """Replace every e-mail address in `string` with `replace_with`.

    Matching ignores case: EMAIL_PATTERN spells out only `a-z`, so without
    the flag an address such as `John.Doe@Example.COM` would be left in place.
    """
    return re.sub(EMAIL_PATTERN, replace_with, string, flags = re.IGNORECASE)

def remove_repeated_punctuations(string: str) -> str:
    """Collapse each run of two or more punctuation/space characters.

    A run is replaced by its first character, followed by one space when the
    run contained a space anywhere. A run that begins with a space therefore
    comes out as two spaces.
    """
    run_pattern = r'[!\"#$%&\'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~ ]{2,}'

    def collapse(found):
        run = found.group()
        if " " in run:
            return run[0] + " "
        return run[0]

    return re.sub(run_pattern, collapse, string)

# Removes times and IP addresses
def remove_IPs(text):
    """Delete dotted digit sequences such as 71.228.77.211 from `text`.

    Any sequence of three or more digit groups joined by dots is removed.
    """
    dotted_digits = re.compile(r'(([0-9]+\.){2,}[0-9]+)')
    return dotted_digits.sub('', text)

def remove_times(text):
    """Delete timestamp fragments from `text`.

    The patterns are applied one after another in the order listed, longer
    date-and-time forms first, and every match is replaced by an empty string.
    """
    timestamp_patterns = (
        r'\d{1,2}:\d{2},? \d{1,2} [a-zA-Z]+,? \d{4} \(UTC\)',   # 04:09, 11 Jul, 2003 (UTC)
        r'\d{1,2}:\d{2},? [a-zA-Z]+ \d{1,2},? \d{4} \(UTC\)',   # 16:47, Jul 23, 2004 (UTC)
        r'\d{1,2}:\d{2},? \d{4} [a-zA-Z]+ \d{1,2},? \(UTC\)',   # 22:07, 2004 Dec 30 (UTC)
        r'\d{1,2} [a-zA-Z]+ \d{4},? \d{1,2}:\d{2} \(UTC\)',     # 29 June 2005 22:08 (UTC)
        r'\d{1,2}:\d{2},? \d{1,2} [a-zA-Z]+,?',                 # 21:31, 6 April
        r'\d{1,2}:\d{2},? \d{1,2},?',                           # 17:52, 12
        r'\d{1,2}:\d{1,2}-\d{1,2}-\d{1,2}',                     # 01:05-09-09
        r'\d{1,2}:\d{2}',                                       # 17:52
        r'\(UTC\)',                                             # (UTC)
    )
    for pattern in timestamp_patterns:
        text = re.sub(pattern, '', text)
    return text

# Shortens any character repeated three or more times in a row to two copies
def shorten_repeated_patterns(text):
    """Shorten repeated characters and space out `*`, `!`, `?` and `'` runs.

    Steps, in order:
      1. any character repeated three or more times in a row is cut to two;
      2. runs of spaces become one space and the ends are stripped;
      3. each run of `*`, `!`, `?` or `'` gets a space on both sides (this
         happens after the strip, so the result may end with a space).
    """
    squeezed = re.sub(r'(.)\1{2,}', r'\1\1', text)
    single_spaced = re.sub(r'[ ]{2,}',' ', squeezed).strip()
    return re.sub(r'([*!?\']+)',r' \1 ', single_spaced)

In [26]:
def clean_text(text):
    """Run the notebook's cleaning steps over one comment string.

    Steps, in order: drop non-printable characters, shorten repeated
    characters, then remove HTML tags, e-mail addresses, URLs, timestamps and
    IP-like dotted numbers.

    Args:
        text: The raw comment.

    Returns:
        The cleaned comment.
    """
    text = remove_special_characters(text)
    # NOTE(review): this step runs before the removers below, so "www" has
    # already become "ww" and a year like "2000" has become "200" by the time
    # the URL/time patterns see the text. Consider moving it last.
    text = shorten_repeated_patterns(text)
    text = remove_html_tags(text)
    text = remove_emails(text)
    text = remove_urls(text)
    text = remove_times(text)
    # Previously `rext = remove_IPs`: that bound the function object to a
    # misspelled name and never called it, so IPs were left in the text.
    text = remove_IPs(text)
    return text

## Validation Function
We don't have straightforward validation data as we normally would, so we must come up with a method to validate our models. I will be using *validation_data.csv*, which has two columns: *less_toxic* and *more_toxic*.

I will predict on each of the two columns and then compute the **RMSE** and the **Accuracy** metrics. This will be done using StratifiedKFold to ensure our results are as accurate as possible. ([Why Stratified?](https://stackoverflow.com/questions/65318931/stratifiedkfold-vs-kfold-in-scikit-learn))

**NOTE #1**: *Accuracy* can be misleading and is not a recommended metric, as our data is strongly unbalanced! ([why?](https://machinelearningmastery.com/failure-of-accuracy-for-imbalanced-class-distributions/)) That's why I also used the RMSE.

In [27]:
# Performs a Stratified K-Fold validation with a given pipeline
def kfold_validate(pipe, folds, X, y, less_toxic, more_toxic, verbose = False):
    """Validate `pipe` with stratified K-fold and the less/more toxic pairs.

    For each fold, a deep copy of `pipe` is fitted on the training split. Two
    numbers are then recorded:
      * RMSE of the fold model on the held-out split;
      * accuracy: the fraction of (`less_toxic`, `more_toxic`) pairs for which
        the fold model scores the `less_toxic` text strictly lower.

    Args:
        pipe: Unfitted estimator/pipeline with `fit` and `predict`.
        folds: Number of splits.
        X, y: Training inputs and targets; they are indexed with the integer
            index arrays from the splitter (the call site passes numpy arrays).
        less_toxic, more_toxic: Paired texts passed to `predict`.
        verbose: When true, print the two metrics after every fold.

    Returns:
        Tuple `(mean accuracy, mean RMSE)` over all folds.
    """
    accuracies, rmse_scores = [], []
    skf = StratifiedKFold(
        n_splits = folds,
        shuffle = True,
        random_state = RANDOM_STATE
    )

    for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_train, y_train = X[train_index], y[train_index]
        X_val, y_val = X[val_index], y[val_index]

        # Copy the original pipeline for each fold (This avoids fitting on the same pipeline multiple times)
        _pipe = deepcopy(pipe)
        _pipe.fit(X_train, y_train)

        # Calculate RMSE as sqrt(MSE), passing (y_true, y_pred) in the documented
        # order. This avoids the `squared` keyword, which newer scikit-learn
        # releases deprecated and then removed.
        rmse_score = np.sqrt(mean_squared_error(y_val, _pipe.predict(X_val)))
        rmse_scores.append(rmse_score)

        # Calculate accuracy
        accuracy = (_pipe.predict(less_toxic) < _pipe.predict(more_toxic)).mean()
        accuracies.append(accuracy)

        if verbose:
            print(f"FOLD #{fold + 1}: Accuracy: {accuracy}, RMSE: {rmse_score}")

    return np.array(accuracies).mean(), np.array(rmse_scores).mean()

## Visualization
Since I'll (probably) be using multiple datasets in this notebook and run pretty much the same analysis over them, I'll define a few methods to avoid code duplication.

In [28]:
# Plots number of values for each toxicity level in the given dataframe
def plot_toxic_types_dist(df):
    """Bar chart of how many rows of `df` have each toxicity label set to 1.

    Only labels from `toxicity_types` that exist as columns of `df` are drawn.

    Args:
        df: DataFrame holding some or all of the toxicity label columns.
    """
    # Use the columns of the frame that was passed in (the old code read the
    # global `jtc_df` here, which broke the x/height pairing for other frames).
    present_types = [label for label in toxicity_types if label in df.columns]

    # Count rows equal to 1 directly; `value_counts()[1]` raises KeyError when
    # a label has no positive rows.
    counts = [int((df[label] == 1).sum()) for label in present_types]

    plt.figure(figsize = (20, 5))
    plt.title('Toxicity Categories Count')
    plt.bar(present_types, counts, label = 'Number of occurrences')
    plt.legend()
    plt.show()


# Plots the distribution of values in the toxicity column of the given dataframe
def plot_toxicity_dist(df):
    """Bar chart with one bar per distinct `toxicity` value in `df`.

    Bar height is the number of rows carrying that value.
    """
    level_counts = df['toxicity'].value_counts()
    levels = level_counts.keys()
    heights = level_counts.values

    plt.figure(figsize = (20, 5))
    plt.title('Toxicity Level Distribution')
    plt.bar(levels, heights, color = 'g')
    plt.show()

# Plots the wordcloud for each toxicity level of the given data frame (Stopwords are removed)
def plot_wordcloud(df):
    """Draw one word cloud per toxicity label on a 3x2 grid.

    For each label from `toxicity_types` that is a column of `df`, the `text`
    of every row whose label value is non-zero is joined and rendered. Labels
    missing from `df` are skipped instead of raising KeyError, and grid cells
    left without a label are blanked. At most six labels fit the grid.

    Args:
        df: DataFrame with a `text` column and toxicity label columns.
    """
    wordcloud = WordCloud(stopwords = STOPWORDS)
    fig, ax = plt.subplots(3, 2, figsize = (20, 10))

    # Row-by-row list of the six panels, matching the old nested-loop order
    panels = list(ax.flat)
    present_types = [label for label in toxicity_types if label in df.columns]

    for panel, label in zip(panels, present_types):
        wordcloud.generate(' '.join(df.loc[df[label] != 0, 'text'].tolist()))
        panel.set_title(label)
        panel.imshow(wordcloud)
        panel.axis("off")

    # Hide any panel that did not receive a label
    for panel in panels[len(present_types):]:
        panel.axis("off")

    plt.tight_layout(pad = 0)
    plt.show()

## Jigsaw Rate Severity of Toxic Comments
This is our original dataset for the competition. The files are:
- *comments_to_score.csv*: The dataset that is used for the final predictions.
- *validation_data.csv*: The dataset that is used to validate the models.
- *sample_submission.csv*: A sample submission file.

**STEP #1**: Our validation data contains duplicate (*less_toxic*, *more_toxic*) pairs. This won't be problematic for our metrics (what matters is the metric's improvement, not its specific value), but I will remove the duplicates anyway.

**STEP #2**: I will remove the *email addresses*, *html tags*, *URLs*, *times* and *IP addresses*.

In [29]:
# Competition inputs: the comments to score and the pairwise validation data
test_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
val_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Summary of the comments to score (duplicates are counted on `text`)
test_duplicates = test_df.duplicated(subset = "text").sum()
print(f'test_df\n- Shape: {test_df.shape}\n- Columns: {list(test_df.columns)}')
print(f'- Duplicates: {test_duplicates}\n')

# Summary of the validation pairs (duplicates are counted on the pair)
val_duplicates = val_df.duplicated(subset = ["less_toxic", "more_toxic"]).sum()
print(f'val_df\n- Shape: {val_df.shape}\n- Columns: {list(val_df.columns)}')
print(f'- Duplicates: {val_duplicates}!')

In [30]:
# # Get the duplicate items
# vals_duplicate_df = val_df[['less_toxic', 'more_toxic']]

# # Drop the duplicate pairs except the first occurrence (Remove the worker column as well)
# val_df = vals_duplicate_df.loc[~vals_duplicate_df.duplicated(keep = 'first')]

# print(f"- New shape: {val_df.shape}")

In [31]:
# val_df: clean both sides of every validation pair
for side in ('less_toxic', 'more_toxic'):
    val_df[side] = val_df[side].apply(clean_text)

# test_df: clean the comments that will be scored
cleaned_test_text = test_df['text'].apply(clean_text)
test_df['text'] = cleaned_test_text

## jigsaw toxic comment classification challenge
The *jigsaw-toxic-comment-train.csv* contains data from *train.csv* and *test.csv* from the *jigsaw-toxic-comment-classification-challenge*. (The test data and their corresponding labels have been merged, then both sets are concatenated)

**NOTE #1**: I will be changing the columns names to match the original dataset columns' names.

In [32]:
# Load the classification-challenge data and align its column names with the
# competition data (`comment_id`, `text`)
jtc_df = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv')
jtc_df = jtc_df.rename(columns = { 'id': 'comment_id', 'comment_text': 'text'})

jtc_duplicates = jtc_df.duplicated("text").sum()
print(f'jtc_df\n- Shape: {jtc_df.shape}')
print(f'- Columns: {list(jtc_df.columns)}')
print(f'- Duplicates: {jtc_duplicates}')

In [33]:
# Fold the label columns that exist in jtc_df into one weighted `toxicity` score
weighted_labels = [
    jtc_df[label] * weight
    for label, weight in toxicity_weights.items()
    if label in jtc_df
]
jtc_df['toxicity'] = sum(weighted_labels)

# Standardize toxicity (currently disabled)
# jtc_df['toxicity'] = jtc_df['toxicity'] / jtc_df['toxicity'].max()

### Downsampling & Text-Cleaning
Our data is heavily unbalanced ([why is that bad?](https://machinelearningmastery.com/what-is-imbalanced-classification/)) and we must fix it. There are a few tricks we can pull off:
- The weights can be adjusted to try to balance out the data (Not recommended - We have enough data for downsampling, don't sacrifice your weights for balancing the data!)
- Downsampling can drop the portion of data from the problematic side (Most effective)

In [34]:
# Downsample: keep every row with toxicity above 0 and a random part of the rest
jtc_df = downsample(jtc_df, threshold = 0, col = 'toxicity', cutoff_weight = 1.5)
print(f"- New shape: {jtc_df.shape}")

# Clean the comment text of the remaining rows
cleaned_jtc_text = jtc_df['text'].apply(clean_text)
jtc_df['text'] = cleaned_jtc_text

### Exploratory Data Analysis
Explore the **jigsaw-toxic-comment-classification-challenge** dataset further using the following functions:

In [35]:
# plot_toxic_types_dist(jtc_df)
# plot_toxicity_dist(jtc_df)
# plot_wordcloud(jtc_df)

### Creating the Pipeline

In [36]:
# Training text/target come from jtc_df; the text to score comes from test_df
X_train, y_train = jtc_df['text'], jtc_df['toxicity']
X_test = test_df['text']

In [37]:
# pipe = Pipeline([
#     ('vect', TfidfVectorizer(analyzer = 'char_wb')),
#     ('ridge', Ridge())
# ])

# # CV Cross Validation
# cv_scores = cross_val_score(
#     estimator = pipe,
#     X = X_train,
#     y = y_train,
#     cv = 5,
#     n_jobs = -1
# )
# print(f"Average CV Score: {cv_scores.mean()}")

# # Grid Search
# param_grid = {
#     'vect__max_df': np.concatenate([np.linspace(0, 1, 11), range(1, 10, 1)]),
#     'vect__min_df': np.concatenate([np.linspace(0, 1, 11), range(1, 10, 1)]),
#     'vect__ngram_range': [(i, j) for i in range(1, 7) for j in range(1, 7) if j > i]
# }

# grid = GridSearchCV(pipe, cv = 3, param_grid = param_grid, n_jobs = -1, verbose = 1)
# grid.fit(X_train,y_train)
# print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

In [39]:
# Pipeline: character n-gram (3 to 5) TF-IDF features feeding a ridge regressor
tfidf_step = TfidfVectorizer(analyzer = 'char_wb', max_df = 0.5, min_df = 3, ngram_range = (3, 5))
pipe = Pipeline([
    ('vect', tfidf_step),
    ('ridge', Ridge())
])

# Validate with 7 folds; `pipe` itself stays unfitted because kfold_validate
# fits a copy per fold
acc_mean, rmse_mean = kfold_validate(
    pipe,
    7,
    np.array(X_train),
    np.array(y_train),
    val_df['less_toxic'],
    val_df['more_toxic'],
    verbose = True
)
print(f"Mean Accuracy: {acc_mean}\nMean RMSE: {rmse_mean}")

In [40]:
# pipe['vect'].get_feature_names()

# sorted(list(zip(pipe['vect'].get_feature_names(), np.round(pipe['ridge'].coef_, 2))), 
#     key = lambda x: x[1], 
#     reverse = True
# )[:10]

### Creating the Submission

In [41]:
# The notebook only has `import scipy`; import the submodule explicitly so
# `scipy.stats` is guaranteed to be loaded.
import scipy.stats

# Train the pipeline on the full training data
pipe.fit(X_train, y_train)

# Make predictions
y_pred = pipe.predict(X_test)

# Rank the predictions to avoid ties ('ordinal' gives every item a distinct rank)
y_pred = scipy.stats.rankdata(y_pred, method = 'ordinal')

# Build the submission frame first, then write it. `to_csv` with a path
# returns None, so chaining it onto the constructor left `submission_df` as None.
submission_df = pd.DataFrame(data = {
    'comment_id': test_df['comment_id'],
    'score': y_pred
})
submission_df.to_csv('submission.csv', index = False)