In [1]:
import re
import gc
import scipy
import numpy as np
import pandas as pd
from copy import deepcopy
from string import printable
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import Ridge, ElasticNet

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Constants shared by several cells of the notebook.
# RANDOM_STATE seeds the down-sampling and the K-fold shuffling further below.
RANDOM_STATE = 201
# Rebinds the STOPWORDS name imported from `wordcloud` to a set built from it.
STOPWORDS = set(STOPWORDS)

# Strategy
Three datasets are introduced on the competition page. I will be using all three to build an ensemble model. Datasets used in the notebook:
- [jigsaw-toxic-comment-classification-challenge](https://www.kaggle.com/julian3833/jigsaw-toxic-comment-classification-challenge)
- [jigsaw-unintended-bias-in-toxicity-classification](https://www.kaggle.com/julian3833/jigsaw-unintended-bias-in-toxicity-classification)

## Weights
Here I have defined a dictionary that will map toxicity types to their corresponding weights. These weights are one of the most important parameters in the entire notebook.

In [3]:
# Toxicity weights - maps each toxicity type (a label column name) to the
# coefficient it gets when the label columns are combined into one `toxicity` score.
# Types that are missing from a dataframe are skipped when the score is built.
toxicity_weights = {
    'toxic': 1,
    'severe_toxic': 2,
    'obscene': 1,
    'threat': 1,
    'insult': 1,
    'identity_hate': 2,
    'sexual_explicit': 1
}

# Ordered list of the toxicity type names (the keys above); used by the plotting helpers
toxicity_types = list(toxicity_weights.keys())

## Down-Sampling
A function to downsample a given dataframe by a *threshold*.

In [4]:
def downsample(df, threshold, col = 'toxicity', cutoff_weight = 1.5):
    """Down-sample the rows whose `col` value is at or below `threshold`.

    Rows with `df[col] > threshold` are all kept. From the remaining rows
    (`df[col] <= threshold`) a random sample is drawn whose size is
    `cutoff_weight` times the number of kept rows, capped at the number of
    rows actually available so that sampling never fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to down-sample.
    threshold : number
        Boundary between the two groups.
    col : str
        Column compared against `threshold`.
    cutoff_weight : float
        Ratio of sampled (<= threshold) rows to kept (> threshold) rows.

    Returns
    -------
    pandas.DataFrame
        The sampled rows followed by all rows above the threshold.
    """
    above_mask = df[col] > threshold
    below_df = df[df[col] <= threshold]

    # Create cutoff; never ask for more rows than exist, otherwise
    # DataFrame.sample raises ValueError (sampling without replacement)
    cutoff = min(int(above_mask.sum() * cutoff_weight), len(below_df))

    # Create downsampled df
    downsampled_df = below_df.sample(cutoff, random_state = RANDOM_STATE)

    # Concatenate and return the two dataframes
    return pd.concat([downsampled_df, df[above_mask]])

## Text Cleaning Methods
As newer language models and techniques come into play, text-cleaning is becoming less and less necessary and more of an optional step in our process. But let's not forget that text-cleaning can still be of great importance in many models and scenarios. I have defined a number of functions that each clean one aspect of our texts, and later on I use the few I believe to be the most helpful.

In [5]:
# Regular expressions used by the cleaning helpers below.
# Shortest possible run of characters enclosed in '<' and '>' (non-greedy).
HTML_TAG_PATTERN = r"<.*?>"
# Email address: local part, '@', then a dotted domain or a bracketed IP literal.
# NOTE: the letter classes are lowercase-only ([a-z0-9...]), so matching is
# case-sensitive unless a case-insensitive flag is supplied at the call site.
EMAIL_PATTERN = r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'
# URL: four alternatives covering 'http://' / 'https://' and bare 'www.' prefixes,
# each followed by a host name, a dot and at least two non-whitespace characters.
URL_PATTERN = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"

def remove_html_tags(string: str, replace_with: str = '') -> str:
    """Replace every match of HTML_TAG_PATTERN in `string` with `replace_with`."""
    tag_regex = re.compile(HTML_TAG_PATTERN)
    return tag_regex.sub(replace_with, string)

def remove_special_characters(string: str) -> str:
    """Keep only the characters of `string` that appear in `string.printable`."""
    kept = [char for char in string if char in printable]
    return ''.join(kept)

def remove_urls(string: str, replace_with: str = '') -> str:
    """Replace every match of URL_PATTERN in `string` with `replace_with`."""
    url_regex = re.compile(URL_PATTERN)
    return url_regex.sub(replace_with, string)

def remove_emails(string: str, replace_with: str = '') -> str:
    """Replace every email address found in `string` with `replace_with`.

    EMAIL_PATTERN only lists lowercase letters in its character classes, so
    the substitution is done case-insensitively; otherwise addresses that
    contain capitals (e.g. `John.Doe@Example.com`) would be left in the text.
    """
    return re.sub(EMAIL_PATTERN, replace_with, string, flags = re.IGNORECASE)

def remove_repeated_punctuations(string: str) -> str:
    """Collapse each run of two or more punctuation/space characters.

    A run is replaced by its first character, followed by a single space
    when the run contained at least one space.
    """
    run_pattern = r'[!\"#$%&\'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~ ]{2,}'

    def _collapse(found):
        run = found.group()
        suffix = " " if " " in run else ""
        return run[0] + suffix

    return re.sub(run_pattern, _collapse, string)

# Removes IP-address-like tokens
def remove_IPs(text):
    """Delete every run of three or more dot-separated digit groups from `text`."""
    dotted_digits = re.compile(r'(([0-9]+\.){2,}[0-9]+)')    # e.g. 71.228.77.211
    return dotted_digits.sub('', text)

def remove_times(text):
    """Strip timestamp-like fragments and the literal `(UTC)` marker from `text`.

    The patterns are applied one after another in the order listed, from the
    most specific timestamp layouts down to a bare `HH:MM`, so that longer
    layouts are removed as a whole before the shorter ones get a chance.
    """
    timestamp_patterns = (
        r'\d{1,2}:\d{2},? \d{1,2} [a-zA-Z]+,? \d{4} \(UTC\)',    # 04:09, 11 Jul, 2003 (UTC)
        r'\d{1,2}:\d{2},? [a-zA-Z]+ \d{1,2},? \d{4} \(UTC\)',    # 16:47, Jul 23, 2004 (UTC)
        r'\d{1,2}:\d{2},? \d{4} [a-zA-Z]+ \d{1,2},? \(UTC\)',    # 22:07, 2004 Dec 30 (UTC)
        r'\d{1,2} [a-zA-Z]+ \d{4},? \d{1,2}:\d{2} \(UTC\)',      # 29 June 2005 22:08 (UTC)
        r'\d{1,2}:\d{2},? \d{1,2} [a-zA-Z]+,?',                  # 21:31, 6 April
        r'\d{1,2}:\d{2},? \d{1,2},?',                            # 17:52, 12
        r'\d{1,2}:\d{1,2}-\d{1,2}-\d{1,2}',                      # 01:05-09-09
        r'\d{1,2}:\d{2}',                                        # 17:52
        r'\(UTC\)',                                              # (UTC)
    )
    for pattern in timestamp_patterns:
        text = re.sub(pattern, '', text)
    return text

# Shorten runs of a repeated character and normalise spacing
def shorten_repeated_patterns(text):
    """Squeeze repeated characters and pad selected punctuation with spaces.

    Steps, in order:
    1. any character repeated three or more times in a row is cut to two;
    2. runs of spaces are collapsed to one and the ends are stripped;
    3. each run of `*`, `!`, `?` or `'` gets a space on both sides.
    """
    squeezed = re.sub(r'(.)\1{2,}', r'\1\1', text)
    single_spaced = re.sub(r'[ ]{2,}', ' ', squeezed).strip()
    return re.sub(r'([*!?\']+)', r' \1 ', single_spaced)

In [6]:
def clean_text(text):
    """Run the selected cleaning helpers over `text` and return the result.

    Order of the steps: HTML tags, email addresses, URLs, timestamps,
    IP addresses, non-printable characters, then shortening of repeated
    characters (which also normalises spacing).
    """
    text = remove_html_tags(text)
    text = remove_emails(text)
    text = remove_urls(text)
    text = remove_times(text)
    # Previously `rext = remove_IPs` only bound the function to a misspelled
    # name, so IP addresses were never actually removed
    text = remove_IPs(text)
    text = remove_special_characters(text)
    text = shorten_repeated_patterns(text)
    return text

# Fit, Validate and Predict
We don't have straightforward validation data as we normally would, so we must come up with a method to validate the models. I will be using *validation_data.csv*, which has two text columns: *less_toxic* and *more_toxic*. From these pairs we can calculate an *Accuracy*: the share of pairs for which the model scores the *more_toxic* comment higher than the *less_toxic* one.

I will predict on each of the two columns and then compute the **RMSE** and **Accuracy** metrics. This would ideally be done with StratifiedKFold, which ensures that each fold of the dataset has the same proportion of observations with a given label. ([Read More](https://stackoverflow.com/questions/65318931/stratifiedkfold-vs-kfold-in-scikit-learn))


**NOTE #1**: *Accuracy* can be misleading and is not a recommended-metric here as our data is strongly unbalanced! ([why not?](https://machinelearningmastery.com/failure-of-accuracy-for-imbalanced-class-distributions/)) That's why I also use the RMSE.

In [7]:
# Calculate RMSE and Accuracy metrics
def validate(pipe, X_val, y_val):
    ''' Score an already fitted pipeline.

    Pipe must have been fitted before being passed to this function.

    Parameters:
        pipe: fitted estimator exposing `predict`.
        X_val: held-out texts of the current fold.
        y_val: targets matching `X_val`.

    Returns:
        (lt_pred, mt_pred, accuracy, rmse) where `lt_pred` / `mt_pred` are the
        predictions for the `less_toxic` / `more_toxic` columns of the
        module-level `val_df`, `accuracy` is the share of pairs with
        `lt_pred < mt_pred`, and `rmse` is the root mean squared error of the
        predictions on `X_val` against `y_val`.
    '''

    # RMSE - computed as sqrt(MSE) because the `squared` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    # np.sqrt also guarantees a NumPy scalar, which callers `.round()`.
    fold_pred = pipe.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, fold_pred))

    # Accuracy
    lt_pred = pipe.predict(val_df['less_toxic'])
    mt_pred = pipe.predict(val_df['more_toxic'])
    accuracy = (lt_pred < mt_pred).mean()

    return lt_pred, mt_pred, accuracy, rmse

In [8]:
def fit_validate_predict(pipe, X, y, folds = 5):
    """Cross-validate `pipe` and collect per-fold predictions.

    For every fold the pipeline is refitted on the training split, scored with
    `validate`, and used to predict `test_df['text']`. Reads the module-level
    `val_df` and `test_df`. `X` and `y` are indexed with integer index arrays.

    Returns `(lt_preds, mt_preds, preds)`: three 2-D arrays with one column
    per fold holding the predictions for the less-toxic validation texts, the
    more-toxic validation texts and the test texts respectively.
    """
    # Shuffled K-fold splitter (shuffle defaults to False)
    splitter = KFold(n_splits = folds, shuffle = True, random_state = RANDOM_STATE)

    # Per-fold metric and prediction buffers
    n_pairs = val_df.shape[0]
    n_test = test_df.shape[0]
    accuracies = np.zeros(folds)
    rmses = np.zeros(folds)
    lt_preds = np.zeros((n_pairs, folds))
    mt_preds = np.zeros((n_pairs, folds))
    preds = np.zeros((n_test, folds))

    for fold, (train_index, val_index) in enumerate(splitter.split(X, y)):

        # Train the pipeline on this fold's training split
        pipe.fit(X[train_index], y[train_index])

        # Validate on the held-out split and on the validation pairs
        lt_pred, mt_pred, accuracy, rmse = validate(pipe, X[val_index], y[val_index])
        accuracies[fold] = accuracy
        rmses[fold] = rmse
        lt_preds[:, fold] = lt_pred
        mt_preds[:, fold] = mt_pred

        # Predict the comments to score
        preds[:, fold] = pipe.predict(test_df['text'])

        print(f"FOLD #{fold + 1}) Accuracy: {accuracy.round(4)}, RMSE: {rmse.round(4)}")

    print(f"\n- Avg Accuracy: {accuracies.round(4).mean()}\n- Avg RMSE: {rmses.round(4).mean()}")

    return lt_preds, mt_preds, preds

## Visualizations
Since I'll (probably) be using multiple datasets in this notebook and run pretty much the same analysis over them, I'll define a few methods to avoid code duplication.

In [9]:
# Plots the number of positive rows for each toxicity type in the given dataframe
def plot_toxic_types_dist(df):
    """Bar chart of how many rows of `df` have each toxicity type set to 1.

    Only the types of `toxicity_types` that exist as columns of `df` are shown.
    """
    # Select the types from the dataframe that was passed in (previously the
    # labels were taken from the global `jtc_df`, which could disagree with the
    # bar heights for any other dataframe)
    present_types = [name for name in toxicity_types if name in df.columns]

    # Counting with a comparison yields 0 for a type without positives, where
    # `value_counts()[1]` would raise a KeyError
    positive_counts = [int((df[name] == 1).sum()) for name in present_types]

    plt.figure(figsize = (20, 5))
    plt.title('Toxicity Categories Count')
    plt.bar(present_types, positive_counts, label = 'Number of occurrences')
    plt.legend()
    plt.show()

# Plots the distribution of values in the toxicity column of the given dataframe
def plot_toxicity_dist(df):
    """Bar chart of how often each distinct `toxicity` value occurs in `df`."""
    level_counts = df['toxicity'].value_counts()
    levels, counts = level_counts.index, level_counts.to_numpy()

    plt.figure(figsize = (20, 5))
    plt.title('Toxicity Level Distribution')
    plt.bar(levels, counts, color = 'g')
    plt.show()

# Plots a wordcloud per toxicity type of the given data frame (Stopwords are removed)
def plot_wordcloud(df):
    """Draw a 3x2 grid of word clouds, one for each of the first six toxicity types.

    Each cloud is built from the `text` of the rows whose column for that type
    is non-zero.
    """
    cloud = WordCloud(stopwords = STOPWORDS)
    fig, ax = plt.subplots(3, 2, figsize = (20, 10))

    # Row-major walk over the grid: panel i shows toxicity_types[i]
    for i, panel in enumerate(ax.ravel()):
        type_name = toxicity_types[i]
        flagged_texts = df.loc[df[type_name] != 0, 'text'].tolist()
        cloud.generate(' '.join(flagged_texts))
        panel.set_title(type_name)
        panel.imshow(cloud)
        panel.axis("off")

    plt.tight_layout(pad = 0)
    plt.show()

# Jigsaw Rate Severity of Toxic Comments
This is our original dataset for the competition. The files are:
- *comments_to_score.csv*: The dataset that is used for the final predictions.
- *validation_data.csv*: The dataset that is used to validate the models.
- *sample_submission.csv*: A sample submission file.

**STEP #1**: Our validation set contains duplicate (*less_toxic*, *more_toxic*) pairs. This won't be problematic for our metrics (what matters is the improvement of a metric, not its specific value), but removing the duplicates is an option (see below).

**STEP #2**: I will remove the *email addresses*, *html tags*, *URLs*, *times* and *IP addresses*.

In [10]:
# Load the comments to score and the pairwise validation data
test_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
val_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Summary of the comments to score
print(f'test_df\n- Shape: {test_df.shape}\n- Columns: {list(test_df.columns)}')
print(f'- Duplicates: {test_df.duplicated(subset = "text").sum()}\n')

# Summary of the validation pairs
print(f'val_df\n- Shape: {val_df.shape}\n- Columns: {list(val_df.columns)}')
print(f'- Duplicates: {val_df.duplicated(subset = ["less_toxic", "more_toxic"]).sum()}!')

### Removing Duplicates in Validation Set
(Optional) It won't have any effect on our model (because what matters is the improvement, not the specific value), but removing duplicate (*less_toxic*, *more_toxic*) pairs might be good practice.

In [11]:
# # Get the dupicate items
# vals_duplicate_df = val_df[['less_toxic', 'more_toxic']]

# # Drop the duplicate paires except the first occurrence (Remove the worker column as well)
# val_df = vals_duplicate_df.loc[~vals_duplicate_df.duplicated(keep = 'first')]

# print(f"- New shape: {val_df.shape}")

### Text-Cleaning

In [12]:
# val_df - clean both sides of every validation pair
for side in ('less_toxic', 'more_toxic'):
    val_df[side] = val_df[side].apply(clean_text)

# test_df - clean the comments to score
test_df['text'] = test_df['text'].apply(clean_text)

# jigsaw toxic comment classification challenge
The *jigsaw-toxic-comment-train.csv* contains data from *train.csv* and *test.csv* from the *jigsaw-toxic-comment-classification-challenge*. (The test data and their corresponding labels have been merged, then both sets are concatenated)

**NOTE #1**: I will be changing the columns names to match the original dataset columns' names.

In [13]:
# Load the training data and rename its columns to match the competition files
jtc_df = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv')
jtc_df = jtc_df.rename(columns = {'id': 'comment_id', 'comment_text': 'text'})

print(f'jtc_df\n- Shape: {jtc_df.shape}')
print(f'- Columns: {list(jtc_df.columns)}')
print(f'- Duplicates: {jtc_df.duplicated("text").sum()}')

### Combining Toxicity Types

In [14]:
# Weighted sum of the toxicity label columns that exist in jtc_df,
# using the coefficients defined in toxicity_weights
jtc_df['toxicity'] = sum(
    jtc_df[label] * weight
    for label, weight in toxicity_weights.items()
    if label in jtc_df
)

# Standardize toxicity (converts to continues values)
# jtc_df['toxicity'] = jtc_df['toxicity'] / jtc_df['toxicity'].max()

### Downsampling & Text-Cleaning
Our data is heavily unbalanced ([why is that bad?](https://machinelearningmastery.com/what-is-imbalanced-classification/)) and we must fix it. There are a few tricks we can pull off:
- The weights can be adjusted in a way to try balance out the data (Not recommended - We have enough data for downsampling, don't sacrifice your weights for balancing the data!)
- Downsampling can drop the portion of data from the problematic side (Most effective)

In [15]:
# Downsample: keep every row with toxicity > 0 and sample 1.5x as many of the rest
jtc_df = downsample(jtc_df, 0, col = 'toxicity', cutoff_weight = 1.5)
print(f"- New shape: {jtc_df.shape}")

# Clean the remaining texts
jtc_df['text'] = jtc_df['text'].apply(clean_text)

### Exploratory Data Analysis
Explore the **jigsaw-toxic-comment-classification-challenge** dataset further using the following functions:

In [16]:
# plot_toxic_types_dist(jtc_df)
# plot_toxicity_dist(jtc_df)
# plot_wordcloud(jtc_df)

# ruddit jigsaw dataset
The third dataset used is the *ruddit-jigsaw-dataset*, specifically *ruddit_with_text.csv*. There are a few things worth paying attention to:
- Deleted comments are marked as *[deleted]*. Do we keep them? If comment is deleted by the user then it won't have any useful information, but if it's deleted by the community, that would be thoughtful.
- I shifted the toxicity scores to be between 0 and 1

**NOTE #1**: The *offensiveness_score* is probably not the same thing as *toxicity*, but I will rename the column to match the other dataframes.

In [17]:
# Load the Ruddit data, rename its columns to match the other dataframes
# and select only the columns we need
rjd_df = pd.read_csv('../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv').rename(
    columns = {'txt': 'text', 'offensiveness_score': 'toxicity'}
)[['comment_id', 'text', 'toxicity']]

# Change scale: min-max normalise the scores to the [0, 1] range
rjd_df['toxicity'] = (rjd_df['toxicity'] - rjd_df['toxicity'].min()) / (rjd_df['toxicity'].max() - rjd_df['toxicity'].min()) 

# Report on rjd_df itself (the summary used to print the statistics of jtc_df)
print(f'rjd_df\n- Shape: {rjd_df.shape}')
print(f'- Columns: {list(rjd_df.columns)}')
print(f'- Duplicates: {rjd_df.duplicated("text").sum()}')

### Removing Invalid Entries & Text-Cleaning
Looking at the below histogram, there is not much pattern for the deleted comments and I will remove them entirely.

In [18]:
# Flag the deleted comments (their text is the literal '[deleted]').
# The histogram previously used a duplicate-text mask, which is not the same
# set of rows as the deleted comments that are dropped below.
deleted_mask = rjd_df['text'] == '[deleted]'

# Plot distribution of toxicity scores for deleted texts
plt.figure(figsize = (10, 5))
plt.hist(rjd_df.loc[deleted_mask, 'toxicity'])
plt.show()

# Drop the deleted comments; copy so the column assignment below works on an
# independent frame instead of a slice of the original
rjd_df = rjd_df.loc[~deleted_mask].copy()
print(f"- New shape: {rjd_df.shape}")

# Text cleaning
rjd_df['text'] = rjd_df['text'].apply(clean_text)

# Ensemble: Ridge() & TfidfVectorizer()
## Creating the Pipeline

In [19]:
# Character n-gram TF-IDF (3 to 5 characters, built inside word boundaries)
tfidf = TfidfVectorizer(
    analyzer = 'char_wb',
    ngram_range = (3, 5),
    min_df = 3,
    max_df = 0.5
)

# Feature extraction step; a union so that more extractors can be added later
features = FeatureUnion([('vect', tfidf)])

# Define pipeline: features followed by a ridge regressor
pipe = Pipeline([("features", features), ('ridge', Ridge())])

## Fit, Validate and make Prediction
I am going to do the followings in each fold:
1. Train the model
2. Validate the model (calculate *Accuracy* and *RMSE*)
3. Predict on the test set with the model

Each fold uses a subset of the data (the model doesn't see the entire data all at once) and that might cause some problems; However, that's manageable by the number of *folds* we specify that can increase/decrease the amount of data the model works with in each fold.

Moreover, I will have multiple predictions, one per *fold*, which I will later average. This is good practice because it reduces the effect of noise in the data (although I do shuffle the data).

In [20]:
# jtc_df - cross-validated fit on the toxic-comment data
jtc_texts = np.array(jtc_df['text'])
jtc_targets = np.array(jtc_df['toxicity'])
jtc_lt_preds, jtc_mt_preds, jtc_preds = fit_validate_predict(pipe, jtc_texts, jtc_targets, folds = 5)

In [None]:
# rjd_df - cross-validated fit on the Ruddit data
rjd_texts = np.array(rjd_df['text'])
rjd_targets = np.array(rjd_df['toxicity'])
rjd_lt_preds, rjd_mt_preds, rjd_preds = fit_validate_predict(pipe, rjd_texts, rjd_targets, folds = 5)

In [None]:
# # Grid Search
# param_grid = {
#     'vect__max_df': np.concatenate([np.linspace(0, 1, 11), range(1, 10, 1)]),
#     'vect__min_df': np.concatenate([np.linspace(0, 1, 11), range(1, 10, 1)]),
#     'vect__ngram_range': [(i, j) for i in range(1, 7) for j in range(1, 7) if j > i]
# }

# grid = GridSearchCV(pipe, cv = 3, param_grid = param_grid, n_jobs = -1, verbose = 1)
# grid.fit(X_train,y_train)
# print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

In [None]:
# pipe['vect'].get_feature_names()

# sorted(list(zip(pipe['vect'].get_feature_names(), np.round(pipe['ridge'].coef_, 2))), 
#     key = lambda x: x[1], 
#     reverse = True
# )[:10]

## Ensemble Modeling
### Finding optimal weights

I have used *scipy.optimize* to find the optimal weights (see the [documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.brute.html)) by brute-forcing over a grid of possible weights.

**NOTE #1**: When adding a new dataset, append the corresponding *lt_preds* and *mt_preds* to the end of the two lists in *params*.

In [None]:
# Parameters handed to the objective: (less-toxic predictions, more-toxic predictions).
# Each entry is a list with one fold-averaged prediction vector per dataset.
params = (
    [jtc_lt_preds.mean(axis = 1), rjd_lt_preds.mean(axis = 1)],
    [jtc_mt_preds.mean(axis = 1), rjd_mt_preds.mean(axis = 1)]
)

# Function which must be minimized
def func(x, *param):
    """Negative pairwise accuracy of the ensemble weighted by `x`.

    `x` holds one weight per dataset. `param` is the `(lt_list, mt_list)` pair
    that `scipy.optimize.brute` forwards through `args`; when it is omitted the
    module-level `params` is used instead. The result is minus the share of
    validation pairs whose weighted less-toxic score is below the weighted
    more-toxic score, so minimising it maximises the accuracy.
    """
    lt_list, mt_list = param if param else params
    lt_score = sum(weight * pred for weight, pred in zip(x, lt_list))
    mt_score = sum(weight * pred for weight, pred in zip(x, mt_list))
    return -1 * (lt_score < mt_score).mean()

# Find optimized weights.
# One search range per dataset: len(params) is always 2 (the lt/mt pair), so the
# ranges are sized from the inner list to stay correct when a dataset is added.
resbrute = scipy.optimize.brute(
    func,
    ranges = [slice(0, 1, 0.01) for _ in range(len(params[0]))],
    args = params,
    full_output = True,
    finish = None
)
print(f'- Optimal weights: {resbrute[0]}\n- Global Minimum: {resbrute[1] * -1}')

### Calculate Final Predictions
We have the optimal weights and we have the predictions and the final prediction can be calculated using the two.

**NOTE #1**: When adding a new dataset, append the corresponding *_preds* to the end of the *preds* list.

In [None]:
# Fold-averaged test predictions, one vector per dataset
preds = [fold_preds.mean(axis = 1) for fold_preds in (jtc_preds, rjd_preds)]

# Scale each prediction vector by its optimal weight, then add them up element-wise
weighted_preds = [preds[i] * resbrute[0][i] for i in range(len(preds))]
y_pred = np.array(weighted_preds).sum(axis = 0)

## Creating the Submission
The predictions are first ranked to get rid of any ties.

In [None]:
# Remove ties: ordinal ranking gives every comment a distinct rank
y_pred = scipy.stats.rankdata(y_pred, method = 'ordinal')

# Create the submission frame. It is kept separate from the write below,
# because `to_csv` returns None when given a path and `submission_df` used to
# end up bound to None.
submission_df = pd.DataFrame(data = {
    'comment_id': test_df['comment_id'],
    'score': y_pred
})

# Write submission file
submission_df.to_csv('submission.csv', index = False)