In [1]:
#https://www.kaggle.com/coldfir3/tokenizer-training-tfidf-ridge-lb-0-860
#https://www.kaggle.com/vitaleey/tfidf-ridge

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import progressbar
import nltk
import matplotlib.pyplot as plt
import re

from tqdm.auto import tqdm
from bs4 import BeautifulSoup

from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

In [3]:
TRAIN_DATA_PATH = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv"
TEST_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"
VALID_DATA_PATH = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
SAMPLE_SUBMISSION = "/kaggle/input/jigsaw-toxic-severity-rating/sample_submission.csv"

In [4]:
df_train = pd.read_csv(TRAIN_DATA_PATH)
df_test = pd.read_csv(TEST_DATA_PATH)
df_validation_data = pd.read_csv(VALID_DATA_PATH)
df_sample_submission = pd.read_csv(SAMPLE_SUBMISSION)

In [5]:
# Create a score that messure how much toxic is a comment
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# cat_mtpl = {'obscene': 3, 'toxic': 4, 'threat': 4, 
#             'insult': 2, 'severe_toxic': 4, 'identity_hate': 2}

for category in cat_mtpl:
    df_train[category] = df_train[category] * cat_mtpl[category]

df_train['score'] = df_train.loc[:, 'toxic':'identity_hate'].sum(axis=1)
df_train['y'] = df_train['score']

df_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
min_len = (df_train['y'] > 0).sum()  # len of toxic comments
df_y0_undersample = df_train[df_train['y'] == 0].sample(n=min_len, random_state=116)  # take non toxic comments
df_train_new = pd.concat([df_train[df_train['y'] > 0], df_y0_undersample])  # make new df
df_train_new

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,0.32,1.5,0.16,0.0,0.64,0.0,2.62,2.62
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,0.32,0.0,0.00,0.0,0.00,0.0,0.32,0.32
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",0.32,0.0,0.00,0.0,0.00,0.0,0.32,0.32
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,0.32,0.0,0.16,0.0,0.64,1.5,2.62,2.62
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",0.32,0.0,0.16,0.0,0.64,0.0,1.12,1.12
...,...,...,...,...,...,...,...,...,...,...
100165,182e8baf67f78c47,:Zak Hansen\nWelcome to Wikipedia! We could re...,0.00,0.0,0.00,0.0,0.00,0.0,0.00,0.00
141019,f291cbe3bc03ac7d,21 and 23 SAS are not part of the SAS as if th...,0.00,0.0,0.00,0.0,0.00,0.0,0.00,0.00
88541,ecdac37e135384de,There is nothing 2 discuss. Krivocheev's casua...,0.00,0.0,0.00,0.0,0.00,0.0,0.00,0.00
125978,a1bc3bdc4c08ccf0,Why should I give you any benefit of the doubt...,0.00,0.0,0.00,0.0,0.00,0.0,0.00,0.00


In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')

#import string

def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations include the following:-
    1. Remove special charecters like &, #, etc
    2. Removes extra spaces
    3. Removes embedded URL links
    4. Removes HTML tags
    5. Removes emojis
    
    text - Text piece to be cleaned.
    '''
    url = re.compile(r'https?://\S+|www\.\S+')  # Removes website links
    text = url.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml')  # Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) # Remove special Charecters
    text = re.sub(' +', ' ', text) # Remove Extra Spaces
    text = text.strip().lower() # remove spaces at the beginning and at the end of string and make string lower
    
    #table=str.maketrans("","",string.punctuation)
    #text=text.translate(table)
    
    # lemmatization
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])
    # del stopwords
    text = ' '.join([word for word in text.split(' ') if word not in stop])

    return text

In [8]:
def clean(data, col):
    
    data[col] = data[col].str.replace('https?://\S+|www\.\S+', ' social medium ')      
        
    data[col] = data[col].str.lower()
    data[col] = data[col].str.replace("4", "a") 
    data[col] = data[col].str.replace("2", "l")
    data[col] = data[col].str.replace("5", "s") 
    data[col] = data[col].str.replace("1", "i") 
    data[col] = data[col].str.replace("!", "i") 
    data[col] = data[col].str.replace("|", "i") 
    data[col] = data[col].str.replace("0", "o") 
    data[col] = data[col].str.replace("l3", "b") 
    data[col] = data[col].str.replace("7", "t") 
    data[col] = data[col].str.replace("7", "+") 
    data[col] = data[col].str.replace("8", "ate") 
    data[col] = data[col].str.replace("3", "e") 
    data[col] = data[col].str.replace("9", "g")
    data[col] = data[col].str.replace("6", "g")
    data[col] = data[col].str.replace("@", "a")
    data[col] = data[col].str.replace("$", "s")
    data[col] = data[col].str.replace("#ofc", " of fuckin course ")
    data[col] = data[col].str.replace("fggt", " faggot ")
    data[col] = data[col].str.replace("your", " your ")
    data[col] = data[col].str.replace("self", " self ")
    data[col] = data[col].str.replace("cuntbag", " cunt bag ")
    data[col] = data[col].str.replace("fartchina", " fart china ")    
    data[col] = data[col].str.replace("youi", " you i ")
    data[col] = data[col].str.replace("cunti", " cunt i ")
    data[col] = data[col].str.replace("sucki", " suck i ")
    data[col] = data[col].str.replace("pagedelete", " page delete ")
    data[col] = data[col].str.replace("cuntsi", " cuntsi ")
    data[col] = data[col].str.replace("i'm", " i am ")
    data[col] = data[col].str.replace("offuck", " of fuck ")
    data[col] = data[col].str.replace("centraliststupid", " central ist stupid ")
    data[col] = data[col].str.replace("hitleri", " hitler i ")
    data[col] = data[col].str.replace("i've", " i have ")
    data[col] = data[col].str.replace("i'll", " sick ")
    data[col] = data[col].str.replace("fuck", " fuck ")
    data[col] = data[col].str.replace("f u c k", " fuck ")
    data[col] = data[col].str.replace("shit", " shit ")
    data[col] = data[col].str.replace("bunksteve", " bunk steve ")
    data[col] = data[col].str.replace('wikipedia', ' social medium ')
    data[col] = data[col].str.replace("faggot", " faggot ")
    data[col] = data[col].str.replace("delanoy", " delanoy ")
    data[col] = data[col].str.replace("jewish", " jewish ")
    data[col] = data[col].str.replace("sexsex", " sex ")
    data[col] = data[col].str.replace("allii", " all ii ")
    data[col] = data[col].str.replace("i'd", " i had ")
    data[col] = data[col].str.replace("'s", " is ")
    data[col] = data[col].str.replace("youbollocks", " you bollocks ")
    data[col] = data[col].str.replace("dick", " dick ")
    data[col] = data[col].str.replace("cuntsi", " cuntsi ")
    data[col] = data[col].str.replace("mothjer", " mother ")
    data[col] = data[col].str.replace("cuntfranks", " cunt ")
    data[col] = data[col].str.replace("ullmann", " jewish ")
    data[col] = data[col].str.replace("mr.", " mister ")
    data[col] = data[col].str.replace("aidsaids", " aids ")
    data[col] = data[col].str.replace("njgw", " nigger ")
    data[col] = data[col].str.replace("wiki", " social medium ")
    data[col] = data[col].str.replace("administrator", " admin ")
    data[col] = data[col].str.replace("gamaliel", " jewish ")
    data[col] = data[col].str.replace("rvv", " vanadalism ")
    data[col] = data[col].str.replace("admins", " admin ")
    data[col] = data[col].str.replace("pensnsnniensnsn", " penis ")
    data[col] = data[col].str.replace("pneis", " penis ")
    data[col] = data[col].str.replace("pennnis", " penis ")
    data[col] = data[col].str.replace("pov.", " point of view ")
    data[col] = data[col].str.replace("vandalising", " vandalism ")
    data[col] = data[col].str.replace("cock", " dick ")
    data[col] = data[col].str.replace("asshole", " asshole ")
    data[col] = data[col].str.replace("youi", " you ")
    data[col] = data[col].str.replace("afd", " all fucking day ")
    data[col] = data[col].str.replace("sockpuppets", " sockpuppetry ")
    data[col] = data[col].str.replace("iiprick", " iprick ")
    data[col] = data[col].str.replace("penisi", " penis ")
    data[col] = data[col].str.replace("warrior", " warrior ")
    data[col] = data[col].str.replace("loil", " laughing out insanely loud ")
    data[col] = data[col].str.replace("vandalise", " vanadalism ")
    data[col] = data[col].str.replace("helli", " helli ")
    data[col] = data[col].str.replace("lunchablesi", " lunchablesi ")
    data[col] = data[col].str.replace("special", " special ")
    data[col] = data[col].str.replace("ilol", " i lol ")
    data[col] = data[col].str.replace(r'\b[uU]\b', 'you')
    data[col] = data[col].str.replace(r"what's", "what is ")
    data[col] = data[col].str.replace(r"\'s", " is ")
    data[col] = data[col].str.replace(r"\'ve", " have ")
    data[col] = data[col].str.replace(r"can't", "cannot ")
    data[col] = data[col].str.replace(r"n't", " not ")
    data[col] = data[col].str.replace(r"i'm", "i am ")
    data[col] = data[col].str.replace(r"\'re", " are ")
    data[col] = data[col].str.replace(r"\'d", " would ")
    data[col] = data[col].str.replace(r"\'ll", " will ")
    data[col] = data[col].str.replace(r"\'scuse", " excuse ")
    data[col] = data[col].str.replace('\s+', ' ')  # will remove more than one whitespace character
#     text = re.sub(r'\b([^\W\d_]+)(\s+\1)+\b', r'\1', re.sub(r'\W+', ' ', text).strip(), flags=re.I)  # remove repeating words coming immediately one after another
    data[col] = data[col].str.replace(r'(.)\1+', r'\1\1') # 2 or more characters are replaced by 2 characters
#     text = re.sub(r'((\b\w+\b.{1,2}\w+\b)+).+\1', r'\1', text, flags = re.I)
    data[col] = data[col].str.replace("[:|♣|'|§|♠|*|/|?|=|%|&|-|#|•|~|^|>|<|►|_]", '')
    
    
    data[col] = data[col].str.replace(r"what's", "what is ")    
    data[col] = data[col].str.replace(r"\'ve", " have ")
    data[col] = data[col].str.replace(r"can't", "cannot ")
    data[col] = data[col].str.replace(r"n't", " not ")
    data[col] = data[col].str.replace(r"i'm", "i am ")
    data[col] = data[col].str.replace(r"\'re", " are ")
    data[col] = data[col].str.replace(r"\'d", " would ")
    data[col] = data[col].str.replace(r"\'ll", " will ")
    data[col] = data[col].str.replace(r"\'scuse", " excuse ")
    data[col] = data[col].str.replace(r"\'s", " ")
    
    # Clean some punctutations
    data[col] = data[col].str.replace('\n', ' \n ')
    data[col] = data[col].str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)',r'\1 \2 \3')
    # Replace repeating characters more than 3 times to length of 3
    data[col] = data[col].str.replace(r'([*!?\'])\1\1{2,}',r'\1\1\1')    
    # Add space around repeating characters
    data[col] = data[col].str.replace(r'([*!?\']+)',r' \1 ')    
    # patterns with repeating characters 
    data[col] = data[col].str.replace(r'([a-zA-Z])\1{2,}\b',r'\1\1')
    data[col] = data[col].str.replace(r'([a-zA-Z])\1\1{2,}\B',r'\1\1\1')
    data[col] = data[col].str.replace(r'[ ]{2,}',' ').str.strip()   
    data[col] = data[col].str.replace(r'[ ]{2,}',' ').str.strip()   
    data[col] = data[col].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    
    return data

In [9]:
#tqdm.pandas()
#df_train_new['clean_text'] = df_train_new['comment_text'].progress_apply(text_cleaning)
df_train_new = clean(df_train_new, 'comment_text')

In [10]:
tqdm.pandas()
df_train_new['clean_text'] = df_train_new['comment_text'].progress_apply(text_cleaning)

  0%|          | 0/32450 [00:00<?, ?it/s]

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV,KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgbm
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import Ridge, LogisticRegression, RidgeCV, ElasticNet, SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import StackingRegressor, VotingRegressor

In [12]:
# df_train_new = df_train_new.reset_index(drop=True)
labels = df_train_new['y']
comments = df_train_new['clean_text']

vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb',ngram_range=(3,5))#https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer
comments_tr = vectorizer.fit_transform(comments)
comments_tr

<32450x86655 sparse matrix of type '<class 'numpy.float64'>'
	with 9948508 stored elements in Compressed Sparse Row format>

In [13]:
models = [('ridge05', Ridge(random_state=42, alpha=0.5)),
         ('ridge03', Ridge(random_state=42, alpha=0.3)),
         ('linSVR', LinearSVR(random_state=42)),
          ('sgd', SGDRegressor(random_state=42)),
          ('ElaNet03',ElasticNet(random_state=42, alpha=0.3)),
         ('lgbm',lgbm.LGBMRegressor(random_state=42))]

"""
models = [('ridge', Ridge(random_state=42)),
        ('linSVR', LinearSVR(random_state=42)),
          ('sgd', SGDRegressor(random_state=42)),
          ('elanet',ElasticNet(random_state=42)),
         ('lgbm',lgbm.LGBMRegressor(random_state=42))]

params = {'ridge__alpha': [0.2,0.35,0.5],
      'elanet__alpha': [0.2,0.35,0.5]}
      #"lgbm__n_estimators":[5,10,25,50,100],
       # 'lgbm__num_leaves': [5,10,15,30],
        #"lgbm__max_depth":[5,10,50,100],
        #'lgbm__learning_rate': [0.01,0.05,0.1]
         #}

reg = VotingRegressor(estimators=models)
grid = GridSearchCV(estimator=reg, param_grid=params, cv=KFold(n_splits=3),n_jobs=-1)
grid.fit(comments_tr, labels)
print (grid.best_params_)
"""

'\nmodels = [(\'ridge\', Ridge(random_state=42)),\n        (\'linSVR\', LinearSVR(random_state=42)),\n          (\'sgd\', SGDRegressor(random_state=42)),\n          (\'elanet\',ElasticNet(random_state=42)),\n         (\'lgbm\',lgbm.LGBMRegressor(random_state=42))]\n\nparams = {\'ridge__alpha\': [0.2,0.35,0.5],\n      \'elanet__alpha\': [0.2,0.35,0.5]}\n      #"lgbm__n_estimators":[5,10,25,50,100],\n       # \'lgbm__num_leaves\': [5,10,15,30],\n        #"lgbm__max_depth":[5,10,50,100],\n        #\'lgbm__learning_rate\': [0.01,0.05,0.1]\n         #}\n\nreg = VotingRegressor(estimators=models)\ngrid = GridSearchCV(estimator=reg, param_grid=params, cv=KFold(n_splits=3),n_jobs=-1)\ngrid.fit(comments_tr, labels)\nprint (grid.best_params_)\n'

In [14]:
regressor = VotingRegressor(estimators=models)
regressor.fit(comments_tr, labels)

VotingRegressor(estimators=[('ridge05', Ridge(alpha=0.5, random_state=42)),
                            ('ridge03', Ridge(alpha=0.3, random_state=42)),
                            ('linSVR', LinearSVR(random_state=42)),
                            ('sgd', SGDRegressor(random_state=42)),
                            ('ElaNet03',
                             ElasticNet(alpha=0.3, random_state=42)),
                            ('lgbm', LGBMRegressor(random_state=42))])

In [15]:
# preprocess val data

tqdm.pandas()
df_validation_data = clean(df_validation_data, 'less_toxic')
df_validation_data = clean(df_validation_data, 'more_toxic')
df_validation_data['less_toxic'] = df_validation_data['less_toxic'].progress_apply(text_cleaning)
df_validation_data['more_toxic'] = df_validation_data['more_toxic'].progress_apply(text_cleaning)

less_toxic = vectorizer.transform(df_validation_data['less_toxic'])
more_toxic = vectorizer.transform(df_validation_data['more_toxic'])

# make predictions
y_pred_less = regressor.predict(less_toxic)
y_pred_more = regressor.predict(more_toxic)

(y_pred_less < y_pred_more).mean()

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

0.6795203932509633

In [16]:
df_test = clean(df_test, 'text')
df_test['text'] = df_test['text'].progress_apply(text_cleaning)

df_test['prediction'] = regressor.predict(vectorizer.transform(df_test['text']))
df_test = df_test[['comment_id','prediction']]

df_test['score'] = df_test['prediction']
df_test = df_test[['comment_id','score']]

df_test.to_csv('./submission.csv', index=False)

  0%|          | 0/7537 [00:00<?, ?it/s]