In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
from bs4 import BeautifulSoup
import re

In [2]:
# Read the Jigsaw toxic-comment training CSV into a DataFrame.
data = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/train.csv'
)

In [3]:
# Weight assigned to each label column name (insertion order is preserved).
value_dict = dict(
    toxic=1,
    insult=5,
    obscene=3,
    severe_toxic=6,
    identity_hate=8,
    threat=10,
)

In [4]:
# Scale every label column in `data` by its weight from `value_dict`.
# The label columns are assumed to hold 0/1 flags (TODO confirm against the CSV).
# Clipping to at most 1 before multiplying undoes any earlier scaling, so
# re-running this cell leaves the columns unchanged instead of compounding the
# weights (e.g. 5 -> 25 -> 125 for 'insult').
for label, weight in value_dict.items():
    data[label] = data[label].clip(upper=1) * weight

In [5]:
# Two-column frame: the raw comment text and the row-wise sum of the
# label columns listed in value_dict.
label_columns = list(value_dict.keys())
processed_data = pd.DataFrame({
    'text': data['comment_text'],
    'toxic_value': data[label_columns].sum(axis=1),
})

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Patterns are compiled once when the cell runs rather than on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"  # wide range; also spans many non-emoji code points
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations, in order:
    1. Removes embedded URL links (http://, https:// and www. prefixes)
    2. Removes HTML tags (keeps only the text extracted by BeautifulSoup)
    3. Replaces emojis and other characters in the emoji ranges with a space
    4. Replaces special characters like &, #, etc. with a space
    5. Collapses runs of spaces and strips leading/trailing spaces

    text - Text piece to be cleaned. None or NaN is treated as empty text.

    Returns the cleaned string ('' for missing input).
    '''
    # Missing values (None, or the float NaN pandas uses) have nothing to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = _URL_PATTERN.sub(r'', text)  # Removes website links

    soup = BeautifulSoup(text, 'lxml')  # Removes HTML tags
    text = soup.get_text()

    # Substitute a space (not an empty string) so that the words on either
    # side of an emoji stay separate tokens: "good<emoji>bad" -> "good bad".
    text = _EMOJI_PATTERN.sub(' ', text)

    text = _NON_ALNUM_PATTERN.sub(" ", text)  # Remove special characters
    text = _MULTI_SPACE_PATTERN.sub(' ', text)  # Remove extra spaces
    text = text.strip()  # Remove spaces at the beginning and end of the string

    return text

In [7]:
# Clean every comment in place, then fit a TF-IDF vectorizer on the cleaned
# text and keep the resulting document-term matrix in `text`.
cleaned_text = processed_data['text'].apply(text_cleaning)
processed_data['text'] = cleaned_text

vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),  # unigrams through trigrams
)
text = vectorizer.fit_transform(processed_data['text'])

In [8]:
# LightGBM regressor; hyperparameters are collected in one place for easy tuning.
lgbm_params = dict(
    n_estimators=1500,
    learning_rate=0.1,
    max_depth=9,
    num_leaves=512,
)
model = lgb.LGBMRegressor(**lgbm_params)

In [9]:
# Fit the regressor on the TF-IDF matrix against the summed label weights.
model.fit(
    text,
    processed_data['toxic_value'],
)

LGBMRegressor(max_depth=9, n_estimators=1500, num_leaves=512)

In [10]:
import pickle

# Serialize the fitted model to disk. The context manager guarantees the file
# is flushed and closed even if pickling raises part-way through.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [11]:
# Serialize the fitted vectorizer to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises part-way through.
filename2 = 'vectorizer.pkl'
with open(filename2, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)