In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import re

In [2]:
!unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip

Archive:  ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
  inflating: train.csv               


In [3]:
# Load the training CSV from the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Count how many rows carry each value of the `toxic` column.
data.toxic.value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [6]:
# Count how many rows carry each value of the `severe_toxic` column.
data.severe_toxic.value_counts()

0    157976
1      1595
Name: severe_toxic, dtype: int64

In [7]:
# Count how many rows carry each value of the `obscene` column.
data.obscene.value_counts()

0    151122
1      8449
Name: obscene, dtype: int64

In [8]:
# Count how many rows carry each value of the `threat` column.
data.threat.value_counts()

0    159093
1       478
Name: threat, dtype: int64

In [9]:
# Count how many rows carry each value of the `insult` column.
data.insult.value_counts()

0    151694
1      7877
Name: insult, dtype: int64

In [10]:
# Count how many rows carry each value of the `identity_hate` column.
data.identity_hate.value_counts()

0    158166
1      1405
Name: identity_hate, dtype: int64

In [11]:
!unzip ../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip

Archive:  ../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
  inflating: test_labels.csv         


In [12]:
# Read the test-label CSV from the working directory.
# NOTE(review): `labels` is not referenced by any later cell visible here.
labels = pd.read_csv(filepath_or_buffer='./test_labels.csv')

In [13]:
# Multiplier applied to each binary label column before the columns are
# summed into a single score.
value_dict = {
    'toxic': 1,
    'insult': 5,
    'obscene': 3,
    'severe_toxic': 6,
    'identity_hate': 8,
    'threat': 10,
}

In [14]:
for label, weight in value_dict.items():
    # Scale each label flag by its weight, in place on `data`.
    # The flag is re-binarised (non-zero -> 1) before scaling so the cell is
    # idempotent: the previous `data[i] * weight` read the column it had just
    # overwritten, so every re-run multiplied the weight in again.
    data[label] = (data[label] != 0).astype('int64') * weight

In [15]:
# Build the modelling frame: the raw comment text plus the row-wise sum of the
# weighted label columns as the regression target.
processed_data = pd.DataFrame(
    {
        'text': data['comment_text'],
        'toxic_value': data[list(value_dict)].sum(axis=1),
    }
)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Patterns are compiled once here rather than on every call of text_cleaning.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Clean a piece of text into a basic form for NLP.

    Steps, in the order they are applied:
    1. Remove embedded URLs (``http://``, ``https://`` and ``www.`` links).
    2. Remove HTML tags by parsing with BeautifulSoup (lxml) and keeping the text.
    3. Remove characters in the emoji code-point ranges. The last range
       (U+24C2 to U+1F251) is very wide and also covers non-emoji characters.
    4. Replace every remaining character that is not an ASCII letter or a
       digit with a space (special characters such as &, #, punctuation).
    5. Collapse runs of spaces and strip leading/trailing spaces.

    text - Text piece to be cleaned. ``None`` and float NaN (pandas' missing
           value marker) are treated as empty text; any other non-string
           value is converted with ``str()`` first.

    Returns the cleaned string ('' for missing input).

    NOTE(review): URLs are stripped before the HTML is parsed, so a URL inside
    a tag attribute (e.g. href="http://...") is cut in the middle of the tag
    together with everything up to the next whitespace. Confirm this order is
    intended.
    '''
    # Missing values would otherwise raise TypeError inside the first regex.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _URL_PATTERN.sub(r'', text)  # Removes website links

    text = BeautifulSoup(text, 'lxml').get_text()  # Removes HTML tags

    # Emoji are deleted outright (not replaced by a space), so words on either
    # side of an emoji are joined.
    text = _EMOJI_PATTERN.sub(r'', text)

    text = _NON_ALNUM_PATTERN.sub(" ", text)  # Remove special characters
    text = _MULTI_SPACE_PATTERN.sub(' ', text)  # Remove extra spaces
    return text.strip()  # Remove spaces at the beginning and end of the string

In [18]:
# Run every training comment through text_cleaning, replacing the raw text column.
processed_data['text'] = processed_data['text'].map(text_cleaning)

In [19]:
# TF-IDF over 1- to 3-grams, using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
)

In [20]:
# Fit the vocabulary/IDF on the cleaned training comments and vectorize them.
text = vectorizer.fit_transform(raw_documents=processed_data['text'])

In [21]:
from sklearn.linear_model import Ridge
# Ridge regression with its default regularisation strength spelled out.
model = Ridge(alpha=1.0)


In [22]:
# Regress the weighted toxicity score on the TF-IDF features.
model.fit(X=text, y=processed_data['toxic_value'])

Ridge()

In [23]:
# Comments to be scored, read from the jigsaw-toxic-severity-rating input directory.
valid = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-severity-rating/comments_to_score.csv'
)

In [24]:
# Apply the same cleaning used for the training text to the comments being scored.
valid_text = valid['text'].map(text_cleaning)

In [25]:
# Vectorize with the already-fitted vectorizer (transform only, no refit).
# NOTE(review): this rebinds `valid_text` from the cleaned strings to the
# feature matrix, so re-running this cell on its own passes the matrix, not
# the strings, back into transform.
valid_text = vectorizer.transform(raw_documents=valid_text)

In [26]:
# Predicted score for every comment in `valid`.
out = model.predict(X=valid_text)

In [27]:
# Pair each comment_id with its predicted score.
sub = pd.DataFrame(
    {
        'comment_id': valid['comment_id'],
        'score': out,
    }
)

In [28]:
import pickle

In [29]:
filename = 'model.pkl'
# Persist the fitted model. The context manager closes (and flushes) the file
# even if dump() raises; the previous bare open() never closed the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [30]:
# Output path for the pickled vectorizer.
filename2 = 'vectorizer.pkl'

In [31]:
# Persist the fitted vectorizer. The context manager closes (and flushes) the
# file even if dump() raises; the previous bare open() never closed the handle.
with open(filename2, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)