In [1]:
import numpy as np
import pandas as pd
import sklearn
import pickle
import re
from bs4 import BeautifulSoup

In [2]:
# Load the pre-trained model and its fitted vectorizer from the attached
# dataset directory.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that come from a trusted source.
# The `with` blocks close each file handle as soon as the object is loaded;
# the bare `pickle.load(open(...))` form never closed them explicitly.
with open('../input/baseline-toxic/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('../input/baseline-toxic/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [3]:
def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated alphanumeric tokens.

    The steps run in this order:
    1. Strip embedded links (http://, https:// and www. forms).
    2. Parse what is left as HTML and keep only the text content.
    3. Delete every character matched by the emoji ranges listed below.
    4. Replace each character that is not a-z, A-Z or a digit with a space.
    5. Collapse runs of spaces and trim both ends.

    text - Text piece to be cleaned.

    Returns the cleaned string.
    '''
    # 1. Links are removed first, before the text is parsed as HTML.
    link_re = re.compile(r'https?://\S+|www\.\S+')
    without_links = link_re.sub(r'', text)

    # 2. Let BeautifulSoup (lxml parser) drop the markup and return the text.
    without_tags = BeautifulSoup(without_links, 'lxml').get_text()

    # 3. Emoji are deleted outright (no space is left behind), unlike the
    #    special characters handled in step 4.
    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    # contains non-emoji characters such as CJK ideographs - confirm intended.
    emoji_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    emoji_re = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    without_emoji = emoji_re.sub(r'', without_tags)

    # 4. Every remaining special character becomes a single space.
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)

    # 5. Squeeze repeated spaces, then drop leading/trailing whitespace.
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()

In [4]:
# Load the comments that need a score; later cells read its 'text' and
# 'comment_id' columns.
valid=pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [5]:
text=valid['text'].apply(text_cleaning)

In [6]:
text=vectorizer.transform(text)

In [7]:
out=model.predict(text)

In [8]:
# Build the submission table: each comment_id from `valid` alongside the
# corresponding prediction in `out`.
sub=pd.DataFrame({"comment_id":valid["comment_id"],"score":out})

In [9]:
# Write the submission file to the working directory; index=False keeps the
# DataFrame index out of the CSV.
sub.to_csv("submission.csv",index=False)