In [None]:
import pandas as pd
import numpy as np
import fasttext

In [None]:
# Load the raw training data, then drop the first column and keep the rest.
df = pd.read_csv('train.csv')
df = df.iloc[:, 1:]

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Per-category indicator columns that are collapsed into one binary label below.
cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [None]:
# Binary target: 1 when at least one of the columns in `cols` is truthy for
# the row, otherwise 0.
df['label'] = (
    df.loc[:, cols]
    .any(axis='columns')
    .astype(int)
)

In [None]:
# Preview the frame again, now including the derived `label` column.
df.head(n=5)

In [None]:
# Class balance of the binary label (count of rows per label value).
df.loc[:, 'label'].value_counts()

In [None]:
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [None]:
import unicodedata
from emoji import demojize

In [None]:
# English stop words held in a set for fast membership tests during cleaning.
stop_words = set(stopwords.words('english'))

# Single lemmatizer instance reused for every comment.
lemmatizer = WordNetLemmatizer()

In [None]:
# Patterns are compiled once here instead of on every call of text_cleaning.
_REPEATED_PUNCT_RE = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z?!]+")


def text_cleaning(text_data):
  """Normalize one raw comment into a space-separated string of tokens.

  Steps, in order: coerce to ``str``; replace emoji with their textual
  names; strip accents / non-ASCII characters; lowercase; collapse repeated
  punctuation; replace every run of characters other than letters, ``?``
  and ``!`` with a single space; tokenize; drop English stop words;
  lemmatize each token as a verb; join the tokens with single spaces.

  Args:
    text_data: The raw comment. ``None`` and float NaN are treated as an
      empty comment; any other non-string value is converted with ``str``.

  Returns:
    The cleaned comment as a single string (possibly empty).
  """
  # Coerce up front: every step below calls string methods, so the
  # conversion has to happen before them, not after.
  if text_data is None or (isinstance(text_data, float) and text_data != text_data):
    return ''
  text_data = str(text_data)

  # Demojize first. Emoji are non-ASCII, so running this after the ASCII
  # encode below would find nothing left to convert.
  text_data = demojize(text_data)

  # Remove accented characters (and any remaining non-ASCII symbols)
  text_data = unicodedata.normalize('NFKD', text_data).encode('ascii', 'ignore').decode('ascii')

  # Case conversion
  text_data = text_data.lower()

  # Reducing repeated punctuations (e.g. "!!!" -> "!")
  text_data = _REPEATED_PUNCT_RE.sub(r'\1', text_data)

  # Remove special characters. Every run of characters other than letters,
  # '?' and '!' becomes one space; this also collapses runs of spaces, so no
  # separate whitespace-squeezing pass is needed.
  text_data = _NON_ALPHA_RE.sub(' ', text_data)

  # Tokenization
  tokens = ToktokTokenizer().tokenize(text_data)

  # Removing stopwords, then lemmatizing the remaining tokens as verbs
  tokens = [
      lemmatizer.lemmatize(word = tok, pos = 'v')
      for tok in tokens
      if tok not in stop_words
  ]

  # Convert list of tokens back to a single string
  return ' '.join(tokens)

In [None]:
# Run the cleaning function over every comment, element by element.
df['comment_text'] = df['comment_text'].map(text_cleaning)

In [None]:
# Inspect a few cleaned comments.
df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Work on an independent copy: the fastText formatting applied to `train`
# later overwrites its 'label' column, and a plain alias (`train = df`) would
# overwrite the numeric labels in `df` as well.
train = df.copy()

# Numeric 0/1 labels, kept separately from the formatted training lines.
y = df['label']

In [None]:
# Build one fastText training line per row: "__label__<0|1> <cleaned comment>".
# The prefix is derived from `y` (the numeric labels captured above) rather
# than from train['label'] itself, so re-running this cell gives the same
# result instead of stacking another "__label__" prefix onto formatted lines.
train['label'] = '__label__' + y.astype(str) + ' ' + train['comment_text']
train.head()

In [None]:
# Keep only the formatted training line; still a one-column DataFrame.
train = train.loc[:, ['label']]

In [None]:
import csv

In [None]:
# Write one fastText training example per line.
# The lines are written directly rather than through DataFrame.to_csv: the
# csv-dialect workaround used for that (empty quotechar, a space acting as
# both delimiter and escapechar) is rejected by the csv module on newer
# Python versions (an empty quotechar is not allowed from 3.11 on), and where
# it is accepted the spaces inside each line come out escaped.
with open('train.txt', 'w', encoding = 'utf-8') as train_file:
  train_file.writelines(f'{line}\n' for line in train['label'])

In [None]:
# Fit a supervised fastText classifier on the prepared file; no
# hyperparameters are passed, so the library defaults apply.
model = fasttext.train_supervised(input = 'train.txt')

In [None]:
# Persist the trained classifier to disk as a fastText binary model file.
model.save_model("profanity_model_eng.bin")