# Training Model

In [19]:
import numpy as np
import fasttext
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata
from emoji import demojize
import csv

In [20]:
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

In [21]:
def text_cleaning(text_data):

  # Remove accented characters
  text_data = unicodedata.normalize('NFKD', text_data).encode('ascii', 'ignore').decode('utf-8', 'ignore')

  # Case conversion
  text_data = text_data.lower()

  # Demojize
  text_data = demojize(text_data)

  # Reducing repeated punctuations
  pattern_punct = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')
  text_data = pattern_punct.sub(r'\1', text_data)
  
  # Prevent redundant replacements of single-space with single-space
  text_data = re.sub(' {2,}',' ', text_data)
  
  # Remove special characters
  text_data = re.sub(r"[^a-zA-Z?!]+", ' ', text_data)

  # Converting text to strings
  text_data = str(text_data)

  # Tokenization
  tokenizer = ToktokTokenizer()
  text_data = tokenizer.tokenize(text_data)

  # Removing stopwords
  text_data = [item for item in text_data if item not in stop_words]
  
  # Lemmatization
  text_data = [lemmatizer.lemmatize(word = w, pos = 'v') for w in text_data]
  
  # Convert list of tokens to string data type
  text_data = ' '.join (text_data)

  return text_data

In [22]:
model = fasttext.train_supervised('train.txt')

In [23]:
model.save_model("profanity_model_urdu.bin")

# Testing Model

In [1]:
import pandas as pd
import numpy as np
import fasttext
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import unicodedata
from emoji import demojize

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def text_cleaning(text_data):

  # Remove accented characters
  text_data = unicodedata.normalize('NFKD', text_data).encode('ascii', 'ignore').decode('utf-8', 'ignore')

  # Case conversion
  text_data = text_data.lower()

  # Demojize
  text_data = demojize(text_data)

  # Reducing repeated punctuations
  pattern_punct = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')
  text_data = pattern_punct.sub(r'\1', text_data)
  
  # Prevent redundant replacements of single-space with single-space
  text_data = re.sub(' {2,}',' ', text_data)
  
  # Remove special characters
  text_data = re.sub(r"[^a-zA-Z?!]+", ' ', text_data)

  # Converting text to strings
  text_data = str(text_data)

  # Tokenization
  tokenizer = ToktokTokenizer()
  text_data = tokenizer.tokenize(text_data)

  # Removing stopwords
  text_data = [item for item in text_data if item not in stop_words]
  
  # Lemmatization
  text_data = [lemmatizer.lemmatize(word = w, pos = 'v') for w in text_data]
  
  # Convert list of tokens to string data type
  text_data = ' '.join (text_data)

  return text_data

In [4]:
model = fasttext.load_model('profanity_model_urdu(2).bin')

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
user_input = ""
while user_input != "stop":
    user_input = input("Enter something: ")
    print("\nInput:", user_input)
    user_input = text_cleaning(user_input)

    labels, probabilities = model.predict(user_input, k=2)

    for label, probability in zip(labels, probabilities):
        if label[9:] == "1":
            print(f'NSFW: {round(probability*100, 1)}%')
        else:
            print(f' SFW: {round(probability*100, 1)}%')


Input: tum ghalat ho
NSFW: 97.7%
 SFW: 2.3%

Input: ghalat
NSFW: 97.2%
 SFW: 2.8%

Input: stop
 SFW: 100.0%
NSFW: 0.0%
