## NLP English profanity words

In [None]:
!pip install --upgrade nltk

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer # CountVectorizer[Bad Choice Acc=80%]
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import joblib
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch every NLTK resource the notebook uses, not only WordNet:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize
#     (recent NLTK releases look for 'punkt_tab', older ones for 'punkt')
#   - 'stopwords': corpus read by stopwords.words('english')
#   - 'wordnet': data used by WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jose-\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Read in data

In [3]:
# Load the labelled corpus from the working directory and preview the first rows.
csv_path = 'English_profanity_words.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,is_offensive,text
0,0,Then go to the village pump and suggest they c...
1,1,ANTI GREEK NATIONALIS -WIKIPEDIA \n\nHi Alexik...
2,1,Dis hoe wasnt dis violent on Lottery Ticket 😂😂
3,0,It is better for Atabay not helping the banned...
4,0,"""is in CamelCase. """"SiCKO"""" is not CamelCase,..."


In [4]:
# Summarize columns, dtypes and non-null counts; the counts reveal whether any
# 'text' entries are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184354 entries, 0 to 184353
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   is_offensive  184354 non-null  int64 
 1   text          184350 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.8+ MB


### Function to clean text

In [5]:
# Built once when this cell runs instead of on every call: clean_text is applied
# row by row, so reloading the stop-word list and re-creating the lemmatizer per
# row is pure overhead.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_NON_LETTERS = re.compile(r'[^a-zA-Z\s]')


def clean_text(text):
    """Normalize a raw string into space-separated, lemmatized tokens.

    Processing steps, in order:
      1. Remove every character that is not an ASCII letter or whitespace.
      2. Lowercase the result.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Drop English stop words.
      5. Lemmatize each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none remain).
    """
    letters_only = _NON_LETTERS.sub('', text).lower()
    tokens = [
        _LEMMATIZER.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in _STOP_WORDS
    ]
    return ' '.join(tokens)

### Apply cleaning function to 'text' column

In [6]:
# A few 'text' entries are missing (see the non-null counts from data.info()).
# astype(str) alone would turn them into the literal string 'nan', which would
# then survive cleaning as a spurious token, so map them to empty text first.
data['text'] = data['text'].fillna('').astype(str).apply(clean_text)
data.head()

Unnamed: 0,is_offensive,text
0,0,go village pump suggest change language rfc set
1,1,anti greek nationalis wikipedia hi alexikoua y...
2,1,dis hoe wasnt dis violent lottery ticket
3,0,better atabay helping banned vandal pushing pov
4,0,camelcase sicko camelcase camelcase rule r bal...


### Select the text and label columns

In [7]:
# Model input (cleaned text) and binary target (1 = offensive).
texts, y = data['text'], data['is_offensive']

### Split the data into training and testing sets

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    y,
    test_size=0.2,
    random_state=42,
)

### Vectorize the text

In [9]:
# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with the already-fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

### Train the model

In [10]:
# Linear SVM with class re-weighting, wrapped in a calibrator and fitted on the
# TF-IDF features of the training split.
model = LinearSVC(
    class_weight="balanced",
    dual=False,
    tol=1e-2,
    max_iter=int(1e5),
)
cclf = CalibratedClassifierCV(model)
cclf.fit(X_train_vectorized, y_train)

### Evaluate the model

In [11]:
# Mean accuracy on the data the model was fitted on and on the held-out split.
train_accuracy, test_accuracy = (
    cclf.score(X_train_vectorized, y_train),
    cclf.score(X_test_vectorized, y_test),
)

for label, accuracy in (("Train Accuracy:", train_accuracy), ("Test Accuracy:", test_accuracy)):
    print(label, accuracy)

Train Accuracy: 0.9848931741285436
Test Accuracy: 0.9575818393859673


### Save the model

In [12]:
# Persist the fitted vectorizer and the calibrated classifier side by side so
# both can be reloaded together for scoring.
for artifact, filename in ((vectorizer, 'vectorizer.joblib'), (cclf, 'model.joblib')):
    joblib.dump(artifact, filename)
print('Model Saved')

Model Saved


### Load the vectorizer and the model

In [14]:
# Reload the persisted artifacts; 'model.joblib' holds the calibrated classifier
# written by the save step.
vectorizer = joblib.load('vectorizer.joblib')
model = joblib.load('model.joblib')

text = ['fck you', 'wtf asshle' , 'how are you', 'dog live in the sea', 'but you still ugly hoe shut up']

# The vectorizer was fitted on text that had been passed through clean_text, so
# new samples must go through the same cleaning before being vectorized;
# otherwise inference sees differently-normalized tokens than training did.
text_clean = [clean_text(sample) for sample in text]
text_vectorized = vectorizer.transform(text_clean)
txt = model.predict(text_vectorized)

print(txt)

[1 1 0 0 1]
