In [None]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import nltk
from nltk.corpus import stopwords
import re

# Read the labelled tweets from disk
df = pd.read_csv("corrected_bass_ab.csv")

# Collapse the labels to a binary target in one pass:
# cast to integer, then classes below 2 become 1 (Offensive) and the rest 0 (Safe)
df["class"] = df["class"].astype(int).lt(2).astype("int64")
def clean_tweet(text):
    """Return `text` with its leading "...:" prefix, @mentions and "!" removed.

    The substitutions run in this order:

    1. Delete the shortest run from the start of the text through the first
       ":" (this applies to any first colon reachable from the start of the
       text, e.g. the one in "http://", not only to retweet-style prefixes).
    2. Delete every "@" followed by word characters.
    3. Delete every exclamation mark.

    Finally, leading and trailing whitespace is stripped; whitespace inside
    the text is left as-is.
    """
    removal_patterns = (
        r"^.*?:",  # start of text through the first colon
        r"@\w+",   # @username mentions
        r"[!]+",   # runs of exclamation marks
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

# Apply cleaning function to tweets
df["tweet"] = df["tweet"].astype(str).apply(clean_tweet)

# Extract sentences and labels
X = df["tweet"].astype(str)
y = df["class"]

# Train-Test Split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF over unigrams and bigrams: keep at most 5000 features and ignore terms
# that occur in fewer than 3 documents or in more than 90% of them
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=3, max_df=0.9)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Handle class imbalance.
# class_weight="balanced" has scikit-learn weight each class by
# n_samples / (n_classes * count(class)) computed from the labels passed to fit(),
# which is the same formula compute_class_weight("balanced", ...) applies.
# Letting the estimator do it avoids building a NumPy array here: this cell does
# not import numpy, so referencing `np` would raise NameError on a fresh kernel.
model = LogisticRegression(max_iter=1000, class_weight="balanced")
model.fit(X_train_tfidf, y_train)

# Save model and vectorizer
with open("text_moderation_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print(" Model retrained and saved!")


 Model retrained and saved!


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

# Rebuild the vectoriser with default TF-IDF settings plus English stop-word removal,
# and fit it on the training text
vectorizer = TfidfVectorizer(stop_words="english")
X_train_vectorized = vectorizer.fit_transform(X_train)

# Refit the existing classifier object on the new feature matrix
model.fit(X_train_vectorized, y_train)

# Persist both artefacts; files already saved under these names are overwritten
for artifact, path in ((model, "text_moderation_model.pkl"), (vectorizer, "vectorizer.pkl")):
    with open(path, "wb") as f:
        pickle.dump(artifact, f)

# Pair every vocabulary term with the coefficient the model learned for it
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_[0]

# Rank terms by coefficient, largest first, and keep the 20 strongest
ranked_terms = sorted(zip(feature_names, coefs), key=lambda pair: pair[1], reverse=True)
top_offensive = ranked_terms[:20]

print("Top words classified as offensive:", top_offensive)



Top words classified as offensive: [('bitch', 16.31248768829093), ('bitches', 11.540111805522885), ('pussy', 9.715875128450381), ('hoes', 8.768403702056759), ('hoe', 6.936608045366642), ('faggot', 5.471119885875881), ('shit', 4.687399193140654), ('niggah', 4.3860162652348125), ('cunt', 4.2402923575806035), ('fucking', 4.186268818796362), ('fuck', 4.15218670353745), ('nigger', 4.06052354200833), ('fag', 3.9785558714820466), ('ass', 3.2346930039940642), ('pussies', 3.226810103266661), ('faggots', 3.048478272716155), ('nigga', 2.9644926651835757), ('retarded', 2.888932967386003), ('nigguh', 2.7991220273327775), ('niccas', 2.7747934560266567)]


In [None]:
import pandas as pd
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reload the dataset from disk
df = pd.read_csv("corrected_bass_ab.csv")

# Binary target: classes below 2 become 1 (Offensive), everything else 0 (Safe)
df["class"] = df["class"].lt(2).astype("int64")

# Recreate the 80/20 split with the same seed used elsewhere in this notebook.
# NOTE(review): the tweets are used here as loaded, without the clean_tweet
# preprocessing that the training cell applies before fitting — confirm intended.
raw_tweets = df["tweet"].astype(str)
binary_labels = df["class"]
X_train, X_test, y_train, y_test = train_test_split(
    raw_tweets, binary_labels, test_size=0.2, random_state=42
)

# Restore the saved classifier and vectoriser
# (pickle can execute code while loading; only load files this notebook wrote)
with open("text_moderation_model.pkl", "rb") as fh:
    model = pickle.load(fh)

with open("vectorizer.pkl", "rb") as fh:
    vectorizer = pickle.load(fh)

# Vectorise the held-out tweets and predict their labels
X_test_tfidf = vectorizer.transform(X_test)
y_pred = model.predict(X_test_tfidf)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f" Model Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1
print("\n🔹 Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix (rows: true class, columns: predicted class)
print("\n🔹 Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


 Model Accuracy: 0.9409

🔹 Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      1143
           1       0.98      0.94      0.96      3818

    accuracy                           0.94      4961
   macro avg       0.90      0.94      0.92      4961
weighted avg       0.95      0.94      0.94      4961


🔹 Confusion Matrix:
[[1087   56]
 [ 237 3581]]


In [21]:
import nltk
nltk.download('stopwords')
import pickle
import numpy as np
from nltk.corpus import stopwords

# Restore the saved classifier and vectoriser
# (pickle can execute code while loading; only load files this notebook wrote)
with open("text_moderation_model.pkl", "rb") as fh:
    model = pickle.load(fh)

with open("vectorizer.pkl", "rb") as fh:
    vectorizer = pickle.load(fh)

# NLTK's English stopword list as a set, for fast membership tests
stop_words = {*stopwords.words("english")}

# NOTE(review): this value is not read by moderate_sentence, which applies
# its own 0.6 cutoff — confirm which threshold is intended.
importance_threshold = 1.0  # Adjust as needed




# Function to remove stopwords from input text
def clean_text(sentence):
    """Return `sentence` with stopwords dropped.

    The sentence is split on whitespace; a token is discarded when its
    lower-cased form is in the module-level `stop_words` set. Tokens are
    compared exactly as split, so attached punctuation (e.g. "a,") prevents
    a match. The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in sentence.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Function to detect and censor offensive words
def moderate_sentence(sentence, threshold=0.6):
    """Censor the words of `sentence` that the model weights as offensive.

    Each whitespace-separated token is stripped of leading/trailing
    non-alphanumeric characters and lower-cased, then looked up in the fitted
    vectoriser's vocabulary. A token is censored when it is not a stopword and
    its coefficient in `model.coef_[0]` is strictly greater than `threshold`.
    Only the alphanumeric core is replaced by asterisks, so surrounding
    punctuation is kept ("bitch!" -> "*****!").

    Censoring is driven purely by per-word coefficients; no sentence-level
    prediction is made.

    Reads the module-level `vectorizer`, `model` and `stop_words`.

    Parameters
    ----------
    sentence : str
        Raw input text.
    threshold : float, default 0.6
        Minimum coefficient (exclusive) for a word to count as offensive.

    Returns
    -------
    tuple[str, list[str]]
        The censored sentence (tokens re-joined with single spaces) and the
        lower-cased offensive words in order of appearance (duplicates kept).
    """
    # term -> column index; a dict lookup replaces scanning the feature-name array
    vocabulary = vectorizer.vocabulary_
    word_importance = model.coef_[0]

    offensive_words = []
    censored_tokens = []
    for token in sentence.split():
        prefix, core, suffix = _split_edge_punctuation(token)
        # Lower-case for lookup so that e.g. "BITCH" matches a lower-cased vocabulary
        term = core.lower()
        column = None if term in stop_words else vocabulary.get(term)
        if column is not None and word_importance[column] > threshold:
            offensive_words.append(term)
            censored_tokens.append(prefix + "*" * len(core) + suffix)
        else:
            censored_tokens.append(token)

    return " ".join(censored_tokens), offensive_words


def _split_edge_punctuation(token):
    """Split `token` into (leading non-alphanumerics, core, trailing non-alphanumerics)."""
    start, end = 0, len(token)
    while start < end and not token[start].isalnum():
        start += 1
    while end > start and not token[end - 1].isalnum():
        end -= 1
    return token[:start], token[start:end], token[end:]

# Sample inputs for a quick manual check of the moderation function
test_sentences = [
    "I love you",
    "she is a bitch",
    "Shut up, you are a fucking hoe",
    "hello man",
]

print("\n🔹 Moderated Sentences:")
for sentence in test_sentences:
    moderated_sentence, offensive_words = moderate_sentence(sentence)
    # One print per sentence: three lines followed by a blank separator line
    print(
        f"Original: {sentence}",
        f"Moderated: {moderated_sentence}",
        f"Offensive Words: {offensive_words or 'None'}\n",
        sep="\n",
    )



🔹 Moderated Sentences:
Original: I love you
Moderated: I love you
Offensive Words: None

Original: she is a bitch
Moderated: she is a *****
Offensive Words: ['bitch']

Original: Shut up, you are a fucking hoe
Moderated: Shut up, you are a ******* ***
Offensive Words: ['fucking', 'hoe']

Original: hello man
Moderated: hello man
Offensive Words: None



[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\janhv\Desktop\minor1\.venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
