In [1]:
# ðŸš€ STEP 0 â€” Install Required Libraries
!pip install sentence-transformers scikit-learn pandas joblib



In [2]:
# ðŸš€ STEP 1 â€” Load Dataset
import pandas as pd

df = pd.read_csv("train.csv")

print(df.head())
print(df.label.value_counts())


                     text          label
0  Thank you so much bhai  not offensive
1            bhen ke lode      offensive
2            bh3n k3 l0d3      offensive
3         madarchod saale      offensive
4         madrch0d harami      offensive
label
offensive        695
not offensive    671
Name: count, dtype: int64


In [3]:
# ðŸš€ STEP 2 â€” Basic Cleaning (Minimal Needed)
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["clean_text"] = df["text"].apply(clean_text)


In [4]:
# ðŸš€ STEP 3 â€” Encode Labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["label_id"] = le.fit_transform(df["label"])

le.classes_, df.head()


(array(['not offensive', 'offensive'], dtype=object),
                      text          label              clean_text  label_id
 0  Thank you so much bhai  not offensive  thank you so much bhai         0
 1            bhen ke lode      offensive            bhen ke lode         1
 2            bh3n k3 l0d3      offensive            bh3n k3 l0d3         1
 3         madarchod saale      offensive         madarchod saale         1
 4         madrch0d harami      offensive         madrch0d harami         1)

In [5]:
# ðŸš€ STEP 4 â€” Generate Embeddings (bge-m3 mini model)
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-en-v1.5")   # light + accurate

embeddings = model.encode(df["clean_text"].tolist(), show_progress_bar=True)


Batches:   0%|          | 0/43 [00:00<?, ?it/s]

In [6]:
# ðŸš€ STEP 5 â€” Train SVM Classifier
from sklearn.svm import SVC

svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(embeddings, df["label_id"])


In [7]:
# ðŸš€ STEP 6 â€” Evaluate Accuracy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    embeddings, df["label_id"], test_size=0.2, random_state=42
)

svm_temp = SVC(kernel="linear", probability=True)
svm_temp.fit(X_train, y_train)

preds = svm_temp.predict(X_test)
acc = accuracy_score(y_test, preds)

print("Accuracy:", acc)


Accuracy: 0.9744525547445255


In [8]:
# ðŸš€ STEP 7 â€” Save Model + Label Encoder
import joblib

joblib.dump(svm_model, "svm_embeddings_model.joblib")
joblib.dump(le, "label_encoder.joblib")
model.save("embedding_model")  # saves embedding model folder


In [9]:
test_text = input("Enter Exapamples : ")

clean = clean_text(test_text)
embed = model.encode([clean])
pred_id = svm_model.predict(embed)[0]
pred_label = le.inverse_transform([pred_id])[0]

print("Prediction:", pred_label)


Enter Exapamples :  aniket bada admi hai re baba pr chutiya be hai


Prediction: offensive
