In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [47]:
# Load the labelled comments dataset.
# Rows missing either the comment text or its label are dropped at load time:
# TfidfVectorizer rejects NaN documents, and a row without a label cannot be
# used for training or scoring, so such rows would otherwise surface later as
# a much less obvious error. On a complete dataset this is a no-op.
df = pd.read_csv('sus_comm.csv').dropna(subset=['comments', 'tagging'])

# Features are the raw comment strings; the target is the 'tagging' column.
X = df['comments']
y = df['tagging']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)


In [48]:
# Build the TF-IDF vectorizer: English stop words are removed and the
# vocabulary is capped at the 1000 top-ranked terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)

# Learn the vocabulary / IDF weights from the training comments only, then
# reuse that fitted state to project the test comments (no re-fitting, so no
# information from the test set leaks into the features).
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [49]:
# Build a multinomial Naive Bayes classifier and fit it on the TF-IDF
# training matrix in one step (fit() returns the fitted estimator itself).
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test matrix.
y_pred = clf.predict(X_test_tfidf)

# Overall accuracy on the test split, reported to two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1 breakdown for the same predictions.
print(classification_report(y_test, y_pred))


Accuracy: 0.72
              precision    recall  f1-score   support

           0       0.71      0.91      0.80      2453
           1       0.74      0.42      0.54      1548

    accuracy                           0.72      4001
   macro avg       0.73      0.67      0.67      4001
weighted avg       0.72      0.72      0.70      4001



In [52]:
# Score a couple of unseen sentences with the already-fitted vectorizer and
# classifier. The vectorizer is only applied here (transform), never re-fitted.
new_text = ["I will fuck you.", "This is a good girl."]
new_text_tfidf = tfidf_vectorizer.transform(new_text)
new_predictions = clf.predict(new_text_tfidf)

print("Predictions for new text:")
for text, prediction in zip(new_text, new_predictions):
    # A predicted label of 1 is reported as suspicious; anything else is not.
    if prediction == 1:
        label = "Suspicious"
    else:
        label = "Not Suspicious"
    print(f'Text: "{text}" => Prediction: {label}')

Predictions for new text:
Text: "You are a damn fucker." => Prediction: Not Suspicious
Text: "This is a good girl." => Prediction: Not Suspicious
