In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import spacy

# Load the dataset
df = pd.read_csv("csv.csv")

# Under sampling taking random sample
min_sample = 50
df_bully = df[df.type_bully == "bully"].sample(min_sample, random_state=40)
df_non_bully = df[df.type_bully == "non-bully"].sample(min_sample, random_state=40)

# Combine the balanced dataframes
df_balanced = pd.concat([df_bully, df_non_bully], axis=0)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    df_balanced.bullying_words,
    df_balanced.type_bully,
    test_size=0.2,
    random_state=40,
    stratify=df_balanced.type_bully
)

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

# Define the preprocess function to handle NaN values
def preprocess(bullying_words):
    if pd.notnull(bullying_words):  
        doc = nlp(bullying_words)
        filtered_tokens = []
        for token in doc:
            if not token.is_stop and not token.is_punct and not token.like_num:
                filtered_tokens.append(token.lemma_)
        return ' '.join(filtered_tokens)
    else:
        return ''  

# Apply the preprocess function to create a new column 'processed_text'
df_balanced['processed_text'] = df_balanced['bullying_words'].apply(preprocess)

# Create a pipeline with TfidfVectorizer and LogisticRegression
lejone_tfidf = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 3))),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Fit the pipeline on the training data
lejone_tfidf.fit(X_train, Y_train)

# Make predictions on the testing data
Y_pred_tfidf = lejone_tfidf.predict(X_test)

# Evaluate the TF-IDF based model
print(classification_report(Y_test, Y_pred_tfidf))


              precision    recall  f1-score   support

       bully       0.69      0.90      0.78        10
   non-bully       0.86      0.60      0.71        10

    accuracy                           0.75        20
   macro avg       0.77      0.75      0.74        20
weighted avg       0.77      0.75      0.74        20



In [3]:
from sklearn import metrics
metrics.accuracy_score(Y_test, Y_pred_tfidf)*100

75.0