# In [2]:
from sklearn.model_selection import train_test_split
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier  # Import GBM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

# Load the dataset from 'mycsv.csv' (a relative path, so it is resolved against
# the current working directory). The rest of the script reads the
# 'bullying_words' (text) and 'type_bully' (label) columns from this frame.
df = pd.read_csv('mycsv.csv')

# Translation table that deletes every ASCII punctuation character. Built once
# so each call strips punctuation in a single pass instead of one
# str.replace() per punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_pun(text):
    """Return *text* lowercased with all ASCII punctuation removed.

    Args:
        text: The value to clean. Normally a ``str``; missing values
            (``None`` or a float ``NaN``, which is what pandas yields for
            empty CSV cells) are treated as empty text, and any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace, digits and non-ASCII characters are
        left untouched.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return text.translate(_PUNCTUATION_TABLE).lower()

# Normalize the raw text column (strip punctuation, lowercase) before it is
# vectorized.
df['bullying_words'] = df['bullying_words'].apply(remove_pun)

# Split the data into features (X) and target (Y)
X = df['bullying_words']
Y = df['type_bully']

# Hold out 20% of the rows for evaluation. The split is seeded so that the
# accuracy and confusion matrix printed below are reproducible from run to run
# (the classifier is already seeded with the same value).
# NOTE(review): if the classes are imbalanced, consider stratify=Y so the
# small test set keeps the same class proportions — confirm every class has at
# least two rows first, otherwise train_test_split raises ValueError.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Vectorize the text as TF-IDF over character unigrams and bigrams. The
# vectorizer is fitted on the training split only and then applied to the test
# split, so no vocabulary/IDF statistics leak from the held-out rows.
vec = TfidfVectorizer(ngram_range=(1, 2), analyzer='char')
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Create and fit a Gradient Boosting Machine model (100 boosting stages,
# fixed seed for reproducibility).
model_gbm = GradientBoostingClassifier(n_estimators=100, random_state=42)
model_gbm.fit(X_train_vec, Y_train)

# Predict labels for the held-out rows
predict_val = model_gbm.predict(X_test_vec)

# Accuracy on the held-out rows, reported as a percentage
accuracy = metrics.accuracy_score(Y_test, predict_val) * 100
print(f'Accuracy: {accuracy:.2f}%')

# Confusion matrix: rows are the true labels, columns the predicted labels
conf_matrix = metrics.confusion_matrix(Y_test, predict_val)
print('Confusion Matrix:')
print(conf_matrix)

# Output of a recorded run:
# Accuracy: 75.00%
# Confusion Matrix:
# [[10  3]
#  [ 1  2]]
