## ***Importation des bibliothèques***

In [63]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--

In [61]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import csr_matrix
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from collections import Counter
import re
import random
import matplotlib.pyplot as plt
import seaborn as sns
import html
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences


ModuleNotFoundError: No module named 'xgboost'

## ***Exploration des données***

In [21]:
df = pd.read_csv("labeled_data.csv")

ERROR! Session/line number was not unique in database. History logging moved to new session 345


In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [23]:
df.drop(["Unnamed: 0","count","hate_speech","offensive_language","neither"],axis=1,inplace=True)

In [24]:
df.tail()

Unnamed: 0,class,tweet
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies
24782,2,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...


In [25]:
map_dic = {0:"hate_speech",1:"offensive_language",2:"neither"}
df["class"]=df["class"].map(map_dic)

## ***Data Cleaning***

In [26]:
def process(message):
    nopunc = [char for char in message if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    return ' '.join(clean)

In [27]:
df["tweet"] = df["tweet"].apply(process)

In [28]:
def clean_tweet(tweet):
    # Unescape HTML entities (e.g., &amp; -> &, &#128536; -> emoji)
    tweet = html.unescape(tweet)
    # Remove mentions (@username)
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove hashtags (#hashtag)
    tweet = re.sub(r'#\w+', '', tweet)
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # Remove emojis and non-ASCII characters
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')
    # Remove special characters, punctuation (except sentence enders), and digits
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)
    # Collapse multiple spaces into one
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet

In [29]:
df['tweet'] = df['tweet'].apply(clean_tweet)

In [30]:
df["tweet"] = df["tweet"].str.lower()

In [31]:
df

Unnamed: 0,class,tweet
0,neither,rt mayasolovely woman shouldnt complain cleani...
1,offensive_language,rt mleew boy dats coldtyga dwn bad cuffin dat ...
2,offensive_language,rt urkindofbrand dawg rt sbabylife ever fuck b...
3,offensive_language,rt cganderson vivabased look like tranny
4,offensive_language,rt shenikaroberts shit hear might true might f...
...,...,...
24778,offensive_language,yous muthafin lie lifeasking pearls coreyemanu...
24779,neither,youve gone broke wrong heart baby drove rednec...
24780,offensive_language,young buck wanna eat dat nigguh like aint fuck...
24781,offensive_language,youu got wild bitches tellin lies


## ***Stemming***

In [32]:
porter = nltk.PorterStemmer()
def stem(message):
    stemmedArr = [porter.stem(term) for term in message.split(" ")]
    return ' '.join(stemmedArr)

In [33]:
df["tweet"]=df["tweet"].apply(stem)

In [34]:
df.head()

Unnamed: 0,class,tweet
0,neither,rt mayasolov woman shouldnt complain clean hou...
1,offensive_language,rt mleew boy dat coldtyga dwn bad cuffin dat h...
2,offensive_language,rt urkindofbrand dawg rt sbabylif ever fuck bi...
3,offensive_language,rt cganderson vivabas look like tranni
4,offensive_language,rt shenikarobert shit hear might true might fa...


# **Continuous Skip-gram model**

## ***Build Vocabulary and Generate Skip-Gram Pairs***

In [35]:
texts = df["tweet"]

In [36]:
words = ' '.join(texts).split()
vocab = Counter(words)
vocab_size = 10000  # top 10,000 words
most_common = vocab.most_common(vocab_size-1)

# Map words to integers
word2idx = {w: i+1 for i, (w, _) in enumerate(most_common)}
idx2word = {i: w for w, i in word2idx.items()}

def tokenize(text):
    return [word2idx[word] for word in text.split() if word in word2idx]

tokenized_texts = [tokenize(t) for t in texts]

# Generate skip-gram pairs
def generate_skip_gram_pairs(tokenized_sentences, window_size=2):
    pairs = []
    for tokens in tokenized_sentences:
        for idx, center_word in enumerate(tokens):
            context_range = list(range(max(0, idx - window_size), idx)) + \
                            list(range(idx + 1, min(len(tokens), idx + window_size + 1)))
            for context_idx in context_range:
                context_word = tokens[context_idx]
                pairs.append((center_word, context_word))
    return pairs

pairs = generate_skip_gram_pairs(tokenized_texts)

## ***Prepare Data for TensorFlow***

In [37]:
def generate_batch(pairs, batch_size):
    batch = random.sample(pairs, batch_size)
    x = np.array([i for i, j in batch])
    y = np.array([[j] for i, j in batch])
    return x, y

## ***Define Skip-Gram Model***

In [38]:
embedding_dim = 128

class SkipGramModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embed = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        x = self.embed(x)
        x = self.dense(x)
        return x

model = SkipGramModel(vocab_size, embedding_dim)
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))


## ***Train the Model***

In [39]:
# Training parameters
#epochs = 5
#batch_size = 512
#steps_per_epoch = 1000

#for epoch in range(epochs):
#    epoch_loss = 0
#    for step in range(steps_per_epoch):
#        x_batch, y_batch = generate_batch(pairs, batch_size)
#        loss = model.train_on_batch(x_batch, y_batch)
#        epoch_loss += loss
#    print(f'Epoch {epoch+1}, Loss: {epoch_loss / steps_per_epoch}')


## ***Visualize Word Embeddings In Two Dimensions***

In [40]:
# Get weights
embeddings = np.load('skipgram_embeddings.npy')
embeddings = model.embed.get_weights()[0]

# Pick a few words to visualize
words_to_visualize = ['hate', 'love', 'kill', 'peace', 'fight']
word_ids = [word2idx[w] for w in words_to_visualize if w in word2idx]

for i, word_id in enumerate(word_ids):
    vec = embeddings[word_id]
    plt.scatter(vec[0], vec[1])
    plt.text(vec[0], vec[1], words_to_visualize[i])
plt.title('2D projection (only 2 dims shown)')
plt.show()

IndexError: list index out of range

In [42]:
print(embeddings.shape)
print(embeddings[0])

(10000, 128)
[-0.02848216  0.00400998 -0.03258134 -0.04974265  0.00207678  0.02484879
  0.03113591 -0.00340078  0.00696534 -0.00292827 -0.04801117  0.02816404
 -0.01797894 -0.0195039  -0.01075071 -0.04995915  0.02681008 -0.00748922
 -0.02029742 -0.01303394 -0.03315737 -0.04590987 -0.01173951 -0.04502933
  0.00965339  0.00400305  0.04076019 -0.01129614 -0.01013052  0.03318813
 -0.03019115  0.04862919  0.03768841 -0.0138522   0.04247582 -0.02034836
 -0.0272624   0.02495494  0.02386754 -0.0397911   0.0330441  -0.00070839
  0.03407531 -0.02472837 -0.00889399 -0.00569149  0.0381316   0.03481778
  0.01629115  0.03324561 -0.01976572 -0.03163081 -0.03177797  0.0226329
  0.02272067 -0.01123648  0.04049429  0.00278439 -0.02448131  0.03498909
  0.04240072  0.04180335  0.04316416  0.032443   -0.00954914 -0.04092628
  0.00171892 -0.04406951  0.03671697  0.01783389  0.00139958  0.00457359
 -0.00527228  0.03302393 -0.04062376  0.01317779 -0.02013574  0.02377847
  0.04156587 -0.00905819  0.02947456 -0

## ***Save Embeddings***

In [None]:
#np.save('skipgram_embeddings.npy', embeddings)

## ***Evaluate Word Similarity***

In [43]:
def most_similar(word, top_n=5):
    if word not in word2idx:
        return []
    word_vec = embeddings[word2idx[word]]
    similarities = np.dot(embeddings, word_vec) / (
        np.linalg.norm(embeddings, axis=1) * np.linalg.norm(word_vec) + 1e-10
    )
    nearest = np.argsort(similarities)[-top_n-1:-1][::-1]
    return [(idx2word.get(i, 'UNK'), similarities[i]) for i in nearest]

print(most_similar('hate'))


[('dislik', 0.69293946), ('nofakingg', 0.68656343), ('tiemyduragboo', 0.6669945), ('dumbest', 0.6529794), ('mayormcginn', 0.64533913)]


## ***Prepare Sentence Vectors***

In [44]:
# Function to convert sentence to embedding by averaging word vectors
def sentence_to_vec(sentence, word2idx, embeddings):
    tokens = [word2idx[word] for word in sentence.split() if word in word2idx]
    if not tokens:
        return np.zeros(embeddings.shape[1])  # return zero vector for empty case
    return np.mean(embeddings[tokens], axis=0)


In [45]:
# Apply to all samples
df['vector'] = df['tweet'].apply(lambda x: sentence_to_vec(x, word2idx, embeddings))
X = np.vstack(df['vector'].values)
y = df['class'].values

## ***Split The Data Into Train and Test Sets***

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## ***Logistic Regression***

In [47]:
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg.fit(X_train, y_train)

In [48]:
y_pred = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7908540685944856

Classification Report:
                     precision    recall  f1-score   support

       hate_speech       0.22      0.65      0.32       429
           neither       0.73      0.87      0.80      1249
offensive_language       0.97      0.78      0.87      5757

          accuracy                           0.79      7435
         macro avg       0.64      0.77      0.66      7435
      weighted avg       0.89      0.79      0.82      7435



## ***Support Vector Machine Classifier***

In [49]:
svc = SVC(kernel='linear', class_weight='balanced', probability=True).fit(X_train, y_train)

In [50]:
y_pred = svc.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7905850706119704

Classification Report:
                     precision    recall  f1-score   support

       hate_speech       0.21      0.65      0.32       429
           neither       0.73      0.88      0.80      1249
offensive_language       0.97      0.78      0.87      5757

          accuracy                           0.79      7435
         macro avg       0.64      0.77      0.66      7435
      weighted avg       0.89      0.79      0.82      7435



## ***Gaussian Naive Bais***

In [52]:
gnb_clf = GaussianNB().fit(X_train, y_train)

In [53]:
y_pred = gnb_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8162743779421654

Classification Report:
                     precision    recall  f1-score   support

       hate_speech       0.23      0.46      0.31       429
           neither       0.70      0.78      0.74      1249
offensive_language       0.94      0.85      0.89      5757

          accuracy                           0.82      7435
         macro avg       0.63      0.69      0.65      7435
      weighted avg       0.86      0.82      0.83      7435



## ***Random Forrest***

In [None]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [None]:
rf = RandomForestClassifier(random_state=42)

In [None]:
kf = KFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=kf,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

## ***LSTM (Long short-term memory)***

In [54]:
embedding_matrix = embeddings  # shape: (vocab_size, embedding_dim)
vocab_size, embedding_dim = embedding_matrix.shape

In [55]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))  # remove input_length
model.add(LSTM(units=128))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

map_dic = {"hate_speech": 0, "offensive_language": 1, "neither": 2}
y_mapped = np.vectorize(map_dic.get)(y_train)

model.fit(X_train, y_mapped, epochs=5, batch_size=32, validation_split=0.1)
model.summary()

Epoch 1/5
[1m488/488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 125ms/step - accuracy: 0.7720 - loss: -2.4184 - val_accuracy: 0.7862 - val_loss: -5.9948
Epoch 2/5
[1m488/488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 120ms/step - accuracy: 0.7730 - loss: -7.3478 - val_accuracy: 0.7862 - val_loss: -10.4547
Epoch 3/5
[1m488/488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 120ms/step - accuracy: 0.7695 - loss: -12.7683 - val_accuracy: 0.7862 - val_loss: -14.8780
Epoch 4/5
[1m488/488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 118ms/step - accuracy: 0.7764 - loss: -16.9042 - val_accuracy: 0.7862 - val_loss: -19.3305
Epoch 5/5
[1m488/488[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 107ms/step - accuracy: 0.7699 - loss: -22.4744 - val_accuracy: 0.7862 - val_loss: -23.7308


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# New input text
text = ["I hate deepseek generation code"]

# Preprocess: tokenize and pad
seq = sentence_to_vec(text,word2idx,embeddings)
#padded = pad_sequences(seq, maxlen=max_len)  # use same `max_len` used in training

# Predict
pred = model.predict(seq)

# Decode the predicted class
class_names = ["hate_speech", "offensive_language", "neither"]
predicted_class = class_names[np.argmax(pred)]

print(f"Predicted class: {predicted_class}")

## ***XGBoost Classifier***

In [68]:
## ***XGBoost Classifier***
from sklearn.preprocessing import LabelEncoder

# Encode the string labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
from xgboost import XGBClassifier
# Create and train the XGBoost model
xgb_clf = XGBClassifier(
    n_estimators=100,   # Number of boosting rounds
    learning_rate=0.1,  # Step size shrinkage used to prevent overfitting
    max_depth=5,        # Maximum depth of a tree
    random_state=42
)

# Train the model
xgb_clf.fit(X_train, y_train_encoded)

# Make predictions on test set
predicted_encoded = xgb_clf.predict(X_test)
predicted = label_encoder.inverse_transform(predicted_encoded)

# Calculate and display accuracy
print(f"Accuracy: {np.mean(predicted == y_test)}")

Accuracy: 0.8831203765971756
