## ***Importation des bibliothèques***

In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix

# Add imports for Word2Vec
import gensim
from gensim.models import Word2Vec

In [2]:
# # Install XGBoost
# %pip install xgboost

# # Install TensorFlow
%pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


## ***Exploration des données***

In [3]:
df = pd.read_csv('/Users/hamzacharmaqe/Documents/ENSAM/S2/TextMining/Project/labeled_data.csv')

In [4]:
df.drop("Unnamed: 0", axis=1,inplace=True)

In [5]:
df.shape

(24783, 6)

In [6]:
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [7]:
df.tail()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
24778,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,6,0,6,0,1,youu got wild bitches tellin you lies
24782,3,0,0,3,2,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   count               24783 non-null  int64 
 1   hate_speech         24783 non-null  int64 
 2   offensive_language  24783 non-null  int64 
 3   neither             24783 non-null  int64 
 4   class               24783 non-null  int64 
 5   tweet               24783 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.1+ MB


In [9]:
df.drop(["hate_speech","offensive_language","neither"],axis=1,inplace=True)

In [10]:
df.head()

Unnamed: 0,count,class,tweet
0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [11]:
map_dic = {0:"hate_speech",1:"offensive_language",2:"neither"}
df["class"]=df["class"].map(map_dic)

In [12]:
df.head()

Unnamed: 0,count,class,tweet
0,3,neither,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,offensive_language,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,offensive_language,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,offensive_language,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,offensive_language,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


## ***Text Preprocessing***

### **Removing Stopwords**

In [13]:
def process(message):
    nopunc = [char for char in message if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    return ' '.join(clean)

In [14]:
#nltk.download('stopwords')
df["tweet"] = df["tweet"].apply(process)

In [15]:
df.head()

Unnamed: 0,count,class,tweet
0,3,neither,RT mayasolovely woman shouldnt complain cleani...
1,3,offensive_language,RT mleew17 boy dats coldtyga dwn bad cuffin da...
2,3,offensive_language,RT UrKindOfBrand Dawg RT 80sbaby4life ever fuc...
3,3,offensive_language,RT CGAnderson vivabased look like tranny
4,6,offensive_language,RT ShenikaRoberts shit hear might true might f...


In [16]:
df["tweet"] = df["tweet"].str.lower()

### **Stemming**

In [17]:
porter = nltk.PorterStemmer()
def stem(message):
    stemmedArr = [porter.stem(term) for term in message.split(" ")]
    return ' '.join(stemmedArr)

In [18]:
df["tweet"]=df["tweet"].apply(stem)

In [19]:
df.tail()

Unnamed: 0,count,class,tweet
24778,3,offensive_language,you muthafin lie 8220lifeask 20pearl coreyeman...
24779,3,neither,youv gone broke wrong heart babi drove redneck...
24780,3,offensive_language,young buck wanna eat dat nigguh like aint fuck...
24781,6,offensive_language,youu got wild bitch tellin lie
24782,3,neither,ruffl ntac eileen dahlia beauti color combin p...


## ***Feature Extraction***

In [20]:
#X = df.drop("class",axis=1)
#y = df["class"]

In [21]:
#X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

train_set, test_set = train_test_split(df, test_size=0.3, random_state=42)

In [22]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, test_index in split.split(df, df["class"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

In [23]:
X_strat_train_set = strat_train_set.drop("class",axis=1)
y_strat_train_set = strat_train_set["class"]
X_strat_test_set = strat_test_set.drop("class",axis=1)
y_strat_test_set = strat_test_set["class"]

In [24]:
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_strat_train_set.tweet)

In [25]:
X_train_count.shape

(17348, 28160)

In [26]:
X_test_count = count_vect.transform(X_strat_test_set.tweet)

In [27]:
X_test_count.shape

(7435, 28160)

## ***Continuous Bag of Words (CBOW)***

In [28]:
# Replace the "TF-IDF (term frequency-inverse document frequency)" section with:

## ***Continuous Bag of Words (CBOW)***

# First, we need to tokenize our tweets to prepare for CBOW
def tokenize_tweet(tweet):
    return tweet.split()

# Tokenize the training and test set tweets
X_strat_train_set['tokens'] = X_strat_train_set['tweet'].apply(tokenize_tweet)
X_strat_test_set['tokens'] = X_strat_test_set['tweet'].apply(tokenize_tweet)

# Create and train CBOW model (Word2Vec with sg=0)
# sg=0 means CBOW model, sg=1 would be Skip-gram
cbow_model = Word2Vec(
    sentences=X_strat_train_set['tokens'],
    vector_size=100,    # Dimension of word vectors
    window=5,           # Context window size
    min_count=1,        # Ignore words that appear less than this
    workers=4,          # Number of processors to use
    sg=0                # 0 = CBOW, 1 = Skip-gram
)

print(f"Vocabulary size: {len(cbow_model.wv.key_to_index)}")

# Function to create document vectors by averaging word vectors
def document_vector(tokens, model):
    # Filter tokens that are in the vocabulary
    valid_tokens = [token for token in tokens if token in model.wv]
    
    if not valid_tokens:
        # Return zeros if no tokens are in vocabulary
        return np.zeros(model.vector_size)
    
    # Calculate the mean of all word vectors in the document
    return np.mean([model.wv[token] for token in valid_tokens], axis=0)

# Create document vectors for training and test sets
X_train_cbow = np.array([document_vector(tokens, cbow_model) for tokens in X_strat_train_set['tokens']])
X_test_cbow = np.array([document_vector(tokens, cbow_model) for tokens in X_strat_test_set['tokens']])

print(f"Training vectors shape: {X_train_cbow.shape}")
print(f"Test vectors shape: {X_test_cbow.shape}")

Vocabulary size: 28189
Training vectors shape: (17348, 100)
Test vectors shape: (7435, 100)


## ***Multinomial Naive Bais Model***

In [29]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB().fit(X_train_cbow, y_strat_train_set)

# For testing with new tweets
def preprocess_and_vectorize(tweet_text):
    processed = process(tweet_text)
    processed = processed.lower()
    processed = stem(processed)
    tokens = processed.split()
    return document_vector(tokens, cbow_model).reshape(1, -1)

In [30]:
tweet_test = ["What a beautifull day summer"]
tweet_test_vector = preprocess_and_vectorize(tweet_test[0])
predicted = clf.predict(tweet_test_vector)

In [31]:
print(predicted)

['offensive_language']


In [32]:
predicted = clf.predict(X_test_cbow)
print(f"Accuracy: {np.mean(predicted == y_strat_test_set)}")

Accuracy: 0.714862138533961


## ***Logistic Regression Model***

In [33]:
reg_clf = LogisticRegression().fit(X_train_cbow, y_strat_train_set)

In [34]:
predicted = reg_clf.predict(X_test_cbow)
print(f"Accuracy: {np.mean(predicted == y_strat_test_set)}")

Accuracy: 0.8329522528581036


## ***Support Vector Machine Classifier***

In [35]:
svc_clf = SVC().fit(X_train_cbow, y_strat_train_set)

In [36]:
predicted = svc_clf.predict(X_test_cbow)
print(f"Accuracy: {np.mean(predicted == y_strat_test_set)}")

Accuracy: 0.7839946200403497


## ***Gaussiane Naive Bias***

In [37]:
gnb_clf = GaussianNB().fit(X_train_cbow, y_strat_train_set)

In [38]:
predicted = gnb_clf.predict(X_test_cbow)
print(f"Accuracy: {np.mean(predicted == y_strat_test_set)}")

Accuracy: 0.714862138533961


### ***Random Forest Classifier***


In [39]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest model
rf_clf = RandomForestClassifier(
    n_estimators=200,  # Number of trees in the forest
    max_depth=None,    # Maximum depth of the trees
    min_samples_split=5,
    random_state=42
)

# Train the model
rf_clf.fit(X_train_cbow, y_strat_train_set)

# Make predictions on test set
predicted = rf_clf.predict(X_test_cbow)

# Calculate and display accuracy
print(f"Accuracy: {np.mean(predicted == y_strat_test_set)}")

Accuracy: 0.8425016812373907


## ***XGBoost Classifier***

In [44]:
# %pip install xgboost


In [45]:
## ***XGBoost Classifier***

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Encode the string labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_strat_train_set)
y_test_encoded = label_encoder.transform(y_strat_test_set)

# Create and train the XGBoost model
xgb_clf = XGBClassifier(
    n_estimators=100,   # Number of boosting rounds
    learning_rate=0.1,  # Step size shrinkage used to prevent overfitting
    max_depth=5,        # Maximum depth of a tree
    random_state=42
)

# Train the model
xgb_clf.fit(X_train_cbow, y_train_encoded)

# Make predictions on test set
predicted_encoded = xgb_clf.predict(X_test_cbow)
predicted = label_encoder.inverse_transform(predicted_encoded)

# Calculate and display accuracy
print(f"Accuracy: {np.mean(predicted == y_strat_test_set)}")

Accuracy: 0.8443846671149966


## ***Convolutional Neural Network (CNN)***

In [47]:
## ***Convolutional Neural Network (CNN)***

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Encode labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_strat_train_set)
y_test_encoded = le.transform(y_strat_test_set)
num_classes = len(le.classes_)
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)

# Create a simple neural network (instead of CNN since our data is already vectorized)
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_cbow.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(
    X_train_cbow, 
    y_train_categorical,
    epochs=5,
    batch_size=32,
    verbose=1
)

# Evaluate model
loss, accuracy = model.evaluate(X_test_cbow, y_test_categorical, verbose=0)
print(f"Neural Network Accuracy: {accuracy:.4f}")

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 577us/step - accuracy: 0.7553 - loss: 0.6585
Epoch 2/5
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 537us/step - accuracy: 0.7885 - loss: 0.5838
Epoch 3/5
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 559us/step - accuracy: 0.8088 - loss: 0.5404
Epoch 4/5
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 620us/step - accuracy: 0.8246 - loss: 0.5083
Epoch 5/5
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499us/step - accuracy: 0.8236 - loss: 0.5041
Neural Network Accuracy: 0.8272
