## ***Importation des bibliothèques***

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
import html
import re

## ***Exploration des données***

In [2]:
df = pd.read_csv("labeled_data.csv")

In [3]:
df.drop("Unnamed: 0", axis=1,inplace=True)

In [4]:
df.shape

(24783, 6)

In [5]:
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
df.tail()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
24778,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,6,0,6,0,1,youu got wild bitches tellin you lies
24782,3,0,0,3,2,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   count               24783 non-null  int64 
 1   hate_speech         24783 non-null  int64 
 2   offensive_language  24783 non-null  int64 
 3   neither             24783 non-null  int64 
 4   class               24783 non-null  int64 
 5   tweet               24783 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.1+ MB


In [8]:
df.drop(["hate_speech","offensive_language","neither"],axis=1,inplace=True)

In [9]:
df.head()

Unnamed: 0,count,class,tweet
0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [10]:
map_dic = {0:"hate_speech",1:"offensive_language",2:"neither"}
df["class"]=df["class"].map(map_dic)

In [11]:
df.head()

Unnamed: 0,count,class,tweet
0,3,neither,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,offensive_language,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,offensive_language,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,offensive_language,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,offensive_language,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


## ***Text Preprocessing***

### **Removing Stopwords**

In [12]:
def process(message):
    nopunc = [char for char in message if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    return ' '.join(clean)

In [13]:
df["tweet"] = df["tweet"].apply(process)

In [14]:
def clean_tweet(tweet):
    # Unescape HTML entities (e.g., &amp; -> &, &#128536; -> emoji)
    tweet = html.unescape(tweet)
    # Remove mentions (@username)
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove hashtags (#hashtag)
    tweet = re.sub(r'#\w+', '', tweet)
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # Remove emojis and non-ASCII characters
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')
    # Remove special characters, punctuation (except sentence enders), and digits
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)
    # Collapse multiple spaces into one
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet

In [15]:
df['tweet'] = df['tweet'].apply(clean_tweet)

In [16]:
df.head()

Unnamed: 0,count,class,tweet
0,3,neither,RT mayasolovely woman shouldnt complain cleani...
1,3,offensive_language,RT mleew boy dats coldtyga dwn bad cuffin dat ...
2,3,offensive_language,RT UrKindOfBrand Dawg RT sbabylife ever fuck b...
3,3,offensive_language,RT CGAnderson vivabased look like tranny
4,6,offensive_language,RT ShenikaRoberts shit hear might true might f...


In [17]:
df["tweet"] = df["tweet"].str.lower()

### **Stemming**

In [18]:
porter = nltk.PorterStemmer()
def stem(message):
    stemmedArr = [porter.stem(term) for term in message.split(" ")]
    return ' '.join(stemmedArr)

In [19]:
df["tweet"]=df["tweet"].apply(stem)

In [20]:
df.tail()

Unnamed: 0,count,class,tweet
24778,3,offensive_language,you muthafin lie lifeask pearl coreyemanuel ri...
24779,3,neither,youv gone broke wrong heart babi drove redneck...
24780,3,offensive_language,young buck wanna eat dat nigguh like aint fuck...
24781,6,offensive_language,youu got wild bitch tellin lie
24782,3,neither,ruffl ntac eileen dahlia beauti color combin p...


## ***Feature Extraction***

In [21]:
X = df.drop("class",axis=1)
y = df["class"]

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)


In [23]:
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train.tweet)

In [24]:
X_train_count.shape

(17348, 22749)

In [25]:
X_test_count = count_vect.transform(X_test.tweet)

In [26]:
X_test_count.shape

(7435, 22749)

## ***TF-IDF (term frequency-inverse document frequency)***

In [27]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
X_train_tfidf.shape

(17348, 22749)

In [28]:
X_test_tfidf = tfidf_transformer.transform(X_test_count)
X_test_tfidf.shape

(7435, 22749)

## ***Multinomial Naive Bais***

In [29]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [30]:
tweet_test = ["What a beautifull day summer"]
tweet_test_count = count_vect.transform(tweet_test)
tweet_test_tfidf = tfidf_transformer.transform(tweet_test_count)
predicted = clf.predict(tweet_test_tfidf)

In [31]:
predicted

array(['offensive_language'], dtype='<U18')

In [32]:
predicted = clf.predict(X_test_tfidf)
np.mean(predicted == y_test)

0.792871553463349

## ***Logistic Regression***

In [33]:
reg_clf = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train_tfidf, y_train)

In [34]:
predicted = reg_clf.predict(X_test_tfidf)
np.mean(predicted == y_test)

0.8572965702757229

In [53]:
tweet_test = ["I don't like black people"]
tweet_test_count = count_vect.transform(tweet_test)
tweet_test_tfidf = tfidf_transformer.transform(tweet_test_count)
predicted = reg_clf.predict(tweet_test_tfidf)
predicted

array(['hate_speech'], dtype=object)

## ***Support Vector Machine Classifier***

In [36]:
svc_clf = SVC(kernel='linear', class_weight='balanced', probability=True).fit(X_train_tfidf, y_train)

In [37]:
predicted = svc_clf.predict(X_test_tfidf)
np.mean(predicted == y_test)

0.8706119704102219

In [38]:
tweet_test = ["youu got wild bitch tellin lie"]
tweet_test_count = count_vect.transform(tweet_test)
tweet_test_tfidf = tfidf_transformer.transform(tweet_test_count)
predicted = svc_clf.predict(tweet_test_tfidf)
print(predicted[0])

offensive_language


## ***Gaussiane Naive Bias***

In [39]:
gnb_clf = GaussianNB().fit(X_train_tfidf.toarray(), y_train)

In [40]:
predicted = gnb_clf.predict(X_test_tfidf.toarray())
np.mean(predicted == y_test)

0.5047747141896436