<a href="https://colab.research.google.com/github/le-Mon94/HateOffensiveSpeech-Detector-FromAudio/blob/main/Detector_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Train Model sklearn linearSVC**

## **Importing Libraries and stuff**

In [1]:
# Import LinearSVC Stuff

import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm_notebook
from sklearn.model_selection import train_test_split

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **Dataset**

In [2]:
# Importing Dataset

url = 'https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv'
df = pd.read_csv(url)
del df['Unnamed: 0']

### **Downsampling**

In [3]:
import random

original_offensive_df = df[df['class'] == 1]

# Get the indices of examples to be removed
num_examples_to_remove = len(original_offensive_df) - 1200
indices_to_remove = random.sample(list(original_offensive_df.index), num_examples_to_remove)

# Remove the selected examples
downsampled_offensive_df = original_offensive_df.drop(indices_to_remove)

# Update the original DataFrame with the downsampled class 1 data
df = pd.concat([df[df['class'] != 1], downsampled_offensive_df])

# Shuffle the DataFrame to ensure randomness
df = df.sample(frac=1).reset_index(drop=True)

### **Dataset**

In [4]:
df.head(20)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,6,1,1,4,2,Conan Obrien needs to get a life with that $80...
1,3,2,1,0,0,Bitch you're the ugliest cunt ever.
2,3,0,0,3,2,@simonslives @recordstoreday cheers dude might...
3,3,2,1,0,0,"@leetreble_ quit bitching faggot, happy you ca..."
4,3,0,0,3,2,RT @jpayneancap: @A_M_Perez @sybilll @DianneWi...
5,3,0,3,0,1,"She like ""bitch you know I got next, yasss"" ht..."
6,3,0,0,3,2,"Charlie Crist, outlawed, hated, rediculed and ..."
7,3,0,0,3,2,RT @ConservativeLA: Let the Dem concern-trolli...
8,3,0,1,2,2,Even though Lil Wayne's music has turned into ...
9,3,0,3,0,1,RT @heypoopbutt: Show me your pussy.


In [5]:
class_counts = df['class'].value_counts()

print(class_counts)

2    4163
0    1430
1    1200
Name: class, dtype: int64


## **Fitting and Training**

In [6]:
x = df['tweet']
y = df['class']

In [7]:
def process_text(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in word_tokenize(nopunc) if word and not re.search(pattern=r"\s+", string=word)]

def split_text(text):
    return text.split()

train_splits = []

for i in tqdm_notebook(range(df.shape[0])):
    train_splits.append(' '.join(process_text(df['tweet'][i])))

  0%|          | 0/6793 [00:00<?, ?it/s]

In [8]:
x = train_splits
y = df['class']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train_encoded, y_test_encoded = train_test_split(x, y_encoded, test_size=0.35, random_state=42)

In [9]:
# Define the text classification pipeline

Model = Pipeline([
    ('vect', CountVectorizer(tokenizer=split_text, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

# Fit the pipeline to your data

Model.fit(x_train, y_train_encoded)



In [10]:
# Output and Training everything

label_mapping = {0: 'hate', 1: 'offensive', 2: 'neither'}

pred_labels_encoded = Model.predict(x_test)

print(x_test)

print(pred_labels_encoded)

pred_labels = label_encoder.inverse_transform(pred_labels_encoded)

submit_df = pd.DataFrame({'text': x_test, 'predicted_label': pred_labels})

submit_df['class'] = submit_df['predicted_label'].map(label_mapping)

submit_df.to_csv('predictions.csv', index=False)

new_df = pd.read_csv('predictions.csv')
new_df.head(20)

[2 0 1 ... 2 2 2]


Unnamed: 0,text,predicted_label,class
0,fuzzy socks are the actual life,2,neither
1,People are fucking retarded lol,0,hate
2,MikeTattaglia happy birthday I hope the bitche...,1,offensive
3,RT ABCbirds1 Neonics are deadly to birds like ...,2,neither
4,MeowMariss i thought i was the ghetto word for...,2,neither
5,Po0pdogg back up faggot,0,hate
6,jaaaaaade You cant see it but Im flipping you ...,2,neither
7,RT ooxlexx I cant stand to be around ghetto gi...,2,neither
8,I thought she was albino tho 128563,2,neither
9,Fat ass inbred rednecks like Paula Deen are br...,0,hate


## **Classification Report**

In [11]:
y_hat = Model.predict(x_train)
report = classification_report(y_train_encoded, y_hat)

conf_matrix = confusion_matrix(y_train_encoded, y_hat)

# Compute accuracy
accuracy = accuracy_score(y_train_encoded, y_hat)

# Print the classification report and other metrics
print("Classification Report:")
print(report)

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nAccuracy: {:.4f}".format(accuracy))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       918
           1       1.00      1.00      1.00       765
           2       1.00      1.00      1.00      2732

    accuracy                           1.00      4415
   macro avg       1.00      1.00      1.00      4415
weighted avg       1.00      1.00      1.00      4415


Confusion Matrix:
[[ 918    0    0]
 [   0  765    0]
 [   0    0 2732]]

Accuracy: 1.0000


In [12]:
y_hat = Model.predict(x_test)
report = classification_report(y_test_encoded, y_hat)

conf_matrix = confusion_matrix(y_test_encoded, y_hat)

# Compute accuracy
accuracy = accuracy_score(y_test_encoded, y_hat)

# Print the classification report and other metrics
print("Classification Report:")
print(report)

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nAccuracy: {:.4f}".format(accuracy))

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.57      0.67       512
           1       0.74      0.74      0.74       435
           2       0.89      0.98      0.93      1431

    accuracy                           0.85      2378
   macro avg       0.81      0.77      0.78      2378
weighted avg       0.85      0.85      0.84      2378


Confusion Matrix:
[[ 292  102  118]
 [  53  324   58]
 [  14   12 1405]]

Accuracy: 0.8499
