<a href="https://colab.research.google.com/github/hate-speech-classification/implementation/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Init
"""
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
"""
Read data from csv (url from github)
"""
url_train_data = 'https://raw.githubusercontent.com/amandacurry/convabuse/main/2_splits/ConvAbuseEMNLPtrain.csv'
df = pd.read_csv(url_train_data)

"""
Create new column labels
"""
def categorise(row):
  if row['Annotator1_is_abuse.1'] == '1':
    return 'Not abusive'
  elif row['Annotator1_is_abuse.0'] == '1':
    return "Ambiguous"
  elif row['Annotator1_is_abuse.-1'] == '1':
    return "Mildly abusive"
  elif row['Annotator1_is_abuse.-2'] == '1':
    return "Strongly abusive"
  elif row['Annotator1_is_abuse.-3'] == '1':
    return "Very strongly abusive"

my_tags = ['Not abusive', 'Ambiguous', 'Midly abusive', 'Strongly abusive', 'Very strongly abusive']
df['abusive_level'] = df.apply(lambda row: categorise(row), axis=1) 
df = df[pd.notnull(df['abusive_level'])]
# df.tail(10)

In [None]:
"""
Clean text (user column)
"""
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
df['user'] = df['user'].apply(clean_text)
data = df[['user','abusive_level']]
X = data.user
Y = data.abusive_level
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)
# data.tail(10)

In [None]:
X_train.values[0]

'good'

In [None]:
"""
Naive Bayes Classifier 
Idea: text is categorized based on TF*IDF (term frequency * inverse data frequency)
"""

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

# %%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags,zero_division=1))

accuracy 0.786096256684492
                       precision    recall  f1-score   support

          Not abusive       1.00      0.00      0.00         6
            Ambiguous       1.00      0.00      0.00        16
        Midly abusive       0.79      1.00      0.88       144
     Strongly abusive       0.60      0.20      0.30        15
Very strongly abusive       1.00      0.00      0.00         6

             accuracy                           0.79       187
            macro avg       0.88      0.24      0.24       187
         weighted avg       0.81      0.79      0.70       187



In [None]:
my_text = clean_text("FUck it")
pr = nb.predict(my_text)

ValueError: ignored

In [None]:
import tensorflow as tf
X_t = tf.constant(X_test)
y_t = tf.constant(y_test)

X_t.shape, y_t.shape

(TensorShape([100, 2]), TensorShape([100]))

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.datasets import make_blobs
X_t.values, y_t.valuse = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu');

KeyError: ignored

In [None]:
"""
Linear Support Vector Machine
Idea: text is categorized based on TF*IDF (term frequency * inverse data frequency)
"""

from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

# %%time

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags,zero_division=1))

accuracy 0.8021390374331551
                       precision    recall  f1-score   support

          Not abusive       0.00      0.00      0.00         6
            Ambiguous       0.33      0.12      0.18        16
        Midly abusive       0.85      0.97      0.90       144
     Strongly abusive       0.57      0.53      0.55        15
Very strongly abusive       0.50      0.17      0.25         6

             accuracy                           0.80       187
            macro avg       0.45      0.36      0.38       187
         weighted avg       0.74      0.80      0.76       187



In [None]:
"""
Logistic Regression
Idea: text is categorized based on TF*IDF (term frequency * inverse data frequency)
"""


from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=100000)),
               ])
logreg.fit(X_train, y_train)

# %%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags,zero_division=1))

accuracy 0.7914438502673797
                       precision    recall  f1-score   support

          Not abusive       0.00      0.00      0.00         6
            Ambiguous       0.31      0.25      0.28        16
        Midly abusive       0.86      0.96      0.90       144
     Strongly abusive       0.62      0.33      0.43        15
Very strongly abusive       0.50      0.17      0.25         6

             accuracy                           0.79       187
            macro avg       0.46      0.34      0.37       187
         weighted avg       0.75      0.79      0.76       187



In [None]:
X_train

1767                      good
2497                      fuck
121                     london
2                          yea
2482                     speak
                 ...          
262                   okey got
405                      tired
1060                   colombo
1726                 nevermind
390     saint louis ok goodbye
Name: user, Length: 435, dtype: object

In [6]:
!pip install SpeechRecognition pydub

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 1.3 MB/s 
[?25hCollecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: SpeechRecognition, pydub
Successfully installed SpeechRecognition-3.8.1 pydub-0.25.1


In [7]:
import speech_recognition as sr

def speech_conferter(fil):
  r = sr.Recognizer()
  # open the file
  with sr.AudioFile(fil) as source:
    # listen for the data (load audio to memory)
      audio_data = r.record(source)
    # recognize (convert from speech to text)
      text = r.recognize_google(audio_data)
      return text

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
fil = "drive/My Drive/Colab Notebooks/a.wav"
text = speech_conferter(fil)
text

"I believe you're just talking nonsense"

In [1]:
!pip install scipy



In [3]:
!pip install sounddevice

Collecting sounddevice
  Downloading sounddevice-0.4.4-py3-none-any.whl (31 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.4.4


In [8]:
import sounddevice as sd
from scipy.io.wavfile import write

OSError: ignored

In [None]:
def aud_inp():
  fs = 44100  # Sample rate
  seconds = 3  # Duration of recording

  myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
  sd.wait()  # Wait until recording is finished
  write('output.wav', fs, myrecording)  # Save as WAV file 

  
