In [1]:
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer




In [2]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [3]:
st=stopwords.words('english')
stem=PorterStemmer()
def stemming(txt):
    txt=txt.split()
    txt=[stem.stem(x) for x in txt if x not in st]
    return ' '.join(txt)
def preprocess(txt):
    txt=re.sub('@[A-Za-z0-9]+',' ',txt)
    txt=re.sub('[^a-z\s]',' ',txt.lower())
    txt=re.sub('\s+',' ',txt)
    
    return txt


In [4]:
df=pd.read_csv('train_hate_speech.csv')


In [5]:
df=df[['tweet','class']]
df['class'].value_counts()
df['pre_process']=df['tweet'].apply(preprocess)
df['pre_process']=df['pre_process'].map(stemming)

In [6]:
tok=Tokenizer(oov_token='unknown')
tok.fit_on_texts(df['pre_process'])
train_x,test_x,train_y,test_y=train_test_split(df['pre_process'],df['class'],stratify=df['class'])


In [7]:
trainx=tok.texts_to_sequences(train_x)
testx=tok.texts_to_sequences(test_x)

train_pad=pad_sequences(trainx,maxlen=15)
test_pad=pad_sequences(testx,maxlen=15)


In [8]:
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(train_pad,train_y)


In [9]:
from sklearn.metrics import accuracy_score,confusion_matrix


In [10]:
ans=knn.predict(test_pad)


In [11]:
accuracy_score(test_y,ans)


0.7612976113621691

In [12]:
print(confusion_matrix(test_y,ans))


[[  12  330   15]
 [  56 4580  162]
 [  18  898  125]]


In [13]:
from tensorflow.keras.layers import Dense,Embedding,Flatten
from tensorflow.keras.models import Sequential


In [14]:
m=Sequential()
m.add(Embedding(input_dim=24784,input_length=15,output_dim=64))
m.add(Dense(32))
m.add(Flatten())
m.add(Dense(3,activation='softmax'))
m.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
m.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 64)            1586176   
                                                                 
 dense (Dense)               (None, 15, 32)            2080      
                                                                 
 flatten (Flatten)           (None, 480)               0         
                                                                 
 dense_1 (Dense)             (None, 3)                 1443      
                                                                 
Total params: 1589699 (6.06 MB)
Trainable params: 1589699 (6.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
hist=m.fit(train_pad,train_y,batch_size=64,epochs=5)


Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
ans1=m.predict(test_pad)
ans1=np.argmax(ans1,axis=1)



In [17]:
print('With ANN score {}'.format(accuracy_score(test_y,ans1)*100))
print('With KNN score {}'.format(accuracy_score(test_y,ans)*100))

With ANN score 87.50806972240154
With KNN score 76.12976113621691
