In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re
import pickle

# Importing and cleaning data
Here I deleted neutral posts and changed categury's values so that it would be easier for sigmoid activation function.

In [2]:
data = pd.read_csv('Reddit_data.csv')
data = data[data.category != 0]
data["clean_comment"] = data["clean_comment"].astype(str)
data.category = data.category.map({ 1 : 1, -1 : 0})
data.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,0
4,for your own benefit you may want read living ...,1
5,you should all sit down together and watch the...,0


In [3]:
X = data["clean_comment"]
y = data["category"]

# Preprocessing data
First we decapitalize all words and leave only proper words. Then we turn 5000 words into numeric values and pad texts so they are all the same length.

In [4]:

X = X.apply(lambda x: x.lower())
X = X.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [5]:

tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(X.values)
X = tokenizer.texts_to_sequences(X.values)
X = pad_sequences(X, maxlen = 256)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [None]:
with open("tokenizer.pickle", "wb") as tok:
    pickle.dump(tokenizer, tok)

In [6]:
model = keras.Sequential([
    keras.layers.Embedding(5000, 128, input_length = X.shape[1]),
    keras.layers.SpatialDropout1D(0.4),
    keras.layers.LSTM(128, dropout = 0.2, recurrent_dropout = 0.2),
    keras.layers.Dense(1, activation = "sigmoid")
])

In [7]:
model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=['accuracy'])
model.fit(X_train, y_train, epochs = 10, validation_split = 0.33, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2628499f198>

In [8]:
model.evaluate(X_test, y_test)



[0.5306383371353149, 0.8676897287368774]

In [19]:
post = ["I hate this nonsense"]
post = tokenizer.texts_to_sequences(post)
post = pad_sequences(post, maxlen=28, dtype='int32', value=0)
print(post)
prediction = model.predict(post)
round(float(prediction))


[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0  192    4 1603]]


0

In [23]:
post2 = ["This is super amazing"]
post2 = tokenizer.texts_to_sequences(post2)
post2 = pad_sequences(post2, maxlen=28, dtype='int32', value=0)
print(post2)
prediction2 = model.predict(post2)
round(float(prediction2))

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   4 890 662]]


1

In [36]:
model.save("Model.h5")