In [1]:
# pip install tensorflow
# pip install nltk

In [1]:
import tensorflow as tf
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.casual import TweetTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings

warnings.simplefilter("ignore")
warnings.simplefilter("ignore", category=FutureWarning)

2022-12-14 15:04:11.419311: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-14 15:04:11.541293: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-14 15:04:11.541317: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-14 15:04:12.186152: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [43]:
class DataPreprocessor:
    def __init__(self):
        self.stop_word = set(stopwords.words('english'))
        stop_word_symbol = {"…", "’", ":", '"', '-', '️', '&', '“', '(', '/', "'", ";", "+", "*", "~"}
        self.stop_word.update(stop_word_symbol)
        
        self.tokenizer = TweetTokenizer(reduce_len=True)
        self.lemmatizer = WordNetLemmatizer()
        
        self.total_cnt = 0
        self.max_len = 100
        
        self.regex = "RT (@[A-Za-z0-9_]+)|(@[A-Za-z0-9_]+)|https\S+|http\S+|(?<!\d)[.,;:!?](?!\d)"
        
    def preprocess(self, data_file, test_file):  
        data = pd.read_csv(data_file)
        test_data = pd.read_csv(test_file)
        data.drop_duplicates(subset=['tweet'], inplace=True)
        test_data.drop_duplicates(subset=['tweet'], inplace=True)
        
        data['tweet'] = data['tweet'].str.replace(self.regex, "")
        test_data['tweet'] = test_data['tweet'].str.replace(self.regex, "")
        
        data['tokenized'] = data['tweet'].apply(lambda x: [self.lemmatizer.lemmatize(word) for word in self.tokenizer.tokenize(x.lower()) if word not in self.stop_word])
        test_data['tokenized'] = test_data['tweet'].apply(lambda x: [self.lemmatizer.lemmatize(word) for word in self.tokenizer.tokenize(x.lower()) if word not in self.stop_word])
    
        X_data, Y_data = data['tokenized'].values, data['label'].values
        X_test, Y_test = test_data['tokenized'].values, test_data['label'].values
        
        tk = Tokenizer()
        tk.fit_on_texts(X_data)
        self.total_cnt = len(tk.word_index)
        
        self.tk_oov = Tokenizer(self.total_cnt, oov_token='OOV')
        self.tk_oov.fit_on_texts(X_data)
        X_data = self.tk_oov.texts_to_sequences(X_data)
        X_test = self.tk_oov.texts_to_sequences(X_test)
        
        X_data = pad_sequences(X_data, maxlen=self.max_len)
        X_test = pad_sequences(X_test, maxlen=self.max_len)

        return X_data, Y_data, X_test, Y_test, self.total_cnt
    
    def preprocess_sentence(self, sentence):
        col = ['tweet']
        X_df = pd.DataFrame([sentence], columns=col)

        X_df = X_df['tweet'].str.replace(self.regex, "")

        X_df['tokenized'] = X_df.apply(lambda x: [self.lemmatizer.lemmatize(word) for word in self.tokenizer.tokenize(x.lower()) if word not in self.stop_word])

        X = X_df['tokenized'].values
        
        X = self.tk_oov.texts_to_sequences(X)
        X = pad_sequences(X, maxlen=self.max_len)
        
        return X

In [44]:
dp = DataPreprocessor()
X_data, Y_data, X_test, Y_test, total_cnt = dp.preprocess("./TweetBLM.csv", "./crawl/tweets-new.csv")

In [45]:
from tensorflow.keras.layers import Embedding, Dense, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import regularizers

In [46]:
reg_coeff = 0.001 # options: 0.01, 0.1, 0, 1, 10

# create the model
model = Sequential()
model.add(Embedding(total_cnt, 100))
model.add(GRU(128))
model.add(Dense(1, activation='sigmoid',
                kernel_regularizer=regularizers.l1(reg_coeff), 
                bias_regularizer=regularizers.l2(reg_coeff)))

# callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=8)
mc = ModelCheckpoint('best_GRU.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [47]:
# train the model on the dataset
history = model.fit(X_data, Y_data, epochs=20, callbacks=[es, mc], batch_size=64, validation_split=0.2)

GRU_model = load_model('best_GRU.h5')

# evaluate the model on the test data
results = GRU_model.evaluate(X_test, Y_test)

Epoch 1/20
Epoch 1: val_acc improved from -inf to 0.07474, saving model to best_GRU.h5
Epoch 2/20
Epoch 2: val_acc improved from 0.07474 to 0.38789, saving model to best_GRU.h5
Epoch 3/20
Epoch 3: val_acc improved from 0.38789 to 0.40153, saving model to best_GRU.h5
Epoch 4/20
Epoch 4: val_acc improved from 0.40153 to 0.40316, saving model to best_GRU.h5
Epoch 5/20
Epoch 5: val_acc did not improve from 0.40316
Epoch 6/20
Epoch 6: val_acc did not improve from 0.40316
Epoch 7/20
Epoch 7: val_acc did not improve from 0.40316
Epoch 8/20
Epoch 8: val_acc improved from 0.40316 to 0.42390, saving model to best_GRU.h5
Epoch 9/20
Epoch 9: val_acc did not improve from 0.42390
Epoch 10/20
Epoch 10: val_acc did not improve from 0.42390
Epoch 10: early stopping


In [48]:
print('Test loss: %.4f' % results[0])
print('Test accuracy: %.2f' % (results[1]*100))

Test loss: 1.0806
Test accuracy: 73.82


In [90]:
bad = dp.preprocess_sentence("FUCK!!! It's terrible. Boring. Tired. Worst ever.")
good = dp.preprocess_sentence("LOVE. Like. happy. happiness. peaceful.")
neutral = dp.preprocess_sentence("bitch")

for b in GRU_model.predict(bad): print(b)
for g in GRU_model.predict(good): print(g)
for n in GRU_model.predict(neutral): print(n)

[0.9096677]
[0.00550769]
[0.42200926]


In [109]:
test_se = dp.preprocess_sentence("gross dislike hate you")
# test_se = dp.preprocess_sentence("")
GRU_model.predict(test_se)



array([[0.9158893]], dtype=float32)