In [1]:
# !pip install tensorflow
# !pip install nltk
# !pip install scikit-learn

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.casual import TweetTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import seaborn as sns
from scipy.stats import pearsonr
import ast

2022-12-15 14:09:57.297646: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-15 14:09:57.422119: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-15 14:09:57.422142: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-15 14:09:58.139337: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [4]:
class DataPreprocessor:
    """Turn the 'tweet' and 'emojis' columns of a DataFrame into count matrices.

    Tweets are tokenized with NLTK's TweetTokenizer, lemmatized, lower-cased and
    counted by ``tweet_vec``; emojis are counted one emoji per feature by
    ``emoji_vec``.

    Attributes:
        tokenizer, regex, emoji_dict, stop_word: created here but not read by
            any method of this class.
        tweet_vec: CountVectorizer for tweet text.
        emoji_vec: CountVectorizer for emoji lists.
    """

    def __init__(self):
        self.tokenizer = TweetTokenizer(reduce_len=True)
        self.lemmatizer = WordNetLemmatizer()

        # Raw string so that \S and \d reach the regex engine without Python
        # reporting invalid escape sequences; the pattern itself is unchanged.
        self.regex = r"RT (@[A-Za-z0-9_]+)|(@[A-Za-z0-9_]+)|https\S+|http\S+|(?<!\d)[.,;:!?](?!\d)"
        self.emoji_dict = None
        self.stop_word = None

        self.tweet_vec = CountVectorizer(tokenizer=self.tokenize_lemmatize)
        # A callable analyzer is required here: the default token pattern only
        # keeps runs of two or more word characters, which discards every emoji
        # and leaves the 'EMPTY' sentinel as the only feature.
        self.emoji_vec = CountVectorizer(analyzer=self._emoji_tokens)

        self.make_stop_word()

    def make_stop_word(self):
        """Fill ``self.stop_word`` with NLTK's English stop words plus a few symbols."""
        self.stop_word = set(stopwords.words('english'))
        stop_word_symbol = {"…", "’", ":", '"', '-', '️', '&', '“', '(', '/', "'", ";", "+", "*", "~"}
        self.stop_word.update(stop_word_symbol)

    def tokenize(self, text): # tokenize the tweets
        """Return the tokens a default-configured TweetTokenizer finds in ``text``."""
        tknzr = TweetTokenizer()
        return tknzr.tokenize(text)

    def tokenize_lemmatize(self, text): # tokenize the tweets
        """Tokenize ``text``, lemmatize each token, and return the lower-cased lemmas."""
        tknzr = TweetTokenizer()
        tokens= tknzr.tokenize(text)

        lemmas = [self.lemmatizer.lemmatize(token) for token in tokens]

        return [lemma.lower() for lemma in lemmas]

    @staticmethod
    def _emoji_tokens(emojis):
        """Analyzer for ``emoji_vec``: one token per emoji, or ['EMPTY'] when there are none."""
        return list(emojis) if len(emojis) > 0 else ['EMPTY']

    def vectorize(self, data):
        """Fit ``tweet_vec`` on ``data['tweet']`` and return the matrix of token counts."""

        # Fit the vectorizer on the 'tweet' column
        self.tweet_vec.fit(data['tweet'])

        # Transform the 'tweet' column into a numerical representation
        tweet_vectors = self.tweet_vec.transform(data['tweet']) # matrix of token counts

        return tweet_vectors

    def vectorize_emojis(self, data):
        """Fit ``emoji_vec`` on ``data['emojis']`` and return the matrix of emoji counts.

        ``data['emojis']`` must hold Python list literals as strings (they are
        parsed with ``ast.literal_eval``). Side effect: the column is replaced
        in place by the joined emoji string, or 'EMPTY' for an empty list, so a
        second call on the same frame will fail to parse it.
        """
        emoji_lists = data['emojis'].apply(ast.literal_eval)

        # TODO(author note): the dataset should probably only contain tweets that have emojis.
        data['emojis'] = emoji_lists.apply(lambda x: (''.join(x) if len(x) > 0 else 'EMPTY'))

        # Vectorize the parsed lists rather than the joined strings so each
        # emoji stays a separate token.
        self.emoji_vec.fit(emoji_lists) # Fit the vectorizer to the emoji lists

        # Transform the emoji lists to a numerical representation
        emoji_vectors = self.emoji_vec.transform(emoji_lists)

        return emoji_vectors

    def preprocess(self, data):
        """Vectorize tweets and emojis of ``data`` and return both results together.

        NOTE(review): ``np.hstack`` does not appear to merge SciPy sparse
        matrices; the return value is expected to be a 1-D object array holding
        the two matrices. Callers in this notebook pass it to
        ``scipy.sparse.hstack`` to obtain the real feature matrix, so the
        return type is left as is.
        """
        tweet_vectors = self.vectorize(data)
        emoji_vectors = self.vectorize_emojis(data)

        # concatenate the tweet vectors and emoji sequences into a single feature matrix
        combined_vec = np.hstack((tweet_vectors, emoji_vectors))
        return combined_vec # preprocessed data
    
# Build the preprocessor and load the crawled tweets.
dp = DataPreprocessor()
data = pd.read_csv('crawl/data.csv')

# preprocess() also rewrites data['emojis'] in place (see vectorize_emojis),
# so `data` is no longer the raw CSV content after this line.
combined_vec = dp.preprocess(data)



In [6]:
# Peek at the first rows; 'emojis' shows the joined strings / 'EMPTY' written by dp.preprocess(data).
display(data.head())

Unnamed: 0,tweet,label,hate_words,emojis
0,🔪 Among Us x @USER 🔪 Benoit Blanc’s next great...,0,[],🔪🔪
1,I just listed a new collab on @USER created al...,0,[],EMPTY
2,mercury + venus are now in capricorn.this is n...,0,[],EMPTY
3,@USER: Wishing an 👏🤝 prosperous happy birthday...,0,[],👏🤝🔥
4,@USER: a little gift for reaching 50k thank u ...,0,[],🔪💗


In [9]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Dense, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import regularizers
from scipy.sparse import csr_matrix, hstack

## Base

In [None]:
target_data = data['label']

reg_coeff = 0.1 # 0.001, 0.01, 0.1, 1, 10

model = Sequential()

# Merge the tweet and emoji count matrices into one dense feature matrix
combined_vec_matrix = hstack(combined_vec)
input_data_array = combined_vec_matrix.toarray()
X_train, X_test, y_train, y_test = train_test_split(input_data_array, target_data, test_size=0.2, random_state=42)

# Further try
# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(input_data, target_data, test_size=0.2, random_state=42)

# # Split the train data into train and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


# The Embedding layer looks up every input value as a row index, so its table
# needs (largest input value + 1) rows. Counting distinct values is not enough:
# counts {0, 1, 5} give 3 distinct values but need 6 rows. The maximum is taken
# over the whole matrix so that values occurring only in X_test are covered too.
vocab_size = int(input_data_array.max()) + 1

# NOTE(review): the inputs are bag-of-words counts, so the GRU sees one
# "timestep" per vocabulary column and the embedded value is a count, not a
# word id. Confirm this is the intended model input.
model.add(Embedding(vocab_size, output_dim=100))
model.add(GRU(128))
model.add(Dense(1, activation='sigmoid',
                kernel_regularizer=regularizers.l1(reg_coeff), 
                bias_regularizer=regularizers.l2(reg_coeff)))

# 'val_acc' matches the metric name 'acc' passed to compile() below
es_l = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
es_a = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=4)
mc = ModelCheckpoint('best_GRU.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])


history = model.fit(X_train, y_train, 
          batch_size=32,
          epochs=20, 
          verbose=1,
          validation_split=0.2, 
          callbacks=[es_l, es_a, mc])

Epoch 1/20

In [None]:
# Score the trained model on the held-out split; perf is [loss, accuracy].
perf = model.evaluate(X_test, y_test)

print(f'Test loss: {perf[0]:.4f}')
print(f'Test accuracy: {perf[1] * 100:.2f}')

## KFold

In [None]:
reg_coeff = 0.01

model = Sequential()

# Merge the tweet and emoji count matrices into one sparse feature matrix
combined_vec_matrix = hstack(combined_vec)

# The Embedding table needs (largest input value + 1) rows. The number of
# non-zero cells says nothing about the largest value and only inflates the
# table to one row per non-zero entry of the matrix.
vocab_size = int(combined_vec_matrix.max()) + 1

model.add(Embedding(vocab_size, output_dim=100))
model.add(GRU(128))
model.add(Dense(1, activation='sigmoid',
                kernel_regularizer=regularizers.l1(reg_coeff), 
                bias_regularizer=regularizers.l2(reg_coeff)))

# These callbacks monitor 'val_accuracy', so the model must be compiled with
# metrics=['accuracy'] (done in the cross-validation cell).
es_l = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
es_a = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=4)
mc = ModelCheckpoint('best_GRU.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [None]:
# Import the KFold class
from sklearn.model_selection import KFold

# Create a KFold object with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

X = combined_vec_matrix.toarray()
# Use a plain NumPy array for the labels: kf.split() yields positions, and
# indexing a pandas Series with an integer array looks them up as index labels,
# which only coincides with positions for a default RangeIndex.
y = target_data.to_numpy()

# Create the model's variables once so their starting values can be restored
# before every fold. Without the reset each fold keeps training the weights of
# the previous one and is then validated on rows it has already been fitted on.
# Re-run the model-definition cell before this one to start from fresh weights.
if not model.built:
    model.build(input_shape=(None, X.shape[1]))
initial_weights = model.get_weights()

fold_scores = []

# Loop through the folds
for train_index, val_index in kf.split(X):
    # Get the training and validation data for this fold
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Restore the starting weights; compile() below creates a fresh optimizer
    model.set_weights(initial_weights)

    # Compile and train the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_fold, y_train_fold, 
              batch_size=32, 
              epochs=5, 
              verbose=1,
              validation_split=0.2,
              callbacks=[es_l, es_a, mc])

    # Evaluate the model on the validation data for this fold
    val_loss, val_acc = model.evaluate(X_val_fold, y_val_fold, verbose=0)
    fold_scores.append((val_loss, val_acc))
    print(f'Fold val_loss: {val_loss:.3f}, val_acc: {val_acc:.3f}')

# Cross-validation summary over all folds
mean_val_loss, mean_val_acc = np.mean(fold_scores, axis=0)
print(f'Mean val_loss: {mean_val_loss:.3f}, mean val_acc: {mean_val_acc:.3f}')


In [None]:
# Score the cross-validated model on X_test / y_test; perf is [loss, accuracy].
# NOTE(review): X_test comes from the train_test_split of the Base section,
# while the folds above were drawn from the full matrix, so these rows may
# already have been used for training - confirm before reporting this number.
perf = model.evaluate(X_test, y_test)

print(f'Test loss: {perf[0]:.4f}')
print(f'Test accuracy: {perf[1] * 100:.2f}')

## Use the pre-trained word2vec model

- Hmm..
- KFold + pre-trained

In [None]:
from gensim.models import KeyedVectors

# Load the pre-trained word2vec model
word2vec_model = KeyedVectors.load_word2vec_format('word2vec.txt', binary=False)

# The Embedding layer needs the raw weight matrix, not the KeyedVectors object,
# and its input_dim / output_dim must equal the matrix shape.
embedding_matrix = word2vec_model.vectors
pretrained_rows, embedding_dim = embedding_matrix.shape

reg_coeff = 0.01

model = Sequential()

# Merge the tweet and emoji count matrices into one sparse feature matrix
combined_vec_matrix = hstack(combined_vec)

# Every input value is used as a row index into the embedding table
max_input_value = int(combined_vec_matrix.max())
if max_input_value >= pretrained_rows:
    raise ValueError(
        f'input value {max_input_value} has no row in the pre-trained matrix '
        f'({pretrained_rows} rows)'
    )
vocab_size = pretrained_rows

# NOTE(review): the inputs are CountVectorizer counts, not gensim word indices,
# so row k of the pre-trained matrix is looked up for "count k" rather than for
# a word. To benefit from word2vec the tweets have to be encoded as sequences
# of word indices that follow word2vec_model's vocabulary order.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix]))
model.add(GRU(128))
model.add(Dense(1, activation='sigmoid',
                kernel_regularizer=regularizers.l1(reg_coeff), 
                bias_regularizer=regularizers.l2(reg_coeff)))

es_l = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
es_a = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=4)
mc = ModelCheckpoint('best_GRU.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [None]:
# Import the KFold class
from sklearn.model_selection import KFold

# Create a KFold object with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

X = combined_vec_matrix.toarray()
# Use a plain NumPy array for the labels: kf.split() yields positions, and
# indexing a pandas Series with an integer array looks them up as index labels,
# which only coincides with positions for a default RangeIndex.
y = target_data.to_numpy()

# Create the model's variables once so their starting values (including the
# pre-trained embedding matrix) can be restored before every fold. Without the
# reset each fold keeps training the weights of the previous one and is then
# validated on rows it has already been fitted on.
# Re-run the model-definition cell before this one to start from fresh weights.
if not model.built:
    model.build(input_shape=(None, X.shape[1]))
initial_weights = model.get_weights()

fold_scores = []

# Loop through the folds
for train_index, val_index in kf.split(X):
    # Get the training and validation data for this fold
    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Restore the starting weights; compile() below creates a fresh optimizer
    model.set_weights(initial_weights)

    # Compile and train the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_fold, y_train_fold, 
              batch_size=32, 
              epochs=5, 
              verbose=1,
              validation_split=0.2,
              callbacks=[es_l, es_a, mc])

    # Evaluate the model on the validation data for this fold
    val_loss, val_acc = model.evaluate(X_val_fold, y_val_fold, verbose=0)
    fold_scores.append((val_loss, val_acc))
    print(f'Fold val_loss: {val_loss:.3f}, val_acc: {val_acc:.3f}')

# Cross-validation summary over all folds
mean_val_loss, mean_val_acc = np.mean(fold_scores, axis=0)
print(f'Mean val_loss: {mean_val_loss:.3f}, mean val_acc: {mean_val_acc:.3f}')

# Old code

In [None]:
class DataPreprocessor:
    """Clean, tokenize and integer-encode tweets for an Embedding + GRU model.

    ``preprocess`` must run before ``preprocess_sentence``: it fits the
    vocabulary (``self.tk_oov``) that single sentences are encoded with.

    Attributes:
        stop_word: NLTK English stop words plus a few symbols; tokens in this
            set are dropped.
        max_len: length every encoded sequence is padded/truncated to.
        total_cnt / vocab_size: vocabulary size determined by ``preprocess``
            (0 until then).
    """

    def __init__(self):
        self.stop_word = set(stopwords.words('english'))
        stop_word_symbol = {"…", "’", ":", '"', '-', '️', '&', '“', '(', '/', "'", ";", "+", "*", "~"}
        self.stop_word.update(stop_word_symbol)

        self.tokenizer = TweetTokenizer(reduce_len=True)
        self.lemmatizer = WordNetLemmatizer()

        self.tk = Tokenizer()
        self.total_cnt = 0
        self.vocab_size = 0
        self.max_len = 100

        # Raw string so that \S and \d reach the regex engine without Python
        # reporting invalid escape sequences; the pattern itself is unchanged.
        self.regex = r"RT (@[A-Za-z0-9_]+)|(@[A-Za-z0-9_]+)|https\S+|http\S+|(?<!\d)[.,;:!?](?!\d)"

    def _tokenize(self, text):
        """Lower-case and tokenize ``text``, drop stop words, lemmatize the rest."""
        return [self.lemmatizer.lemmatize(word)
                for word in self.tokenizer.tokenize(text.lower())
                if word not in self.stop_word]

    def _clean_and_tokenize(self, tweets):
        """Strip everything matching ``self.regex`` from a Series of tweets and tokenize each."""
        # regex=True is explicit because the pandas default for str.replace is
        # a literal match in recent versions, which would leave the text as is.
        cleaned = tweets.str.replace(self.regex, "", regex=True)
        return cleaned.apply(self._tokenize)

    def preprocess(self, data_file, test_file):  
        """Load two CSV files and return padded integer sequences plus labels.

        Both files need a 'tweet' and a 'label' column. Duplicate tweets are
        dropped, the vocabulary is fitted on ``data_file`` only, and both sets
        are encoded with it.

        Returns:
            (X_data, Y_data, X_test, Y_test, total_cnt), where ``total_cnt`` is
            the vocabulary size to pass to the Embedding layer.
        """
        data = pd.read_csv(data_file)
        test_data = pd.read_csv(test_file)
        data.drop_duplicates(subset=['tweet'], inplace=True)
        test_data.drop_duplicates(subset=['tweet'], inplace=True)

        data['tokenized'] = self._clean_and_tokenize(data['tweet'])
        test_data['tokenized'] = self._clean_and_tokenize(test_data['tweet'])

        # Only the token lists are encoded. Feeding rows of a 2-D
        # ['tokenized', 'emojis'] array to the Keras Tokenizer fails because it
        # expects each text to be a string or a list of tokens.
        # NOTE(review): the 'emojis' column is therefore not used here, which
        # matches preprocess_sentence (tweet text only) - confirm that is intended.
        X_data, Y_data = data['tokenized'].tolist(), data['label'].values
        X_test, Y_test = test_data['tokenized'].tolist(), test_data['label'].values

        # First pass: count the distinct tokens of the training set
        self.tk = Tokenizer()
        self.tk.fit_on_texts(X_data)
        # +1 for the padding index 0 and +1 for the 'OOV' token
        self.total_cnt = len(self.tk.word_index) + 2
        self.vocab_size = self.total_cnt

        self.tk_oov = Tokenizer(self.vocab_size, oov_token='OOV')
        self.tk_oov.fit_on_texts(X_data)

        X_data = self.tk_oov.texts_to_sequences(X_data)
        X_test = self.tk_oov.texts_to_sequences(X_test)

        X_data = pad_sequences(X_data, maxlen=self.max_len)
        X_test = pad_sequences(X_test, maxlen=self.max_len)

        return X_data, Y_data, X_test, Y_test, self.total_cnt

    def preprocess_sentence(self, sentence):
        """Encode one sentence the same way ``preprocess`` encodes tweets.

        Raises AttributeError if ``preprocess`` has not been called yet, because
        ``self.tk_oov`` only exists afterwards.
        """
        col = ['tweet'] # ['tweet', 'emojis']
        X_df = pd.DataFrame([sentence], columns=col)

        # Keep the frame intact and work on the 'tweet' column only
        tokenized = self._clean_and_tokenize(X_df['tweet'])

        X = tokenized.tolist()

        X = self.tk_oov.texts_to_sequences(X)
        X = pad_sequences(X, maxlen=self.max_len)

        return X

In [None]:
# Uses the DataPreprocessor defined in the cell directly above, which replaces
# the class of the same name from the top of the notebook.
dp = DataPreprocessor()
# X_data, Y_data, X_test, Y_test, total_cnt = dp.preprocess("./crawl/tweets-new.csv", "./crawl/tweets-new.csv")
# Fit on TweetBLM.csv, hold out the crawled tweets as the test set.
X_data, Y_data, X_test, Y_test, total_cnt = dp.preprocess("./TweetBLM.csv", "./crawl/tweets-new.csv")

In [None]:
reg_coeff = 0.001

# Embedding -> GRU -> single sigmoid unit; the output layer carries an L1
# penalty on its kernel and an L2 penalty on its bias.
output_layer = Dense(
    1,
    activation='sigmoid',
    kernel_regularizer=regularizers.l1(reg_coeff),
    bias_regularizer=regularizers.l2(reg_coeff),
)
model = Sequential([
    Embedding(total_cnt, 100),
    GRU(128),
    output_layer,
])

# callbacks: stop after 8 epochs without val_loss improvement and keep the
# checkpoint with the best val_acc
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=8)
mc = ModelCheckpoint('best_GRU.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# train the model on the dataset; 20% of it is held back for validation and
# the `mc` callback writes the best checkpoint to best_GRU.h5
history = model.fit(X_data, Y_data, epochs=20, callbacks=[es, mc], batch_size=64, validation_split=0.2)

# reload the checkpoint saved by `mc` rather than using the last-epoch weights
GRU_model = load_model('best_GRU.h5')

# evaluate the model on the test data
results = GRU_model.evaluate(X_test, Y_test)

In [None]:
# results is [loss, accuracy] as returned by GRU_model.evaluate
print(f'Test loss: {results[0]:.4f}')
print(f'Test accuracy: {results[1] * 100:.2f}')

In [None]:
# Encode three hand-written probes with the fitted preprocessor.
bad = dp.preprocess_sentence("FUCK!!! It's terrible. Boring. Tired. Worst ever.")
good = dp.preprocess_sentence("LOVE. Like. happy. happiness. peaceful.")
neutral = dp.preprocess_sentence("bitch")

# Print the sigmoid output of the reloaded model for each probe.
for b in GRU_model.predict(bad): print(b)
for g in GRU_model.predict(good): print(g)
for n in GRU_model.predict(neutral): print(n)

In [None]:
# Ad-hoc check of a single sentence; the last expression is shown as the cell output.
test_se = dp.preprocess_sentence("gross dislike hate you")
# test_se = dp.preprocess_sentence("")
GRU_model.predict(test_se)