# Build dataset

In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split

In [2]:
dataset_path = "/content/drive/My Drive/coding/data/unique_tweets_7k.csv"

In [3]:
def load_dataset(path):
  """Read the CSV at ``path`` and return its usable (text, sentiment) rows.

  Rows whose ``text`` repeats an earlier row are discarded first; the frame is
  then narrowed to the ``text`` and ``sentiment`` columns and any row with a
  missing value in either column is dropped. The original row index is kept.
  """
  return (
      pd.read_csv(path)
      .drop_duplicates(subset=["text"])
      .loc[:, ["text", "sentiment"]]
      .dropna()
  )

In [4]:
def data_cleaning(df):
  """Return a copy of ``df`` whose ``text`` column has been normalised.

  Each text goes through these steps, in order:
    1. URLs (``http(s)://...`` or ``www....``) are removed.
    2. HTML-like tags (``<...>``) are removed.
    3. Characters matched by the emoji pattern are removed.
    4. Hashtags are replaced by the word ``hashtag`` and @-mentions by the
       word ``entity``.
    5. ASCII punctuation (``string.punctuation``) is removed.
    6. Newlines become spaces and runs of spaces collapse to one.
    7. The result is stripped and lower-cased.

  Step 4 has to run before step 5: ``#`` and ``@`` are part of
  ``string.punctuation``, so stripping punctuation first would make the
  hashtag/mention patterns unmatchable.

  Args:
    df: DataFrame with a ``text`` column of strings.

  Returns:
    A new DataFrame; the input frame is left unmodified.
  """
  # Compile everything once instead of once per row.
  url_re = re.compile(r'https?://\S+|www\.\S+')
  html_re = re.compile(r'<.*?>')
  # NOTE(review): the last range spans U+24C2..U+1F251, which is far wider than
  # emoji alone, while code points above U+1F6FF are not covered. Kept as-is.
  emoji_re = re.compile("["
                        u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                        "]+", flags=re.UNICODE)
  hashtag_re = re.compile(r"^#\S+|\s#\S+")
  mention_re = re.compile(r"^@\S+|\s@\S+")
  newline_re = re.compile('\n')
  spaces_re = re.compile(' +')
  punct_table = str.maketrans('', '', string.punctuation)

  def clean_text(text):
    text = url_re.sub('', text)
    text = html_re.sub('', text)
    text = emoji_re.sub('', text)
    # Must precede punctuation stripping, which would delete '#' and '@'.
    text = hashtag_re.sub(' hashtag', text)
    text = mention_re.sub(' entity', text)
    text = text.translate(punct_table)
    text = spaces_re.sub(' ', newline_re.sub(' ', text))
    return text.strip().lower()

  cleaned = df.copy()
  cleaned["text"] = cleaned["text"].apply(clean_text)
  return cleaned

In [5]:
def balance_data(df, random_state=42, drop_fractions=None):
  """Down-sample over-represented sentiment classes.

  For every ``label -> fraction`` pair, a random ``fraction`` of the rows whose
  ``sentiment`` equals ``label`` is dropped from the frame.

  Args:
    df: DataFrame with a ``sentiment`` column. Rows are dropped by index
      label, so the index is expected to be unique.
    random_state: Seed passed to ``DataFrame.sample`` so repeated runs drop the
      same rows. Pass ``None`` for a different random draw on every call.
    drop_fractions: Mapping of sentiment label to the fraction of that class
      to drop. Defaults to ``{0: 0.7, 4: 0.6}``.

  Returns:
    A new DataFrame without the dropped rows; ``df`` itself is not modified.
  """
  if drop_fractions is None:
    drop_fractions = {0: 0.7, 4: 0.6}

  balanced = df
  for label, fraction in drop_fractions.items():
    to_drop = balanced[balanced["sentiment"] == label].sample(
        frac=fraction, random_state=random_state)
    balanced = balanced.drop(to_drop.index)
  return balanced

In [6]:
def set_split(df, test_size = 0.2, random_state = 42):
    """Split ``df`` into a train part and a test part.

    Args:
        df: The frame to split.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``; the default of 42
            keeps the split identical between runs.

    Returns:
        A ``(train, test)`` tuple.
    """
    train, test = train_test_split(df, test_size = test_size, random_state = random_state)
    return train, test

In [7]:
def prepare_train_test_from_file(path):
    """Run the whole data pipeline on the CSV at ``path``.

    The file is loaded with ``load_dataset``, cleaned with ``data_cleaning``,
    balanced with ``balance_data`` and finally handed to ``set_split`` with its
    default arguments. Returns whatever ``set_split`` returns (train, test).
    """
    balanced = balance_data(data_cleaning(load_dataset(path)))
    return set_split(balanced)

In [8]:
# Load, clean and balance the tweets, then split them. The balanced frame is
# kept around as `tweets` so it can be inspected separately from the split.
tweets = load_dataset(dataset_path).pipe(data_cleaning).pipe(balance_data)
train, test = set_split(tweets)

In [9]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,text,sentiment
3436,what the hell is going on with mastermind ridi...,2.0
1482,hi i’m done with everyone’s shit today so if a...,2.0
2138,life long fear of havin a shit and a spider cr...,3.0
1223,fuck this guy get him the fuck outta here i do...,2.0
625,ariana grandes new song is absolute shit lmao🤣...,0.0


# Preprocess data as input to BERT

In [10]:
# Install bert-for-tf2 (unpinned); presumably the source of the `bert` package imported below — TODO confirm.
!pip install bert-for-tf2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
# Pin NumPy to 1.19.5 (presumably for compatibility with the pinned TensorFlow — TODO confirm).
!pip install numpy==1.19.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
# Pin TensorFlow to 2.2.0.
!pip install tensorflow==2.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
import tensorflow as tf
import numpy as np

In [14]:
# Show the NumPy version that is actually loaded in the kernel.
print(np.__version__)

1.19.5


In [15]:
# Show the TensorFlow version that is actually loaded in the kernel.
print(tf.__version__)

2.2.0


In [16]:

from tqdm import tqdm
from bert.tokenization.bert_tokenization import FullTokenizer


In [17]:
class SentimentAnalysisData:
    """Turns train/test frames of tweets into fixed-length token-id arrays.

    After construction the instance exposes:
      * ``train_x`` / ``test_x``: 2-D arrays of token ids, one row per tweet,
        every row exactly ``max_seq_len`` long (right-padded with 0).
      * ``train_y`` / ``test_y``: arrays holding the label column values.
      * ``max_seq_len``: the sequence length actually used, i.e. the smaller of
        the longest tokenized tweet and the ``max_seq_len`` argument.
    """
    DATA_COLUMN = "text"
    LABEL_COLUMN = "sentiment"

    def __init__(self, train, test, tokenizer: "FullTokenizer", max_seq_len=192):
        """Tokenize and pad both frames.

        Args:
            train: DataFrame with ``text`` and ``sentiment`` columns.
            test: DataFrame with the same columns.
            tokenizer: Object providing ``tokenize(text)`` and
                ``convert_tokens_to_ids(tokens)``.
            max_seq_len: Upper bound for the padded sequence length.
        """
        self.tokenizer = tokenizer
        # Updated by _prepare to the longest tokenized sequence seen so far.
        self.max_seq_len = 0

        ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, df):
        """Tokenize every row of ``df``.

        Returns:
            ``(x, y)`` where ``x`` is a list of token-id lists, each wrapped in
            ``[CLS]`` ... ``[SEP]``, and ``y`` is an array of the labels.
        """
        texts = df[SentimentAnalysisData.DATA_COLUMN]
        labels = df[SentimentAnalysisData.LABEL_COLUMN]
        x, y = [], []

        for text, label in tqdm(zip(texts, labels), total=len(df)):
            tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            self.max_seq_len = max(self.max_seq_len, len(token_ids))
            x.append(token_ids)
            y.append(label)

        # x stays a plain list: its rows have different lengths until _pad runs,
        # and building a NumPy array from ragged rows is deprecated/unsupported.
        return x, np.array(y)

    def _pad(self, ids):
        """Bring every id sequence to exactly ``self.max_seq_len`` entries.

        Shorter sequences are right-padded with 0. Longer ones are cut to
        ``max_seq_len`` while keeping their final id, so a sequence that ended
        with ``[SEP]`` still does after truncation. Sequences that already fit
        are left intact.
        """
        x = []
        for input_ids in ids:
            input_ids = list(input_ids)
            if len(input_ids) > self.max_seq_len:
                input_ids = input_ids[:self.max_seq_len - 1] + input_ids[-1:]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(input_ids)
        return np.array(x)

# Build model

In [18]:
# import tensorflow as tf
from tensorflow import keras 
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
import os

In [19]:
BERT_MODEL_NAME = "uncased_L-12_H-768_A-12"
bert_ckpt_dir = "/content/drive/My Drive/coding/models/uncased_L-12_H-768_A-12"
BERT_CKPT_FILE = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

In [20]:
def create_model(max_seq_len, num_classes, bert_ckpt_file = BERT_CKPT_FILE):
    """Build a Keras classifier on top of a BERT layer and load its weights.

    The BERT layer is configured from the module-level ``bert_config_file``.
    The output at sequence position 0 is passed through dropout, a 1024-unit
    tanh layer, another dropout and a final softmax layer, so the model
    outputs class probabilities of shape ``(batch, num_classes)``.

    Args:
        max_seq_len: Length of the ``input_ids`` sequences fed to the model.
        num_classes: Number of units in the final softmax layer.
        bert_ckpt_file: Checkpoint from which the BERT layer weights are loaded.

    Returns:
        The built ``keras.Model`` with the stock BERT weights loaded.
    """
    # Read the stock config, then derive the layer parameters from it.
    with tf.io.gfile.GFile(bert_config_file, "r") as config_reader:
        stock_config = StockBertConfig.from_json_string(config_reader.read())
    layer_params = map_stock_config_to_params(stock_config)
    layer_params.adapter_size = None
    bert_layer = BertModelLayer.from_params(layer_params, name="bert")

    token_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    sequence_output = bert_layer(token_ids)
    print("bert shape", sequence_output.shape)

    # Classification head, fed by the first sequence position only.
    head = keras.layers.Lambda(lambda seq: seq[:, 0, :])(sequence_output)
    head = keras.layers.Dropout(0.5)(head)
    head = keras.layers.Dense(units=1024, activation="tanh")(head)
    head = keras.layers.Dropout(0.2)(head)
    probabilities = keras.layers.Dense(units=num_classes, activation="softmax")(head)

    classifier = keras.Model(inputs=token_ids, outputs=probabilities)
    classifier.build(input_shape=(None, max_seq_len))

    # Weights are loaded only after the model (and thus the BERT layer) is built.
    load_stock_weights(bert_layer, bert_ckpt_file)

    return classifier

# Train model

In [21]:
#import keras
# import os
from bert.tokenization.bert_tokenization import FullTokenizer

In [None]:
# Paths and hyper-parameters for this training run.
dirname = "/content/drive/My Drive/coding/"
DATASET_PATH = os.path.join(dirname, "data/unique_tweets_7k.csv")
VOCAB_PATH = os.path.join(dirname, "models/uncased_L-12_H-768_A-12/vocab.txt")
MAX_SEQ_LEN = 40
NUM_CLASSES = 5
tokenizer = FullTokenizer(vocab_file=VOCAB_PATH)

# Rebuild the train/test split from the file and tokenize it for BERT.
train, test = prepare_train_test_from_file(DATASET_PATH)
data = SentimentAnalysisData(train, test, tokenizer, max_seq_len=MAX_SEQ_LEN)
model = create_model(data.max_seq_len, NUM_CLASSES)
model.summary()
model.compile(
    optimizer=keras.optimizers.Adam(1e-5),
    # create_model ends in a softmax layer, so the loss receives probabilities,
    # not logits; from_logits=True would apply softmax a second time.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

history = model.fit(
    x=data.train_x,
    y=data.train_y,
    validation_split=0.2,
    batch_size=32,
    shuffle=True,
    epochs=12,
    verbose=1
)

# Report accuracy on both splits so over-fitting is visible at a glance.
_, test_acc = model.evaluate(data.test_x, data.test_y)
_, train_acc = model.evaluate(data.train_x, data.train_y)
print("Test Accuracy:" + str(test_acc))
print("Train Accuracy:" + str(train_acc))