**packages**

In [None]:
! pip install kaggle

In [None]:
# Copy the Kaggle API token from the current working directory into ~/.kaggle/.
# kaggle.json must already be present in the working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140

In [None]:
from zipfile import ZipFile

dataset = '/content/sentiment140.zip'

# The archive handle is deliberately NOT called `zip`: binding that name at
# notebook level would shadow the built-in zip() for every later cell.
with ZipFile(dataset, 'r') as archive:
    # No target path is given, so members are extracted into the current working directory.
    archive.extractall()
print('data extracted')


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

import re, string
import emoji
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

import tensorflow as tf
from tensorflow import keras


from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# Single seed for the notebook's stochastic steps.
seed = 42
# Apply the seed to the global NumPy and TensorFlow generators; defining the
# constant alone seeds nothing, so results would differ from run to run.
np.random.seed(seed)
tf.random.set_seed(seed)

import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling applied to every figure drawn later in the notebook.
sns.set_style("whitegrid")
# sns.despine() is intentionally not called here: no axes exist yet, so it would
# change nothing and only leave an empty figure in this cell's output.

plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)


In [None]:
def conf_matrix(y, y_pred, title):
    """Plot the confusion matrix of `y` against `y_pred` as an annotated heatmap.

    Args:
        y: Actual class labels (first argument to `confusion_matrix`).
        y_pred: Predicted class labels.
        title: Text used as the figure title.

    The tick labels are fixed to 'Negative' and 'Positive', so the function is
    written for a two-class problem. The figure is shown and nothing is returned.
    """
    _, axes = plt.subplots(figsize=(5, 5))
    class_names = ['Negative', 'Positive']
    matrix = confusion_matrix(y, y_pred)

    # Draw on the axes created above (the same axes the heatmap would pick up by default).
    axes = sns.heatmap(
        matrix,
        annot=True,
        cmap="Blues",
        fmt='g',
        cbar=False,
        annot_kws={"size": 25},
        ax=axes,
    )
    axes.set_title(title, fontsize=20)

    axes.xaxis.set_ticklabels(class_names, fontsize=17)
    axes.yaxis.set_ticklabels(class_names, fontsize=17)
    axes.set_xlabel('Predicted', fontsize=20)
    axes.set_ylabel('Actual', fontsize=20)
    plt.show()

**PRE-PROCESSING OF DATA**

In [None]:
# Column names are supplied explicitly via `names=`, so the first line of the CSV
# is read as data rather than as a header row.
col_name = ['target','id','date','flag','user','text']
# The file is decoded as ISO-8859-1 (Latin-1).
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names= col_name ,encoding = 'ISO-8859-1')

In [None]:
# Preview the first rows of the raw frame.
twitter_data.head()

In [None]:
# Column dtypes and non-null counts before any cleaning.
twitter_data.info()

In [None]:
# Parse the 'date' column with pandas' datetime parser; no explicit format is given,
# so pandas infers it from the strings.
twitter_data['date'] = pd.to_datetime(twitter_data['date'])

In [None]:
# Drop rows whose 'text' repeats an earlier row (the first occurrence is kept by default).
# This mutates twitter_data in place.
twitter_data.drop_duplicates(subset=['text'], inplace=True)

In [None]:
# Re-check row counts and dtypes after de-duplication and date parsing.
twitter_data.info()

In [None]:
# Count tweets per calendar day. The timestamps are normalised to midnight first;
# counting the raw 'date' values would give one bucket per exact timestamp
# instead of one per day.
tweets_perday = twitter_data['date'].dt.normalize().value_counts().sort_index().reset_index()
tweets_perday.columns = ['date', 'counts']


In [None]:
import re
import string
import emoji

def strip_emoji(text):
    """Return `text` with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

# Built once at definition time: strip_all_entities is called once per tweet, and
# neither the patterns nor the translation table depend on the input.
_MENTION_OR_LINK = re.compile(r"(?:\@|https?\://)\S+")
_NON_ASCII = re.compile(r'[^\x00-\x7f]')
_BANNED_CHARS_TABLE = str.maketrans(
    '', '', string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
)


def strip_all_entities(text):
    """Lower-case a tweet and strip mentions, links, non-ASCII and punctuation.

    Steps, in order:
      1. drop carriage returns, turn newlines into spaces, lower-case;
      2. remove every token starting with '@' or 'http://' / 'https://';
      3. remove all characters outside the ASCII range;
      4. remove every character in `string.punctuation` (plus a few
         mis-encoded Latin-1 characters kept in the banned list).

    Whitespace is not collapsed here, so removed tokens can leave runs of spaces.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned string.
    """
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = _MENTION_OR_LINK.sub("", text)
    text = _NON_ASCII.sub('', text)
    return text.translate(_BANNED_CHARS_TABLE)

def clean_hashtags(tweet):
    """Remove trailing hashtags and unwrap the remaining ones.

    A run of hashtags at the very end of the tweet is deleted entirely, except
    the literal tag `#hashtag`, which the pattern excludes. Any other '#' is then
    dropped (keeping its word) and '_' is treated as a word separator.

    Args:
        tweet: Tweet text.

    Returns:
        The tweet with trailing hashtags removed and '#' / '_' replaced by spaces.
    """
    # Raw string is required: in a normal string literal '\b' is a backspace
    # character, not a regex word boundary, which disabled the '#hashtag' exclusion.
    trailing_hashtags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    without_trailing = " ".join(
        part.strip() for part in re.split(trailing_hashtags, tweet)
    )
    return " ".join(part.strip() for part in re.split('#|_', without_trailing))

def filter_chars(a):
    """Blank out every space-separated word that contains '$' or '&'.

    The word is replaced by an empty string rather than removed, so the
    surrounding single spaces are kept and may form runs of spaces.
    """
    kept = [
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    ]
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Collapse each run of two or more whitespace characters into a single space."""
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", text)


In [None]:
# Clean every raw tweet. The helpers run innermost-first:
# strip_emoji -> strip_all_entities -> clean_hashtags -> filter_chars -> remove_mult_spaces.
# NOTE(review): strip_all_entities already deletes everything in string.punctuation,
# which includes '#', '_', '$' and '&', so the two later steps no longer see those
# characters — confirm this ordering is intended.
texts_new = [
    remove_mult_spaces(
        filter_chars(clean_hashtags(strip_all_entities(strip_emoji(raw_tweet))))
    )
    for raw_tweet in twitter_data.text
]

In [None]:
# Store the cleaned text next to the raw text; texts_new is in the same row order as twitter_data.
twitter_data['cleaned_data'] = texts_new

In [None]:
# Spot-check the first cleaned tweets.
twitter_data['cleaned_data'].head()

In [None]:
# Number of whitespace-separated words in each cleaned tweet.
text_len = [len(cleaned.split()) for cleaned in twitter_data.cleaned_data]

In [None]:
# Attach the per-tweet word counts as a column so rows can be filtered on length.
twitter_data['text_len'] = text_len

In [None]:
# Frame size before the length filter.
twitter_data.shape

In [None]:
# Keep only tweets with more than 3 words after cleaning.
# NOTE(review): the result is a boolean-filtered selection of the previous frame;
# later in-place edits on it may raise pandas' SettingWithCopyWarning.
twitter_data = twitter_data[twitter_data['text_len'] > 3]

In [None]:
# Frame size after dropping tweets of 3 words or fewer.
twitter_data.shape

**Training data deeper cleaning**

In [None]:
# Load the pretrained 'bert-base-uncased' fast tokenizer; used below to measure token lengths.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Token count of every cleaned tweet under the BERT tokenizer
# (encodings are truncated at 512 tokens, so no count exceeds that).
token_lens = [
    len(tokenizer.encode(cleaned, max_length=512, truncation=True))
    for cleaned in twitter_data['cleaned_data'].values
]

# Longest tokenized tweet in the dataset.
max_len = np.max(token_lens)

In [None]:
# Report the longest BERT-tokenized tweet length computed in the previous cell.
print(f"MAX TOKENIZED SENTENCE LENGTH: {max_len}")

In [None]:
# Class distribution with the original label values.
twitter_data['target'].value_counts()

In [None]:
# Relabel target value 4 as 1. The result is reassigned instead of using
# inplace=True: twitter_data is a boolean-filtered selection of an earlier frame,
# and mutating such a selection in place can trigger SettingWithCopyWarning.
# Reassignment also keeps the cell safe to re-run.
twitter_data = twitter_data.replace({'target': {4: 1}})

In [None]:
# Class distribution after relabelling 4 -> 1.
twitter_data['target'].value_counts()

**Class Balancing by RandomOverSampler**

In [None]:
import builtins
# Balance the classes by randomly duplicating minority-class rows.
# random_state is set so the same rows are duplicated on every run.
# NOTE(review): oversampling happens before the train/validation/test split below,
# so copies of one tweet can land in more than one split — confirm this is acceptable.
ros = RandomOverSampler(random_state=seed)
train_x, train_y = ros.fit_resample(
    np.array(twitter_data['cleaned_data']).reshape(-1, 1),
    np.array(twitter_data['target']).reshape(-1, 1)
)

train_y = train_y.ravel()

# train_x has a single column (the text); pair it with the resampled labels.
train_os = pd.DataFrame({'cleaned_data': train_x[:, 0], 'target': train_y})
print("\nAfter oversampling:")
print(train_os)

In [None]:
# Class distribution after oversampling.
train_os['target'].value_counts()

**SPLITTING THE DATA**

In [None]:
# Features (cleaned text) and integer labels as NumPy arrays.
X = train_os['cleaned_data'].values
y = train_os['target'].values

In [None]:
# 70% train / 30% held out, stratified on the label.
# Uses the notebook-wide `seed` constant instead of a second hard-coded 42.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=seed, stratify=y
)

In [None]:
# Split the held-out 30% in half: validation and test, stratified on the label.
# Uses the notebook-wide `seed` constant instead of a second hard-coded 42.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=seed, stratify=y_temp
)

In [None]:
# Sizes of the train / validation / test splits.
X_train.shape, X_valid.shape, X_test.shape

**One hot encoding**

In [None]:
# Keep copies of the integer labels; y_train / y_valid / y_test are overwritten
# with one-hot arrays in the next cell.
y_train_le = y_train.copy()
y_valid_le = y_valid.copy()
y_test_le = y_test.copy()

In [None]:
# Fit the encoder on the training labels only, then reuse that fitted mapping for
# validation and test. Re-fitting per split would let each split define its own
# column order/width.
# The integer copies (*_le) are the source, so re-running this cell does not try
# to re-encode arrays that are already one-hot.
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(y_train_le.reshape(-1, 1)).toarray()
y_valid = ohe.transform(y_valid_le.reshape(-1, 1)).toarray()
y_test = ohe.transform(y_test_le.reshape(-1, 1)).toarray()

**RoBERTa Sentiment Analysis**

In [None]:
# Load the pretrained 'roberta-base' fast tokenizer used to encode the model inputs.
tokenizer_roberta = RobertaTokenizerFast.from_pretrained("roberta-base")

In [None]:
# Token count of every training tweet under the RoBERTa tokenizer
# (encodings are truncated at 512 tokens).
token_lens = [
    len(tokenizer_roberta.encode(sample, max_length=512, truncation=True))
    for sample in X_train
]
# Longest tokenized training tweet; displayed as the cell output.
max_length = np.max(token_lens)
max_length

In [None]:
# Fixed sequence length used for padding the tokenized inputs and for the model's Input layers.
MAX_LEN=128

In [None]:
def tokenize_roberta(data,max_len=MAX_LEN) :
    """Encode texts with the RoBERTa tokenizer into fixed-length arrays.

    Every text is padded to `max_len` and truncated to `max_len`, so each row
    has exactly `max_len` entries and the results stack into rectangular arrays.

    Args:
        data: Iterable of strings to encode.
        max_len: Sequence length to pad/truncate to.

    Returns:
        Tuple `(input_ids, attention_masks)` of NumPy arrays, one row per text.
    """
    input_ids = []
    attention_masks = []
    for text in data:
        encoded = tokenizer_roberta.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            # Without truncation, a text longer than max_len yields a longer row
            # and the rows can no longer be stacked into a 2-D array.
            truncation=True,
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [None]:
# Encode each split into (input_ids, attention_mask) arrays padded to MAX_LEN.
train_input_ids, train_attention_masks = tokenize_roberta(X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize_roberta(X_valid, MAX_LEN)
test_input_ids, test_attention_masks = tokenize_roberta(X_test, MAX_LEN)

**Modeling**

In [None]:
def create_model(bert_model, max_len=MAX_LEN):
    """Build and compile a two-class classifier on top of a transformer encoder.

    Args:
        bert_model: Model called as `bert_model([input_ids, attention_masks])`.
            Element 1 of its output feeds the classification head
            (presumably the pooled output — confirm for the model passed in).
        max_len: Length of both integer input sequences.

    Returns:
        A compiled Keras model taking `[input_ids, attention_masks]` and producing
        a 2-way softmax, trained with categorical cross-entropy.
    """
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-7)
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    accuracy_metric = tf.keras.metrics.CategoricalAccuracy()

    ids_in = tf.keras.Input(shape=(max_len,), dtype='int32')
    mask_in = tf.keras.Input(shape=(max_len,), dtype='int32')

    encoder_outputs = bert_model([ids_in, mask_in])
    probabilities = tf.keras.layers.Dense(2, activation=tf.nn.softmax)(encoder_outputs[1])

    classifier = tf.keras.models.Model(inputs=[ids_in, mask_in], outputs=probabilities)
    classifier.compile(optimizer, loss=loss_fn, metrics=accuracy_metric)
    return classifier

In [None]:
# Load the pretrained 'roberta-base' TensorFlow encoder.
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

In [None]:
# Wrap the encoder in the compiled classifier and print its layer summary.
model = create_model(roberta_model, MAX_LEN)
model.summary()

In [None]:
import numpy as np

def subsample_data(input_ids, attention_masks, labels, n_samples, seed=42):
    """Randomly subsample aligned arrays to at most `n_samples` rows.

    The same row indices are applied to all three arrays, so rows stay aligned.
    Sampling is without replacement and uses a generator local to this call,
    so the global NumPy random state is left untouched.

    Args:
        input_ids: Array indexed along its first axis.
        attention_masks: Array aligned with `input_ids`.
        labels: Array aligned with `input_ids`.
        n_samples: Requested number of rows; capped at `len(input_ids)`.
        seed: Seed for the local generator (default 42, which draws the same
            indices as seeding the global generator with 42 would).

    Returns:
        Tuple `(input_ids, attention_masks, labels)` restricted to the sampled rows.
    """
    n_samples = min(n_samples, len(input_ids))
    # Local legacy generator: reproducible without reseeding np.random globally.
    rng = np.random.RandomState(seed)
    indices = rng.choice(len(input_ids), size=n_samples, replace=False)
    return input_ids[indices], attention_masks[indices], labels[indices]


In [None]:
# Maximum rows kept per split; subsample_data caps each at the split's actual size.
n_train = 100_000
n_val = 10_000
n_test = 10_000

# Training subset: token ids, attention masks and one-hot labels, row-aligned.
train_input_ids_small, train_attention_masks_small, y_train_small = subsample_data(
    train_input_ids, train_attention_masks, y_train, n_train
)

# Validation subset.
val_input_ids_small, val_attention_masks_small, y_valid_small = subsample_data(
    val_input_ids, val_attention_masks, y_valid, n_val
)

# Test subset.
test_input_ids_small, test_attention_masks_small, y_test_small = subsample_data(
    test_input_ids, test_attention_masks, y_test, n_test
)


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Periodically write the model to a single checkpoint file, overwriting it each time.
checkpoint = ModelCheckpoint('/content/checkpoint_model.h5',
                             save_best_only=False,  # save regardless of any monitored metric
                             save_freq=100)         # an integer save_freq is counted in batches

# Fine-tune on the subsampled training set, validating on the subsampled validation set.
history = model.fit(
    [train_input_ids_small, train_attention_masks_small],
    y_train_small,
    validation_data=([val_input_ids_small, val_attention_masks_small], y_valid_small),
    epochs=4, batch_size=32,
    callbacks=[checkpoint]
)

**RESULTS**

In [None]:
# Class probabilities for the full (non-subsampled) test arrays.
result_roberta = model.predict([test_input_ids,test_attention_masks])

In [None]:
# One-hot encode the arg-max class of each prediction row, keeping the shape and
# dtype of the raw model output.
y_pred_roberta = np.eye(result_roberta.shape[1], dtype=result_roberta.dtype)[
    result_roberta.argmax(1)
]

In [None]:
# Confusion matrix on the test set; both one-hot arrays are reduced to class indices first.
conf_matrix(y_test.argmax(1),y_pred_roberta.argmax(1),'RoBERTa Sentiment Analysis\nConfusion Matrix')

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Reduce one-hot arrays to class indices; arrays that are already 1-D pass through unchanged.
y_test_labels = np.argmax(y_test, axis=1) if y_test.ndim > 1 else y_test
y_pred_labels = np.argmax(y_pred_roberta, axis=1) if y_pred_roberta.ndim > 1 else y_pred_roberta

# Per-class precision / recall / F1 on the test set.
print('\tClassification Report for RoBERTa:\n\n',
      classification_report(y_test_labels, y_pred_labels, target_names=['Negative', 'Positive']))
