In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install contractions
!pip install pyspellchecker
!pip install -U textblob

In [None]:
!python -m textblob.download_corpora

In [None]:
!pip install wordcloud

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
!pip install seaborn
import numpy as np
import pandas as pd
import re
import nltk 
import string
import contractions
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# Roberta Base

In [None]:
# load the data to a pandas dataframe
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
# Fixed seeds keep the train / validation / test partitions identical across
# kernel restarts, so the reported Jaccard scores are reproducible.
train_data, test_data = train_test_split(train, test_size = 0.2, random_state=42)
train_data, valid_data = train_test_split(train_data, test_size=0.05, random_state=42)
# Slang lookup table: acronym -> expansion.
abbreviations = pd.read_csv("/kaggle/input/chat-slang-abbreviations-acronyms/slang/slang.csv")
abrevtn_dict   = dict(zip(abbreviations.acronym, abbreviations.expansion))

In [None]:
train_data

In [None]:
test_data

In [None]:
valid_data

In [None]:
!pip install -q -U watermark     
!pip install seaborn
!pip install -qq transformers

%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def clear_text(text):
    """Normalise a raw tweet string.

    Steps, applied in order:
      1. collapse any word character repeated 3+ times down to two ("cooool" -> "cool");
      2. replace backticks with apostrophes;
      3. replace every whitespace-delimited token containing a digit with a space;
      4. replace ``<...>`` spans with a space;
      5. replace ``[...]`` spans with a space;
      6. replace newlines with a space;
      7. replace runs of ``*`` with the literal word ``INSULT``.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans
        leave extra spaces behind.
    """
    # Raw strings are used for every pattern containing a backslash so that
    # Python does not treat `\S`, `\d`, `\[` or `\*` as (invalid) string escapes.
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    text = re.sub('`', "'", text)           # Replacing apostrophe
    text = re.sub(r'\S*\d\S*', ' ', text)   # Removing Numbers
    text = re.sub(r'<.*?>+', ' ', text)     # Removing Angular Brackets
    text = re.sub(r'\[.*?\]', ' ', text)    # Removing Square Brackets
    text = re.sub('\n', ' ', text)          # Removing '\n' character
    text = re.sub(r'\*+', 'INSULT', text)   # Replacing **** by INSULT
    return text

In [None]:
def remove_hyperlinks(text):
    """Delete URLs from ``text``.

    Anything starting with ``http://``, ``https://`` or ``www.`` is removed up
    to the next whitespace character. The surrounding whitespace is kept.

    Args:
        text: the string to process.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
    """
    # Raw string: `\S` and `\.` are regex escapes, not Python string escapes.
    hyperlinkfree = re.sub(r'https?://\S+|www\.\S+', '', text)
    return hyperlinkfree

In [None]:

# Drop incomplete rows *before* cleaning: the cleaning helpers call re.sub,
# which raises TypeError on a NaN (float) cell.
train_data = train_data.dropna(axis=0)
valid_data = valid_data.dropna(axis=0)

# Strip hyperlinks first, then normalise what is left.
train_data['text'] = train_data['text'].apply(remove_hyperlinks).apply(clear_text)
valid_data['text'] = valid_data['text'].apply(remove_hyperlinks).apply(clear_text)

# NOTE(review): only `text` is cleaned; `selected_text` is left as-is, so it
# may no longer be an exact substring of the cleaned `text` - confirm this is
# intended.


In [None]:
# Rebind instead of mutating in place: the frames come out of
# train_test_split, and in-place edits on such derived frames can trigger
# pandas' SettingWithCopyWarning.
train_data = train_data.dropna(axis=0)
test_data = test_data.dropna(axis=0)
valid_data = valid_data.dropna(axis=0)

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting sets of tokens.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains any token the
        two (empty) sets are identical and ``1.0`` is returned instead of
        dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both inputs are empty / whitespace-only.
        return 1.0
    return float(len(c)) / union_size

In [None]:
# Force the text columns to str and renumber the rows 0..n-1 so that the
# encoding loops can address rows with .loc[i, ...]. reset_index() keeps the
# previous row labels in a new 'index' column.
train_data = train_data.astype({'text': str, 'selected_text': str}).reset_index()
valid_data = valid_data.astype({'text': str, 'selected_text': str}).reset_index()
# Only `text` is cast for the test split.
test_data = test_data.astype({'text': str}).reset_index()

In [None]:
def custom_loss(y_true, y_pred):
    """Mean categorical cross-entropy with label smoothing of 0.20.

    `y_pred` is treated as probabilities (``from_logits=False``), not logits.
    The per-example losses are averaged into a single scalar.
    """
    per_example = tf.keras.losses.categorical_crossentropy(
        y_true,
        y_pred,
        from_logits=False,
        label_smoothing=0.20,
    )
    return tf.reduce_mean(per_example)

In [None]:
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.layers import Input,Softmax,Dense,Activation,Dropout
from transformers import *
import tokenizers

### Code Explanation

This cell tokenizes the training tweets for the RoBERTa-based model. It first loads a byte-level BPE tokenizer from the RoBERTa vocabulary and merges files, which is used to encode the text. Each sentiment label (`positive`, `negative`, `neutral`) is then mapped to the ID of its first token so that the sentiment can be appended to every input sequence.

The training data is reset, and NumPy arrays are prepared to store tokenized input, attention masks, and position masks. These arrays are crucial for shaping the input data for model training.

The code then enters a loop, processing each training example. During this loop, it tokenizes and encodes the text and selected text, determining their positions in the encoded sequences. The input sequences are constructed, incorporating special tokens and sentiment information.

Additionally, binary masks are generated to identify the start and end positions of the selected text within the sequence. Overall, this code segment is indispensable for NLP tasks, ensuring that data is properly prepared for training sentiment analysis models.


In [None]:
# Define the maximum sequence length for tokenization
max_len = 128

# Initialize a ByteLevelBPETokenizer with RoBERTa vocabulary and merges
tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab='/kaggle/input/tf-roberta/vocab-roberta-base.json',
            merges='/kaggle/input/tf-roberta/merges-roberta-base.txt',
            lowercase=True,
            add_prefix_space=True
)

# Define sentiment IDs for 'positive', 'negative', and 'neutral'
# (the ID of the first token each word is encoded to)
sentiment_id = {'positive': tokenizer.encode('positive').ids[0],
                'negative': tokenizer.encode('negative').ids[0],
                'neutral': tokenizer.encode('neutral').ids[0]}

# Renumber the rows 0..n-1 so .loc[i, ...] below works.
# drop=True keeps this cell re-runnable: without it every run inserts another
# index column and eventually fails because the column already exists.
train_data.reset_index(drop=True, inplace=True)

# Prepare input data for training
tot_tw = train_data.shape[0]

# Initialize arrays to store tokenized input data, attention masks, and masks for start and end positions.
# input_ids is pre-filled with 1 (presumably the <pad> id - TODO confirm against the vocab).
input_ids = np.ones((tot_tw, max_len), dtype='int32')
attention_mask = np.zeros((tot_tw, max_len), dtype='int32')
token_type_ids = np.zeros((tot_tw, max_len), dtype='int32')
start_mask = np.zeros((tot_tw, max_len), dtype='int32')
end_mask = np.zeros((tot_tw, max_len), dtype='int32')

# Loop through each training example
for i in range(tot_tw):
    # Normalise whitespace; the text gets a leading space, the selected text does not
    set1 = " "+" ".join(train_data.loc[i,'text'].split())
    set2 = " ".join(train_data.loc[i,'selected_text'].split())

    # Character-level mask over set1: 1 where the selected text sits
    idx = set1.find(set2)
    set2_loc = np.zeros((len(set1)))
    # find() returns -1 when the selected text is not a substring of the text.
    # Using -1 as a slice index would mark unrelated characters, so such rows
    # (and empty selections) are left without a span.
    if set2 and idx != -1:
        set2_loc[idx:idx+len(set2)] = 1
        # Include the preceding space so the first token of the span is
        # covered by the mask.
        if set1[idx-1] == " ":
            set2_loc[idx-1] = 1

    # Tokenize and encode the text
    enc_set1 = tokenizer.encode(set1)

    selected_text_token_idx = []
    # A token belongs to the span if any of its characters is marked
    for k, (a, b) in enumerate(enc_set1.offsets):
        sm = np.sum(set2_loc[a:b])
        if sm > 0:
            selected_text_token_idx.append(k)

    # Get the sentiment token
    senti_token = sentiment_id[train_data.loc[i,'sentiment']]

    # Construct input sequence and update attention mask:
    # [0] text tokens [2, 2] sentiment [2]  -> 5 extra positions
    # (0 and 2 are presumably the <s> / </s> ids - TODO confirm)
    input_ids[i, :len(enc_set1.ids) + 5] = [0] + enc_set1.ids + [2, 2] + [senti_token] + [2]
    attention_mask[i, :len(enc_set1.ids) + 5] = 1

    # Update start and end masks if selected text tokens exist.
    # +1 accounts for the leading [0] token in input_ids.
    if len(selected_text_token_idx) > 0:
        start_mask[i, selected_text_token_idx[0] + 1] = 1
        end_mask[i, selected_text_token_idx[-1] + 1] = 1

# The code above processes and prepares the input data for training an NLP model for sentiment analysis.


### Validation Data Preprocessing

This section of code follows a similar pattern to the previous one but is tailored for preparing the validation data for a sentiment analysis model. The validation data is reset for consistency.

Arrays are initialized to store tokenized validation input, attention masks, and position masks, mirroring the structure used for training data.

Within a loop iterating through each validation example, text and selected text are processed and tokenized. The position of selected text within the text is determined, and the encoding process is applied.

Just like in the training data preprocessing, the code constructs input sequences by adding special tokens and sentiment information. Additionally, it generates start and end masks to identify the positions of selected text within the sequence.

This code is crucial for ensuring the validation data is appropriately formatted and ready for evaluation during the model training process.


In [None]:
# Renumber the validation rows 0..n-1 so .loc[i, ...] below works.
# drop=True keeps this cell re-runnable (no extra index column per run).
valid_data.reset_index(drop=True, inplace=True)

# Prepare input data for validation
tot_tw_val = valid_data.shape[0]

# Initialize arrays to store tokenized validation input data, attention masks, and masks for start and end positions
input_ids_val = np.ones((tot_tw_val, max_len), dtype='int32')
attention_mask_val = np.zeros((tot_tw_val, max_len), dtype='int32')
token_type_ids_val = np.zeros((tot_tw_val, max_len), dtype='int32')
start_mask_val = np.zeros((tot_tw_val, max_len), dtype='int32')
end_mask_val = np.zeros((tot_tw_val, max_len), dtype='int32')

# Loop through each validation example
for i in range(tot_tw_val):
    # Normalise whitespace; the text gets a leading space, the selected text does not
    set1 = " "+" ".join(valid_data.loc[i,'text'].split())
    set2 = " ".join(valid_data.loc[i,'selected_text'].split())

    # Character-level mask over set1: 1 where the selected text sits
    idx = set1.find(set2)
    set2_loc = np.zeros((len(set1)))
    # find() returns -1 when the selected text is not a substring of the text.
    # Using -1 as a slice index would mark unrelated characters, so such rows
    # (and empty selections) are left without a span.
    if set2 and idx != -1:
        set2_loc[idx:idx+len(set2)] = 1
        # Include the preceding space so the first token of the span is covered
        if set1[idx-1] == " ":
            set2_loc[idx-1] = 1

    # Tokenize and encode the text
    enc_set1 = tokenizer.encode(set1)

    selected_text_token_idx = []
    # A token belongs to the span if any of its characters is marked
    for k, (a, b) in enumerate(enc_set1.offsets):
        sm = np.sum(set2_loc[a:b])
        if sm > 0:
            selected_text_token_idx.append(k)

    # Get the sentiment token
    senti_token = sentiment_id[valid_data.loc[i,'sentiment']]

    # Construct input sequence and update attention mask for validation data:
    # [0] text tokens [2, 2] sentiment [2]  -> 5 extra positions
    input_ids_val[i, :len(enc_set1.ids) + 5] = [0] + enc_set1.ids + [2, 2] + [senti_token] + [2]
    attention_mask_val[i, :len(enc_set1.ids) + 5] = 1

    # Update start and end masks if selected text tokens exist.
    # +1 accounts for the leading [0] token in input_ids_val.
    if len(selected_text_token_idx) > 0:
        start_mask_val[i, selected_text_token_idx[0] + 1] = 1
        end_mask_val[i, selected_text_token_idx[-1] + 1] = 1

# The code above processes and prepares the input data for validation, similar to the training data preprocessing.


### Test Data Preprocessing

This section of code is dedicated to preparing the test data for evaluation with a sentiment analysis model. It begins by resetting the index of the test data for consistency.

Arrays are initialized to store tokenized test input data, attention masks, and token type IDs, following a structure similar to the one used for training and validation data.

Within a loop that iterates through each test example, the code preprocesses the text data by adding spaces and tokenizes it. Sentiment information is obtained and used to construct the input sequence by adding special tokens.

Additionally, attention masks are generated to indicate which tokens should be attended to during evaluation.

The purpose of this code segment is to ensure that the test data is appropriately formatted and ready for use in evaluating the sentiment analysis model's performance.


In [None]:
# Renumber the test rows 0..n-1 so .loc[i, ...] below works.
# drop=True keeps this cell re-runnable (no extra index column per run).
test_data.reset_index(drop=True, inplace=True)

# Get the total number of test examples
tot_test_tw = test_data.shape[0]

# Initialize arrays to store tokenized test input data, attention masks, and token type IDs
input_ids_t = np.ones((tot_test_tw, max_len), dtype='int32')
attention_mask_t = np.zeros((tot_test_tw, max_len), dtype='int32')
token_type_ids_t = np.zeros((tot_test_tw, max_len), dtype='int32')

# Loop through each test example
for i in range(tot_test_tw):
    # Normalise whitespace and add the leading space, as for the training text
    set1 = " " + " ".join(test_data.loc[i, 'text'].split())

    # Tokenize and encode the text
    enc_set1 = tokenizer.encode(set1)

    # Get the sentiment token
    s_token = sentiment_id[test_data.loc[i, 'sentiment']]

    # Construct input sequence and update attention mask for test data:
    # [0] text tokens [2, 2] sentiment [2]  -> 5 extra positions
    input_ids_t[i, :len(enc_set1.ids) + 5] = [0] + enc_set1.ids + [2, 2] + [s_token] + [2]
    attention_mask_t[i, :len(enc_set1.ids) + 5] = 1


### Model Architecture Definition

This Python function defines the architecture of a sentiment analysis model. It takes token IDs, attention masks, and token type IDs as input.

First, it loads a pre-trained RoBERTa model and its configuration. The input data is passed through RoBERTa to obtain contextual embeddings.

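Two branches for span prediction are created on top of the RoBERTa output: one predicts the start position of the selected text (`x1`) and the other predicts its end position (`x2`). Each branch consists of a dropout layer, two convolutional layers with an activation in between, a dense layer, a flatten layer, and a final softmax.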

- Each branch starts with a dropout layer to prevent overfitting.
- Convolutional layers with different filters and kernel sizes are used to capture features from the RoBERTa embeddings.
- Leaky ReLU activation functions introduce non-linearity.
- Dense layers with a single neuron are used for each branch.
- Flatten layers prepare the output for the final activation function.

The model is then defined with the specified inputs and outputs. For every example it returns two probability distributions over the `max_len` token positions: one for the start and one for the end of the selected text.

This code defines the architecture for a sentiment analysis model, which can be trained and evaluated on the provided data.


In [None]:
def build_model():
    """Build the RoBERTa-base model with a start head and an end head.

    Inputs (each of length ``max_len``, int32): token IDs, attention mask,
    token type IDs. Outputs: two softmax distributions over the ``max_len``
    positions - the first for the span start, the second for the span end.

    Returns:
        An uncompiled ``tf.keras.models.Model``.
    """
    # Token IDs, attention mask and token type IDs, created in that order
    ids, att, tok = (
        tf.keras.layers.Input((max_len,), dtype=tf.int32) for _ in range(3)
    )

    def _position_head(hidden_states):
        """Dropout -> Conv1D(128) -> LeakyReLU -> Conv1D(64) -> Dense(1) -> softmax over positions."""
        h = tf.keras.layers.Dropout(0.05)(hidden_states)
        h = tf.keras.layers.Conv1D(128, 2, padding='same')(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Conv1D(64, 2, padding='same')(h)
        h = tf.keras.layers.Dense(1)(h)
        # One score per position; flatten to (batch, max_len) before the softmax
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    # Pre-trained RoBERTa weights plus the matching configuration object
    config_path = RobertaConfig.from_pretrained('/kaggle/input/tf-roberta/config-roberta-base.json')
    roberta_model = TFRobertaModel.from_pretrained(
        '/kaggle/input/tf-roberta/pretrained-roberta-base.h5',
        config=config_path,
    )

    # x[0] is the first element of the RoBERTa output and feeds both heads
    x = roberta_model(ids, attention_mask=att, token_type_ids=tok)

    x1 = _position_head(x[0])  # start position
    x2 = _position_head(x[0])  # end position

    return tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1, x2])


In [None]:
model_roberta = build_model()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5) 
model_roberta.compile(loss=custom_loss, optimizer=optimizer)

In [None]:
model_roberta.summary()

In [None]:
tf.keras.utils.plot_model(model_roberta, 'Model.png',show_shapes=True)

### Data Preparation and Dataset Creation

In this code, the input data for training, validation, and testing is organized and converted into TensorFlow datasets.

- For training, `input_data` contains token IDs, attention masks, and token type IDs, while `output_data` contains start and end position masks. A TensorFlow dataset is created from these inputs and outputs, and it's shuffled and batched to facilitate training. The `buffer_size` controls the shuffling.

- For validation, `input_data_val` and `output_data_val` are similarly organized and used to create a TensorFlow dataset. This dataset is not shuffled as it's typically used for evaluation.

- For testing, `input_data_test` is prepared to hold token IDs, attention masks, and token type IDs for the test data.

These datasets are essential for training, validating, and testing the sentiment analysis model, and they ensure that the data is efficiently processed in batches during training and evaluation.


In [None]:
# Prepare training input and output data
input_data = (input_ids, attention_mask, token_type_ids)
output_data = (start_mask, end_mask)

# Create a TensorFlow dataset from the training data, shuffling and batching it.
# The shuffle buffer covers the whole training set: a buffer smaller than the
# dataset only mixes examples that are close to each other, so each epoch
# would see nearly the same ordering.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((input_data, output_data))
    .shuffle(buffer_size=len(input_ids))
    .batch(32)
)

# Prepare validation input and output data
input_data_val = (input_ids_val, attention_mask_val, token_type_ids_val)
output_data_val = (start_mask_val, end_mask_val)

# Create a TensorFlow dataset from the validation data, batching it (no shuffling)
valid_dataset = tf.data.Dataset.from_tensor_slices((input_data_val, output_data_val)).batch(32)

# Prepare test input data
input_data_test = (input_ids_t, attention_mask_t, token_type_ids_t)


In [None]:
model_history = model_roberta.fit(train_dataset, validation_data = valid_dataset, epochs=20)

In [None]:
start_pred , end_pred = model_roberta.predict([input_ids_t, attention_mask_t, token_type_ids_t])
start_pred.shape,end_pred.shape

### Generating Predicted Selected Text

This code segment is responsible for generating predictions for the selected text in the test data based on the model's start and end position predictions.

- An empty list named `all` is initialized to store the predicted selected text for each test example.

- The code iterates through each test example, obtaining the positions of the start and end tokens with the highest predicted probabilities.

- If the start position is greater than the end position, it implies that the model predicts no valid span within the text. In this case, the entire original text is considered as the predicted selected text.

- If there's a valid span (start position is less than or equal to the end position), the code reconstructs the selected text by tokenizing the original text, selecting the corresponding tokens, and decoding them.

- The predicted selected text is then appended to the `all` list.

- Finally, the predicted selected text is added as a new column ('pred_selected_text') in the `test_data` DataFrame, making it available for evaluation and comparison with the ground truth selected text.

This code is essential for evaluating the model's performance in predicting selected text in the test dataset.


In [None]:
# Initialize an empty list to store predicted selected text
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of
# the notebook; it is kept because the surrounding text refers to it.
all = []

# Loop through each test example
for k in range(input_ids_t.shape[0]):
    # Find the start and end positions with the highest probabilities
    a = np.argmax(start_pred[k,])
    b = np.argmax(end_pred[k,])

    # Check if the start position is greater than the end position
    if a > b:
        # If so, the selected text is the entire original text
        st = test_data.loc[k,'text']
    else:
        # If not, reconstruct the selected text from tokenized input
        text1 = " " + " ".join(test_data.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        # Model position p corresponds to enc.ids[p - 1] because of the
        # leading [0] token. Clamp at 0: if the start lands on position 0,
        # a - 1 would be -1 and slice from the *end* of the token list.
        st = tokenizer.decode(enc.ids[max(a - 1, 0):b])

    # Append the predicted selected text to the list
    all.append(st)

# Add the predicted selected text to the test_data DataFrame
test_data['pred_selected_text'] = all


In [None]:
test_data

In [None]:
# Word-level Jaccard score between the true and the predicted selection, row by row
scores = [
    jaccard(test_data['selected_text'][i], test_data['pred_selected_text'][i])
    for i in tqdm(range(len(test_data)))
]

In [None]:
test_data['jaccard_scores'] = scores  

In [None]:
test_data.sample(20)

In [None]:
print('Mean jaccard score for neutral data:',test_data[test_data.sentiment =='neutral']['jaccard_scores'].mean())
print('Mean jaccard score for positive data:',test_data[test_data.sentiment =='positive']['jaccard_scores'].mean())
print('Mean jaccard score for negative data:',test_data[test_data.sentiment =='negative']['jaccard_scores'].mean())

In [None]:
print('Mean jaccard score for all data:',test_data['jaccard_scores'].mean())

In [None]:
test_data[test_data['jaccard_scores'] == 1]

In [None]:
test_data[test_data['jaccard_scores'] != 1]

In [None]:
# summarize history for loss
plt.plot(model_history.history['loss'])
plt.plot(model_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['loss','val_loss'], loc='upper left')
plt.show()

In [None]:
model_roberta.save_weights('/kaggle/working/roberta-base-1')

In [None]:
import pickle 
with open('/kaggle/working/roberta_hist', 'wb') as file_pi:
    pickle.dump(model_history.history, file_pi)

In [None]:
import seaborn as sns

# sns.distplot is deprecated; histplot with kde=True and stat='density' is its
# documented replacement (cut=3 reproduces distplot's KDE extent).
sns.histplot(x=test_data['jaccard_scores'], kde=True, stat='density',
             color = 'darkblue',
             edgecolor='black',
             kde_kws={'cut': 3},
             line_kws={'linewidth': 4})

plt.title('Density Plot of jaccard scores')
plt.xlabel('jaccard score')
plt.ylabel('Density')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sentiments = ['positive', 'negative', 'neutral']

for sentiment in sentiments:
    # Subset to the rows with this sentiment
    subset = test_data[test_data['sentiment'] == sentiment]

    # Draw the density plot. kdeplot(fill=True) replaces the deprecated
    # distplot(hist=False, kde_kws={'shade': True}).
    sns.kdeplot(x=subset["jaccard_scores"], fill=True, linewidth=3,
                label=sentiment)

plt.legend(prop={'size': 16}, title='sentiment')
plt.title('Density Plot of jaccard scores for each sentiment')
plt.xlabel('jaccard score')
plt.ylabel('Density')

plt.show()

# Roberta Large

In [None]:
# Reload the raw data so this section starts from unmodified frames
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
# Fixed seeds keep the train / validation / test partitions identical across
# kernel restarts, so the reported Jaccard scores are reproducible.
train_data, test_data = train_test_split(train, test_size = 0.2, random_state=42)
train_data, valid_data = train_test_split(train_data, test_size=0.05, random_state=42)
# Slang lookup table: acronym -> expansion.
abbreviations = pd.read_csv("/kaggle/input/chat-slang-abbreviations-acronyms/slang/slang.csv")
abrevtn_dict   = dict(zip(abbreviations.acronym, abbreviations.expansion))

In [None]:
# Rebind instead of mutating in place: the frames come out of
# train_test_split, and in-place edits on such derived frames can trigger
# pandas' SettingWithCopyWarning.
train_data = train_data.dropna(axis=0)
test_data = test_data.dropna(axis=0)
valid_data = valid_data.dropna(axis=0)

In [None]:
# Force the text columns of the training and validation splits to str
train_data = train_data.astype({'text': str, 'selected_text': str})
valid_data = valid_data.astype({'text': str, 'selected_text': str})

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel, RobertaConfig
#model = TFRobertaModel.from_pretrained('roberta-large')
import tokenizers

### Larger Tokenizer Data Preprocessing

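This code segment repeats the data preprocessing for the RoBERTa-large model. The tokenizer is loaded from the vocabulary and merges files shipped with `roberta-large`; "larger" in the headings below refers to the model, and the tokenizer is configured in the same way as before.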

- The larger tokenizer is initialized with its own vocabulary and merges information.

- Sentiment labels are mapped to their corresponding token IDs using the larger tokenizer.

- The training data index is reset for consistency.

- Input data arrays for tokenized input, attention masks, and position masks are initialized.

- The code proceeds to preprocess each training example using the larger tokenizer. It tokenizes and encodes the text and selected text, determining their positions in the encoded sequences.

- Sentiment tokens are obtained using the larger tokenizer.

- Input sequences are constructed with special tokens and sentiment information, and attention masks are updated accordingly.

- Start and end masks are generated if selected text tokens exist.

This code segment prepares the input data for training, taking into account the specifics of the larger tokenizer's vocabulary and configuration.


In [None]:
# Define the maximum sequence length for tokenization
max_len = 128

# Initialize a ByteLevelBPETokenizer from the roberta-large vocabulary and merges files
tokenizer_large = tokenizers.ByteLevelBPETokenizer(
            vocab='/kaggle/input/robertalarge/vocab.json',
            merges='/kaggle/input/robertalarge/merges.txt',
            lowercase=True,
            add_prefix_space=True
)

# Define sentiment IDs for 'positive', 'negative', and 'neutral'
# (the ID of the first token each word is encoded to).
# 'positive' must be encoded from the word "positive" itself; encoding an
# unrelated string would feed a wrong sentiment token for every positive tweet.
sentiment_id_large = {'positive': tokenizer_large.encode('positive').ids[0],
                'negative': tokenizer_large.encode('negative').ids[0],
                'neutral': tokenizer_large.encode('neutral').ids[0]}

# Renumber the rows 0..n-1 so .loc[i, ...] below works.
# drop=True keeps this cell re-runnable (no extra index column per run).
train_data.reset_index(drop=True, inplace=True)

# input data formatting for training
tot_tw = train_data.shape[0]

# Initialize arrays to store tokenized input data, attention masks, and position masks
input_ids_large = np.ones((tot_tw, max_len), dtype='int32')
attention_mask_large = np.zeros((tot_tw, max_len), dtype='int32')
token_type_ids_large = np.zeros((tot_tw, max_len), dtype='int32')
start_mask_large = np.zeros((tot_tw, max_len), dtype='int32')
end_mask_large = np.zeros((tot_tw, max_len), dtype='int32')

# Loop through each training example
for i in range(tot_tw):
    # Normalise whitespace; the text gets a leading space, the selected text does not
    set1 = " " + " ".join(train_data.loc[i, 'text'].split())
    set2 = " ".join(train_data.loc[i, 'selected_text'].split())

    # Character-level mask over set1: 1 where the selected text sits
    idx = set1.find(set2)
    set2_loc = np.zeros((len(set1)))
    # find() returns -1 when the selected text is not a substring of the text.
    # Using -1 as a slice index would mark unrelated characters, so such rows
    # (and empty selections) are left without a span.
    if set2 and idx != -1:
        set2_loc[idx:idx+len(set2)] = 1
        # Include the preceding space so the first token of the span is covered
        if set1[idx-1] == " ":
            set2_loc[idx-1] = 1

    # Tokenize and encode the text using the roberta-large tokenizer
    enc_set1 = tokenizer_large.encode(set1)

    selected_text_token_idx = []
    # A token belongs to the span if any of its characters is marked
    for k, (a, b) in enumerate(enc_set1.offsets):
        sm = np.sum(set2_loc[a:b])
        if sm > 0:
            selected_text_token_idx.append(k)

    # Get the sentiment token and build the sequence:
    # [0] text tokens [2, 2] sentiment [2]  -> 5 extra positions
    senti_token = sentiment_id_large[train_data.loc[i, 'sentiment']]
    input_ids_large[i, :len(enc_set1.ids) + 5] = [0] + enc_set1.ids + [2, 2] + [senti_token] + [2]
    attention_mask_large[i, :len(enc_set1.ids) + 5] = 1

    # Update start and end masks if selected text tokens exist.
    # +1 accounts for the leading [0] token in input_ids_large.
    if len(selected_text_token_idx) > 0:
        start_mask_large[i, selected_text_token_idx[0] + 1] = 1
        end_mask_large[i, selected_text_token_idx[-1] + 1] = 1


### Larger Tokenizer Data Preparation for Validation

This code segment is responsible for preparing the validation data using the larger tokenizer. It ensures that the validation data is properly tokenized and formatted for model evaluation.

- The validation data index is reset for consistency.

- Arrays are initialized to store tokenized validation input data, attention masks, and position masks, similar to the training data.

- The code iterates through each validation example, preprocessing the text and selected text in the same manner as for training data.

- Tokenization and encoding are performed using the larger tokenizer, and the positions of selected text tokens are identified.

- Sentiment tokens are obtained using the larger tokenizer.

- Input sequences are constructed with special tokens and sentiment information, and attention masks are updated accordingly.

- Start and end masks are generated if selected text tokens exist for each validation example.

This code ensures that the validation data is appropriately processed and formatted for evaluation with the model using the larger tokenizer.


In [None]:
# Renumber the validation rows 0..n-1 so .loc[i, ...] below works.
# drop=True keeps this cell re-runnable (no extra index column per run).
valid_data.reset_index(drop=True, inplace=True)

# Get the total number of validation examples
tot_tw_val = valid_data.shape[0]

# Initialize arrays to store tokenized validation input data, attention masks, and position masks
input_ids_val_large = np.ones((tot_tw_val, max_len), dtype='int32')
attention_mask_val_large = np.zeros((tot_tw_val, max_len), dtype='int32')
token_type_ids_val_large = np.zeros((tot_tw_val, max_len), dtype='int32')
start_mask_val_large = np.zeros((tot_tw_val, max_len), dtype='int32')
end_mask_val_large = np.zeros((tot_tw_val, max_len), dtype='int32')

# Loop through each validation example
for i in range(tot_tw_val):
    # Normalise whitespace; the text gets a leading space, the selected text does not
    set1 = " " + " ".join(valid_data.loc[i, 'text'].split())
    set2 = " ".join(valid_data.loc[i, 'selected_text'].split())

    # Character-level mask over set1: 1 where the selected text sits
    idx = set1.find(set2)
    set2_loc = np.zeros((len(set1)))
    # find() returns -1 when the selected text is not a substring of the text.
    # Using -1 as a slice index would mark unrelated characters, so such rows
    # (and empty selections) are left without a span.
    if set2 and idx != -1:
        set2_loc[idx:idx+len(set2)] = 1
        # Include the preceding space so the first token of the span is covered
        if set1[idx-1] == " ":
            set2_loc[idx-1] = 1

    # Tokenize and encode the text using the roberta-large tokenizer
    enc_set1 = tokenizer_large.encode(set1)

    selected_text_token_idx = []
    # A token belongs to the span if any of its characters is marked
    for k, (a, b) in enumerate(enc_set1.offsets):
        sm = np.sum(set2_loc[a:b])
        if sm > 0:
            selected_text_token_idx.append(k)

    # Get the sentiment token and build the sequence:
    # [0] text tokens [2, 2] sentiment [2]  -> 5 extra positions
    senti_token = sentiment_id_large[valid_data.loc[i, 'sentiment']]
    input_ids_val_large[i, :len(enc_set1.ids) + 5] = [0] + enc_set1.ids + [2, 2] + [senti_token] + [2]
    attention_mask_val_large[i, :len(enc_set1.ids) + 5] = 1

    # Update start and end masks if selected text tokens exist.
    # +1 accounts for the leading [0] token in input_ids_val_large.
    if len(selected_text_token_idx) > 0:
        start_mask_val_large[i, selected_text_token_idx[0] + 1] = 1
        end_mask_val_large[i, selected_text_token_idx[-1] + 1] = 1


### Larger Tokenizer Data Preparation for Testing

This code segment prepares the test data for evaluation using the larger tokenizer. It ensures that the test data is properly tokenized and formatted for model predictions.

- The test data index is reset for consistency.

- Arrays are initialized to store tokenized test input data, attention masks, and token type IDs, similar to the training and validation data.

- The code iterates through each test example, preprocessing the text in the same manner as for training and validation data.

- Tokenization and encoding are performed using the larger tokenizer.

- Sentiment tokens are obtained using the larger tokenizer.

- Input sequences are constructed with special tokens and sentiment information, and attention masks are updated accordingly.

This code segment ensures that the test data is appropriately processed and formatted for making predictions with the model using the larger tokenizer.


In [None]:
# Renumber the test rows 0..n-1 so .loc[i, ...] below works.
# drop=True keeps this cell re-runnable (no extra index column per run).
test_data.reset_index(drop=True, inplace=True)

# Get the total number of test examples
tot_test_tw = test_data.shape[0]

# Initialize arrays to store tokenized test input data, attention masks, and token type IDs
input_ids_t_large = np.ones((tot_test_tw, max_len), dtype='int32')
attention_mask_t_large = np.zeros((tot_test_tw, max_len), dtype='int32')
token_type_ids_t_large = np.zeros((tot_test_tw, max_len), dtype='int32')

# Loop through each test example
for i in range(tot_test_tw):
    # Normalise whitespace and add the leading space, as for the training text
    set1 = " " + " ".join(test_data.loc[i, 'text'].split())

    # Tokenize and encode the text using the roberta-large tokenizer
    enc_set1 = tokenizer_large.encode(set1)

    # Get the sentiment token
    s_token = sentiment_id_large[test_data.loc[i, 'sentiment']]

    # Update input sequences and attention masks:
    # [0] text tokens [2, 2] sentiment [2]  -> 5 extra positions
    input_ids_t_large[i, :len(enc_set1.ids) + 5] = [0] + enc_set1.ids + [2, 2] + [s_token] + [2]
    attention_mask_t_large[i, :len(enc_set1.ids) + 5] = 1


### Building a Model with Larger Tokenizer

This code defines a function to build a neural model for text sentiment analysis using a larger tokenizer and pre-trained Roberta architecture.

- The function takes three inputs: token IDs, attention masks, and token type IDs.

- A configuration for the Roberta model is initialized.

- The pre-trained `roberta-large` model is loaded by name with `TFRobertaModel.from_pretrained('roberta-large')`.

- Tokenized inputs are passed through the model, resulting in encoded representations.

- Two sub-models are defined for predicting start and end positions. These sub-models consist of convolutional layers, dropout, activation functions, and dense layers.

- The final model is created by specifying the input and output layers.

This code segment encapsulates the creation of a neural model suitable for sentiment analysis with a larger tokenizer, leveraging pre-trained representations from 'roberta-large'.


In [None]:
# Define a function to build the RoBERTa-large model
def build_model_large():
    """Build the RoBERTa-large model with a start head and an end head.

    Inputs (each of length ``max_len``, int32): token IDs, attention mask,
    token type IDs. Outputs: two softmax distributions over the ``max_len``
    positions - the first for the span start, the second for the span end.

    Returns:
        An uncompiled ``tf.keras.models.Model``.
    """
    # Token IDs, attention mask and token type IDs, created in that order
    ids, att, tok = (
        tf.keras.layers.Input((max_len,), dtype=tf.int32) for _ in range(3)
    )

    def _position_head(hidden_states):
        """Dropout -> Conv1D(128) -> LeakyReLU -> Conv1D(64) -> Dense(1) -> softmax over positions."""
        h = tf.keras.layers.Dropout(0.05)(hidden_states)
        # 128 filters; 2 is the kernel size of each filter
        h = tf.keras.layers.Conv1D(128, 2, padding='same')(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Conv1D(64, 2, padding='same')(h)
        h = tf.keras.layers.Dense(1)(h)
        # One score per position; flatten to (batch, max_len) before the softmax
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    # Initialize a configuration for the Roberta model
    # NOTE(review): this default config is never passed to from_pretrained
    # below, so it has no effect on the loaded model.
    config_path = RobertaConfig()

    # Load the pre-trained roberta-large model by name
    roberta_model_large = TFRobertaModel.from_pretrained('roberta-large')

    # x[0] is the first element of the RoBERTa output and feeds both heads
    x = roberta_model_large(ids, attention_mask=att, token_type_ids=tok)

    x1 = _position_head(x[0])  # start position
    x2 = _position_head(x[0])  # end position

    # Create the model with inputs and outputs
    return tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1, x2])


In [None]:
# Instantiate the network; build_model_large() loads the pre-trained 'roberta-large' weights.
model_roberta_large = build_model_large()

In [None]:
# Adam optimizer with a learning rate of 3e-5
optimizer_large = tf.keras.optimizers.Adam(learning_rate=3e-5) 
# Compile with `custom_loss`, which is defined outside this section of the notebook
model_roberta_large.compile(loss=custom_loss, optimizer=optimizer_large)

In [None]:
# Print the layer-by-layer summary of the compiled model
model_roberta_large.summary()

In [None]:
# Write a diagram of the model graph, including tensor shapes, to 'Model_large.png'
tf.keras.utils.plot_model(model_roberta_large, 'Model_large.png',show_shapes=True)

### Data Preparation and Dataset Creation for Larger Tokenizer

This code segment prepares the data input and output for the model using the larger tokenizer and creates datasets for training, validation, and testing.

- Input and output data are organized into tuples for training and validation with the larger tokenizer.

- A training dataset is created by slicing the input and output data tensors, shuffling the data, and batching it into smaller groups (batch size 16 in this case) for efficient training.

- Similar steps are followed to create a validation dataset from tensor slices, where shuffling is not necessary.

- Input data for testing with the larger tokenizer is also defined as a tuple.

This code ensures that the data is appropriately formatted and organized for training, validation, and testing, facilitating the training and evaluation of the model with the larger tokenizer.


In [None]:
# Batch size shared by the training and validation pipelines
batch_size_large = 16

# Define input and output data for the larger tokenizer
input_data_large = (input_ids_large, attention_mask_large, token_type_ids_large)
output_data_large = (start_mask_large, end_mask_large)

# Create the training dataset from tensor slices, then shuffle and batch it.
# The shuffle buffer is sized to the whole training set: a buffer smaller than
# the dataset only mixes examples within a sliding window, so the order would
# stay close to the original order in every epoch.
train_dataset_large = (
    tf.data.Dataset.from_tensor_slices((input_data_large, output_data_large))
    .shuffle(buffer_size=len(input_ids_large))
    .batch(batch_size_large)
)

# Define input and output data for validation using the larger tokenizer
input_data_val_large = (input_ids_val_large, attention_mask_val_large, token_type_ids_val_large)
output_data_val_large = (start_mask_val_large, end_mask_val_large)

# Create the validation dataset from tensor slices and batch it (no shuffling)
valid_dataset_large = (
    tf.data.Dataset.from_tensor_slices((input_data_val_large, output_data_val_large))
    .batch(batch_size_large)
)

# Define input data for testing with the larger tokenizer
input_data_test_large = (input_ids_t_large, attention_mask_t_large, token_type_ids_t_large)


In [None]:
# Train for 20 epochs with the validation dataset passed for evaluation. The returned
# History object is used further down for the loss curves and is pickled to disk.
# NOTE(review): no callbacks are passed, so the weights kept are those of the final
# epoch rather than the best validation epoch — confirm this is intended.
model_history_large = model_roberta_large.fit(train_dataset_large, validation_data = valid_dataset_large, epochs=20)

In [None]:
# Run the trained model on the test inputs; the two model outputs are unpacked as start and end predictions
start_pred , end_pred = model_roberta_large.predict([input_ids_t_large, attention_mask_t_large, token_type_ids_t_large])
# Display both shapes as a sanity check (presumably (n_test, max_len) each — verify)
start_pred.shape,end_pred.shape

### Generating Predictions with Larger Tokenizer

This code segment is responsible for generating predictions for the selected text using the larger tokenizer and the model's output.

- An empty list is initialized to store the predicted selected text for each example.

- The code iterates through each example in the test data and determines the predicted start and end positions by finding the indices of maximum values in the model's output.

- If the predicted start position is greater than the end position, it implies that the model did not find a valid selected text span, so the entire text is used as the predicted selected text.

- Otherwise, the code re-tokenizes the text using the larger tokenizer and decodes the tokens between the predicted start and end positions to obtain the predicted selected text.

- The predicted selected text is appended to the list.

- Finally, the predicted selected text is added as a new column to the test_data DataFrame.

This code is essential for generating predictions for the test data using the model and larger tokenizer, which can be further evaluated for accuracy.


In [None]:
# Predicted selected text for every test example, in row order.
# The list has a descriptive name so it does not rebind the built-in `all`.
pred_selected_texts = []

# Loop through each example in the test data
for k in range(input_ids_t_large.shape[0]):
    # Most probable start and end positions within the padded input row
    a = int(np.argmax(start_pred[k]))
    b = int(np.argmax(end_pred[k]))

    if a > b:
        # Start after end is not a usable span: fall back to the entire text
        st = test_data.loc[k, 'text']
    else:
        # Re-tokenize the text exactly as it was prepared for the model input
        text1 = " " + " ".join(test_data.loc[k, 'text'].split())
        enc = tokenizer_large.encode(text1)
        # Each input row begins with one extra leading token, so input position p
        # corresponds to text token p - 1. The start is clamped at 0: without the
        # clamp, a predicted start of 0 would turn into index -1 and the slice
        # would begin at the last text token.
        st = tokenizer_large.decode(enc.ids[max(a - 1, 0):b])

    # Append the predicted selected text to the list
    pred_selected_texts.append(st)

# Add the predicted selected text to the test_data DataFrame
test_data['pred_selected_text'] = pred_selected_texts


In [None]:
# Jaccard score between the reference and the predicted selected text, one per test row
scores = [
    jaccard(test_data.loc[i, 'selected_text'], test_data.loc[i, 'pred_selected_text'])
    for i in tqdm(range(len(test_data)))
]

In [None]:
# Store the per-example Jaccard scores as a new column of test_data
test_data['jaccard_scores'] = scores 

In [None]:
# Show 20 randomly chosen test rows to compare predictions with the reference text.
# NOTE(review): no random_state is passed, so the displayed rows differ on every run.
test_data.sample(20)

In [None]:
# Mean Jaccard score of the test rows belonging to each sentiment class
for label, caption in (
    ('neutral', 'Mean jaccard score for neutral data:'),
    ('positive', 'Mean jaccard score for positive data:'),
    ('negative', 'Mean jaccard score for negative data:'),
):
    print(caption, test_data.loc[test_data['sentiment'] == label, 'jaccard_scores'].mean())

In [None]:
# Overall mean Jaccard score across every test row, regardless of sentiment
print('Mean jaccard score for all data:',test_data['jaccard_scores'].mean())

In [None]:
# Training and validation loss per epoch, drawn on one set of axes
fig, ax = plt.subplots()
for curve in ('loss', 'val_loss'):
    ax.plot(model_history_large.history[curve])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['loss','val_loss'], loc='upper left')
plt.show()

In [None]:
# Save the trained weights under the Kaggle working directory.
# NOTE(review): the path has no file extension, so the on-disk format is whatever
# Keras picks by default for such paths — confirm before relying on a reload.
model_roberta_large.save_weights('/kaggle/working/roberta-large-1')

In [None]:
import pickle 
# Persist the training history dict (not the History object itself) for later analysis
history_path = '/kaggle/working/roberta_large_hist'
with open(history_path, 'wb') as history_file:
    pickle.dump(model_history_large.history, history_file)

In [None]:
import seaborn as sns

# Distribution of the per-example Jaccard scores: density-normalised histogram
# with a KDE curve on top. `sns.distplot` is deprecated in seaborn;
# `histplot(..., kde=True, stat='density')` is its documented replacement.
# `cut=3` matches the KDE extent distplot used.
sns.histplot(test_data['jaccard_scores'], kde=True, stat='density',
             color='darkblue',
             edgecolor='black',
             line_kws={'linewidth': 4},
             kde_kws={'cut': 3})

plt.title('Density Plot of jaccard scores')
plt.xlabel('jaccard score')
plt.ylabel('Density')

# Render the figure explicitly so the cell does not end on a bare Text repr
plt.show()

In [None]:
import matplotlib.pyplot as plt

sentiments = ['positive', 'negative', 'neutral']

for sentiment in sentiments:
    # Subset to the test rows of this sentiment
    subset = test_data[test_data['sentiment'] == sentiment]

    # Draw the filled density curve for this sentiment. `sns.distplot` and the
    # `shade` keyword are deprecated in seaborn; `kdeplot(..., fill=True)` is
    # the supported equivalent.
    sns.kdeplot(subset["jaccard_scores"], fill=True, linewidth=3,
                label=sentiment)

plt.legend(prop={'size': 16}, title='sentiment')
plt.title('Density Plot of jaccard scores for each sentiment')
plt.xlabel('jaccard score')
plt.ylabel('Density')

plt.show()