# Overview

In [None]:
#import 
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
import os
import seaborn as sn
import matplotlib.pyplot as plt
import re
import string
print('TF version',tf.__version__)

In [None]:
# List the entries of the ../input/ directory (displayed as the cell output).
os.listdir('../input/')

In [None]:
# Choose the tf.distribute strategy that matches the available hardware.
try:
    # TPUClusterResolver() takes no arguments here; it raises ValueError when
    # no TPU can be resolved, which is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: fall back to TensorFlow's default strategy.
    strategy = tf.distribute.get_strategy()
else:
    # TPU found: connect to it, initialise it, then build a TPU strategy on it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

## Reading Datasets

In [None]:
# Load the three competition CSVs: training data, test data and the sample
# submission file.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
sample = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
# Report the (rows, columns) shape of each frame.
print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)
print('Testing sample shape: ', sample.shape)

# First few rows of the training dataset (shown as the cell output)
train.head()

In [None]:
# First few rows of the testing dataset (shown as the cell output)
test.head()

In [None]:
# First few rows of the sample submission file
sample.head()

## Data exploration

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the training frame
train.info()

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the test frame
test.info()

In [None]:
### Check for null values: number of missing entries per column of the test set
test.isnull().sum()

In [None]:
# Number of missing entries per column of the training set
train.isnull().sum()

In [None]:
### Remove the textID column from both frames and discard incomplete training rows
train.drop(columns=['textID'], inplace=True)
# dropna defaults to axis=0, how='any': a row is removed if any of its values is missing.
train.dropna(inplace=True)
test.drop(columns=['textID'], inplace=True)

## Visualisation

In [None]:
### Distribution of sentiment
# Count plot of the `sentiment` column of the training set, drawn with the
# seaborn "darkgrid" style.
sn.set(style="darkgrid")
sn.countplot(x=train["sentiment"])
plt.title("Training_set")

In [None]:
# Same count plot of the `sentiment` column, this time for the test set.
sn.set(style="darkgrid")
sn.countplot(x=test["sentiment"])
plt.title("Test_set")

In [None]:
### Text length
# Split the training frame by sentiment label.
positive = train[train["sentiment"] == "positive"]
neutral = train[train["sentiment"] == "neutral"]
negative = train[train["sentiment"] == "negative"]

# One figure per sentiment: distribution of the whitespace-separated word
# count of `text`. Order and axis labels are the same as plotting each
# subset by hand.
for _subset, _label in (
    (positive, "length of text in having postive sentiment"),
    (negative, "length of text in having negative sentiment"),
    (neutral, "length of text in having neutral sentiment"),
):
    sn.distplot(_subset["text"].str.split().str.len(), axlabel=_label)
    plt.show()

## Clean text

In [None]:
def cleaning(txt: str) -> str:
    """Return a lower-cased copy of *txt* with noisy fragments removed.

    Steps, applied in this order:
      1. lower-case the text;
      2. delete URLs (``http://…``, ``https://…`` and ``www.…``);
      3. replace every newline with a single space;
      4. delete every word that contains a digit;
      5. delete ``<…>`` tag-like fragments.

    Punctuation is left untouched and removed fragments are not replaced, so
    the result may contain runs of consecutive spaces.
    """
    # Raw strings keep `\S`, `\w`, `\d` and `\.` as regex escapes; in a plain
    # string literal they are invalid escape sequences and trigger warnings.
    txt = txt.lower()
    txt = re.sub(r'https?://\S+|www\.\S+', '', txt)
    txt = txt.replace("\n", " ")
    txt = re.sub(r'\w*\d\w*', '', txt)
    txt = re.sub(r'<.*?>+', '', txt)
    return txt

In [None]:
# Cast every `text` entry to str, then run it through cleaning() (train and test).
train['text'] = train['text'].apply(str).apply(cleaning)
test['text'] = test['text'].apply(str).apply(cleaning)

In [None]:
### Apply the same cleaning to the target column `selected_text`
train['selected_text'] = train['selected_text'].apply(str).apply(cleaning)

In [None]:
# Inspect the first rows of the training frame after cleaning
train.head()

# Tokenizer

In [None]:
# Width (number of token positions) of every encoded input row.
MAX_LEN = 96
# Directory holding the RoBERTa vocab/merges files used below (the model
# config and weights are loaded from the same directory in build_model).
PATH = '../input/tf-roberta/'
# Byte-level BPE tokenizer built from the RoBERTa vocab and merges files,
# configured to lower-case its input and add a prefix space.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'vocab-roberta-base.json', 
    merges_file=PATH+'merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)
# Token id inserted into each encoded row to carry the tweet's sentiment.
# NOTE(review): presumably the vocab ids of the words "positive", "negative"
# and "neutral" — confirm against the vocab file.
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

In [None]:
### Tensor Inputs
# Pre-allocate one int32 array of shape (rows, MAX_LEN) per model input/target.

ct = len(train)
# input_ids starts filled with 1 so untouched positions keep that value.
# NOTE(review): 1 is presumably RoBERTa's padding id — confirm.
input_ids = np.full((ct, MAX_LEN), 1, dtype='int32')
# The mask, segment ids and the start/end targets all start as zeros; each
# gets its own independent array.
attention_mask, token_type_ids, start_tokens, end_tokens = (
    np.zeros((ct, MAX_LEN), dtype='int32') for _ in range(4)
)

## Encoding

In [None]:
### ENCODING ###

### For training
# For every training row: locate `selected_text` inside `text` at character
# level, map the covered characters onto token positions, and fill
# input_ids / attention_mask / start_tokens / end_tokens for that row.

for k in range(train.shape[0]):
    
    # FIND OVERLAP
    # Column 0 is read as the tweet text and column 1 as the selected text;
    # whitespace is normalised in both and the tweet gets a leading space.
    text1 = " "+" ".join(train.iloc[k,0].split())
    text2 = " ".join(train.iloc[k,1].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    # str.find returns -1 when text2 is not a substring of text1 (possible
    # because the two columns are cleaned independently). Only mark
    # characters for a real match; otherwise the negative index would select
    # unrelated characters near the end of the tweet.
    if idx >= 0:
        chars[idx:idx+len(text2)]=1
        # Also mark the space just before the match. `idx > 0` keeps the
        # probe from wrapping round to the last character when the match is
        # at position 0 (which only happens for an empty text2).
        if idx > 0 and text1[idx-1]==' ': chars[idx-1] = 1 
    enc = tokenizer.encode(text1) 
        
    # ID_OFFSETS
    # Character span of each token, rebuilt by decoding tokens one at a time
    # and accumulating their lengths.
    offsets = []; pos=0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((pos,pos+len(w)))
        pos += len(w)
    
    # START END TOKENS
    # A token belongs to the selection if any of its characters is marked.
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i) 
        
    # Row layout: 0, <tweet tokens>, 2, 2, <sentiment token>, 2.
    # The slice assignment fails with a ValueError if this is longer than MAX_LEN.
    s_tok = sentiment_id[train.iloc[k,2]]
    input_ids[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask[k,:len(enc.ids)+5] = 1
    if len(toks)>0:
        # +1 accounts for the leading 0 token prepended to the row above.
        # Rows with no selected token keep all-zero targets.
        start_tokens[k,toks[0]+1] = 1
        end_tokens[k,toks[-1]+1] = 1

In [None]:
### For Testing
# Encode every test tweet with the same row layout as the training rows;
# there are no start/end targets for the test set.

ct = test.shape[0]
input_ids_t = np.full((ct, MAX_LEN), 1, dtype='int32')
attention_mask_t, token_type_ids_t = (
    np.zeros((ct, MAX_LEN), dtype='int32') for _ in range(2)
)

# Column 0 is read as the tweet text, column 1 as its sentiment label.
for k, (raw_text, sentiment) in enumerate(zip(test.iloc[:, 0], test.iloc[:, 1])):

    # INPUT_IDS
    text1 = " " + " ".join(raw_text.split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[sentiment]
    # Row layout: 0, <tweet tokens>, 2, 2, <sentiment token>, 2.
    row_ids = [0] + enc.ids + [2, 2] + [s_tok] + [2]
    input_ids_t[k, :len(row_ids)] = row_ids
    attention_mask_t[k, :len(row_ids)] = 1

## Build Model

In [None]:
### Model Building

def build_model():
    """Build and compile the RoBERTa-based start/end span model.

    The model takes three int32 inputs of width ``MAX_LEN`` (token ids,
    attention mask, token type ids) and returns two outputs: a softmax over
    token positions for the span start and another for the span end.

    Returns:
        A compiled ``tf.keras`` model (Adam, learning rate 3e-5,
        ``binary_crossentropy`` loss on both outputs).
    """
    def position_head(hidden):
        """Map the encoder output to a softmax over the flattened positions."""
        h = tf.keras.layers.Dropout(0.1)(hidden)
        h = tf.keras.layers.Conv1D(128, 2, padding='same')(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Conv1D(64, 2, padding='same')(h)
        h = tf.keras.layers.Dense(1)(h)
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    # Inputs
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    # Pretrained RoBERTa: config and weights are both read from PATH.
    config = RobertaConfig.from_pretrained(PATH + 'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH + 'pretrained-roberta-base.h5', config=config)
    x = bert_model(ids, attention_mask=att, token_type_ids=tok)

    # Two independent heads on the first element of the encoder output
    # (presumably the per-token hidden states — confirm): start, then end.
    x1 = position_head(x[0])
    x2 = position_head(x[0])

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1, x2])

    # NOTE(review): the outputs are softmax distributions trained against
    # one-hot targets; 'binary_crossentropy' is kept as-is, but
    # 'categorical_crossentropy' may be the intended loss — confirm.
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    return model

## Run Model

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace into sets of words;
    the score is ``|intersection| / |union|``. When neither string contains
    a word the score is defined as 0.5.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)

In [None]:
%%time

# 5-fold stratified cross-validation: train one model per fold, keep the
# out-of-fold (OOF) predictions for scoring and average the test predictions
# over the folds.

jac = []; VER='v0'; DISPLAY=1 # USE display=1 FOR INTERACTIVE
# OOF start/end probabilities, one row per training example.
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
# Fold-averaged start/end probabilities for the test set.
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))

# Folds are stratified on the sentiment label.
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=123)
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):

    print('### FOLD %i'%(fold+1))
    
    # NOTE(review): the model is built outside `strategy.scope()`, so the
    # distribution strategy selected earlier in the notebook is not applied
    # here — confirm whether that is intended.
    K.clear_session()
    model = build_model()
    # Only the weights of the epoch with the lowest validation loss are kept.
    weights_path = '%s-roberta-%i.h5'%(VER,fold)
    sv = tf.keras.callbacks.ModelCheckpoint(
        weights_path, monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')
        
    model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]], 
        epochs=3, batch_size=32, verbose=DISPLAY, callbacks=[sv],
        validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]], 
        [start_tokens[idxV,], end_tokens[idxV,]]))
    
    # Restore the best checkpoint of this fold before predicting.
    model.load_weights(weights_path)
    
    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)
    
    print('Predicting Test...')
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    # Each fold contributes 1/n_splits of the final test prediction.
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits
    
    
    # DISPLAY FOLD JACCARD
    # `fold_scores` rather than `all`, so the builtin all() is not shadowed here.
    fold_scores = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b: 
            # Start predicted after end: fall back to the whole tweet.
            st = train.iloc[k,0] # IMPROVE CV/LB with better choice here
        else:
            text1 = " "+" ".join(train.iloc[k,0].split())
            enc = tokenizer.encode(text1)
            # a-1 undoes the +1 shift from the leading 0 token of the input row.
            st = tokenizer.decode(enc.ids[a-1:b])
        fold_scores.append(jaccard(st,train.iloc[k,1]))
    fold_jac = np.mean(fold_scores)
    jac.append(fold_jac)
    print('>>>> FOLD %i Jaccard ='%(fold+1),fold_jac)
    print()

In [None]:
# Turn the fold-averaged start/end probabilities into one predicted text span
# per test tweet.
# NOTE: the list is named `all` (shadowing the builtin) because the cell that
# assembles the submission reads it under that name.
all = []
for k, (start_probs, end_probs) in enumerate(zip(preds_start, preds_end)):
    a = np.argmax(start_probs)
    b = np.argmax(end_probs)
    if a <= b:
        # Decode the tokens between the predicted start and end; a-1 undoes
        # the +1 shift from the leading 0 token of the input row.
        text1 = " " + " ".join(test.iloc[k, 0].split())
        enc = tokenizer.encode(text1)
        st = tokenizer.decode(enc.ids[a-1:b])
    else:
        # Start predicted after end: fall back to the whole tweet.
        st = test.iloc[k, 0]
    all.append(st)

In [None]:
# Re-read the raw test CSV: `textID` was dropped from `test` earlier and is
# needed again for the submission file.
test1 = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')

In [None]:
# Assemble the submission: attach the predicted spans and the ids taken from
# the re-read CSV, keep only those two columns and write them to disk.
test['selected_text'] = all
test['textID'] = test1['textID']
test.drop(columns=['text', 'sentiment'], inplace=True)
test = test[['textID', 'selected_text']]
test.to_csv('submission.csv', index=False)
# Limit the displayed column width, then preview the submission.
pd.set_option('max_colwidth', 60)
test.head()