In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [2]:
# Relative locations of the three CSV splits that are passed to prep_data.
trainpath = '../input/nlp-class-fixed-data/fixed_train.csv'
validpath = '../input/nlp-class-fixed-data/fixed_valid.csv'
testpath = '../input/nlp-class-fixed-data/fixed_test.csv'

In [3]:
def prep_data(path, is_test):
    """Load a CSV and collapse it to one row per conversation group.

    Rows sharing the same ``conv_id`` and ``prompt`` (plus ``label`` when
    ``is_test`` is false) are merged, in first-appearance order, by joining
    their ``utterance`` strings with a single space.

    Parameters
    ----------
    path : location of the CSV file to read.
    is_test : when true, ``label`` is not used as a grouping key and is not
        part of the returned frame.

    Returns
    -------
    (merged, group_sizes)
        ``merged`` holds one row per group; for labelled data its columns
        are ordered ``label, conv_id, prompt, utterance``.
        ``group_sizes`` holds the grouping keys plus a ``size`` column with
        the number of original rows that went into each group.
    """
    group_keys = ['conv_id', 'prompt'] if is_test else ['conv_id', 'prompt', 'label']
    grouped = pd.read_csv(path).groupby(group_keys, as_index=False, sort=False)

    merged = grouped.agg({'utterance': ' '.join})
    group_sizes = grouped.size()

    if not is_test:
        # Labelled frames put the target column first.
        merged = merged[['label', 'conv_id', 'prompt', 'utterance']]
    return merged, group_sizes

In [4]:
# Train/valid are grouped with their label; the test split is grouped without it.
train, train_group_size = prep_data(path=trainpath, is_test=False)
valid, valid_group_size = prep_data(path=validpath, is_test=False)
test, test_group_size = prep_data(path=testpath, is_test=True)

train.head()

In [5]:
!pip install text_hammer 

In [6]:
import text_hammer as th

def text_preprocessing(df,col_name):
    """Normalise the text in ``df[col_name]`` in place and return ``df``.

    Every value goes through these steps, in order:

    1. cast to ``str`` and lower-case
    2. ``th.cont_exp``              (you're -> you are; i'm -> i am)
    3. ``th.remove_emails``
    4. ``th.remove_html_tags``
    5. ``th.remove_special_chars``
    6. ``th.remove_accented_chars``
    7. ``th.make_base``             (ran -> run)

    Parameters
    ----------
    df : pandas.DataFrame whose column is overwritten (the frame is mutated).
    col_name : label of the text column to clean.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    # `Series.progress_apply` only exists once tqdm has patched pandas.
    # Registering here keeps the function working on a fresh kernel instead
    # of depending on a cell that may or may not have been executed.
    tqdm.pandas()

    steps = (
        lambda x: str(x).lower(),
        th.cont_exp,
        th.remove_emails,
        th.remove_html_tags,
        th.remove_special_chars,
        th.remove_accented_chars,
        th.make_base,
    )
    for step in steps:
        df[col_name] = df[col_name].progress_apply(step)
    return df

In [7]:
# Clean both text columns of every split: 'prompt' first, then 'utterance'.
train = text_preprocessing(text_preprocessing(train, 'prompt'), 'utterance')

valid = text_preprocessing(text_preprocessing(valid, 'prompt'), 'utterance')

test = text_preprocessing(text_preprocessing(test, 'prompt'), 'utterance')

In [9]:
# Tokenizer for the 'bert-base-uncased' checkpoint (the same checkpoint name
# is used when the model is loaded further down).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
# Pre-allocated buffers, one row per example and 256 token positions per row,
# later filled with token ids and attention masks.
# int32 matches the dtype declared on the model's Input layers and uses half
# the memory of numpy's default float64.
X_train_ids = np.zeros((len(train), 256), dtype=np.int32)
X_train_masks = np.zeros((len(train), 256), dtype=np.int32)

X_valid_ids = np.zeros((len(valid), 256), dtype=np.int32)
X_valid_masks = np.zeros((len(valid), 256), dtype=np.int32)

X_test_ids = np.zeros((len(test), 256), dtype=np.int32)
X_test_masks = np.zeros((len(test), 256), dtype=np.int32)

In [11]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every (prompt, utterance) pair of ``df`` into ``ids``/``masks``.

    Parameters
    ----------
    df : pandas.DataFrame with ``prompt`` and ``utterance`` columns.
    ids, masks : 2-D arrays with one row per row of ``df``; they are filled
        in place. Their second dimension is used as the padded/truncated
        sequence length.
    tokenizer : object exposing ``encode_plus`` whose result has
        ``input_ids`` and ``attention_mask`` attributes.

    Returns
    -------
    (ids, masks) — the same array objects that were passed in.
    """
    # Follow the buffer width instead of a hard-coded length so the two can
    # never disagree.
    max_len = ids.shape[1]

    # Iterate positionally: label-based `df['prompt'][i]` raises KeyError as
    # soon as the frame's index is not exactly 0..n-1 (e.g. after filtering).
    pairs = zip(df['prompt'], df['utterance'])
    for row, (text_a, text_b) in enumerate(pairs):
        # NOTE(review): 'only_second' never shortens the prompt; presumably
        # prompts always fit within max_len on their own — confirm.
        encoded = tokenizer.encode_plus(
            text_a, text_b,
            max_length=max_len,
            truncation='only_second',
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[row, :] = encoded.input_ids
        masks[row, :] = encoded.attention_mask
    return ids, masks

In [12]:
# Fill the pre-allocated id/mask buffers for each split.
X_train_ids, X_train_masks = generate_training_data(
    df=train, ids=X_train_ids, masks=X_train_masks, tokenizer=tokenizer
)
X_valid_ids, X_valid_masks = generate_training_data(
    df=valid, ids=X_valid_ids, masks=X_valid_masks, tokenizer=tokenizer
)
X_test_ids, X_test_masks = generate_training_data(
    df=test, ids=X_test_ids, masks=X_test_masks, tokenizer=tokenizer
)

In [13]:
# One row per example, one column per class (32 classes).
train_labels = np.zeros(shape=(train.shape[0], 32))
valid_labels = np.zeros(shape=(valid.shape[0], 32))

In [14]:
# One-hot encode the targets: in row r, set the column given by that row's label to 1.
train_labels[np.arange(len(train)), train['label'].to_numpy()] = 1
valid_labels[np.arange(len(valid)), valid['label'].to_numpy()] = 1

In [15]:
# Build tf.data datasets that yield one (ids, mask, label) triple per example.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_ids, X_train_masks, train_labels))
valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid_ids, X_valid_masks, valid_labels))

In [16]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    """Repack a flat ``(ids, mask, label)`` triple as ``(features, label)``.

    ``features`` is a dict keyed by ``'input_ids'`` and ``'attention_mask'``;
    ``labels`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [17]:
# Turn each (ids, mask, label) triple into a ({'input_ids', 'attention_mask'}, label)
# pair; the dict keys are the same names given to the Keras Input layers.
train_dataset = train_dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset
valid_dataset = valid_dataset.map(SentimentDatasetMapFunction)

In [18]:
# Training: shuffle with a 10000-element buffer, then form batches of 16 and
# drop the final incomplete batch.
train_dataset = train_dataset.shuffle(10000).batch(16, drop_remainder=True)
# Validation: order is irrelevant for evaluation, so no shuffle, and the last
# partial batch is kept so every validation example is scored.
valid_dataset = valid_dataset.batch(16)

# Model

In [19]:
from transformers import TFAutoModelForSequenceClassification

# Load the pretrained 'bert-base-uncased' checkpoint. Only its `.bert`
# sub-module is used when the Keras model is assembled in the next cell.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=32)

In [20]:
# Two integer inputs of length 256; their names match the dict keys produced
# by SentimentDatasetMapFunction.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

# Element [1] of the encoder output feeds the classification head.
# NOTE(review): presumably this is the pooled [CLS] representation — confirm
# against the transformers documentation for the installed version.
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1]
# Head: Dense(512, relu) -> Dropout(0.2) -> Dense(32, softmax).
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
dropout_layer = tf.keras.layers.Dropout(0.2, name='dropout_layer')(intermediate_layer)
output_layer = tf.keras.layers.Dense(32, activation='softmax', name='output_layer')(dropout_layer)

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

In [21]:
# Optimizer, loss and metric used when compiling the model.
# NOTE(review): `decay` is a legacy optimizer argument; newer Keras optimizer
# implementations may reject it — confirm against the installed TF version.
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
# Categorical (not sparse) variants, because the targets are one-hot rows.
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [22]:
# Attach the optimizer, loss and accuracy metric defined in the previous cell.
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [23]:
# Train for 3 epochs, evaluating on the validation dataset after each epoch;
# the returned object is kept in `hist`.
hist = sentiment_model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=3,
)

In [24]:
# sentiment_model.save('sentiment_model')

# Test

In [25]:
# sentiment_model = tf.keras.models.load_model('sentiment_model')

def test_pred(ids, masks):
    return {
        'input_ids': tf.cast(ids, tf.float64),
        'attention_mask': tf.cast(masks, tf.float64)
    }

# Run the model on the test inputs; `preds` is indexed per grouped test row
# when the submission file is written.
preds = sentiment_model.predict(test_pred(X_test_ids, X_test_masks))

In [26]:
import csv

# Write one line per ORIGINAL test row: every grouped conversation's predicted
# class is repeated as many times as the group had rows before merging.
with open('submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['', 'pred'])

    idx = 0
    for i in range(len(preds)):
        # The predicted class is the same for every row of this group.
        predicted = np.argmax(preds[i])
        for _ in range(test_group_size['size'][i]):
            writer.writerow([idx, predicted])
            idx += 1