In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/prometeo23-kaggle/train_absa.csv
/kaggle/input/prometeo23-kaggle/sample.csv
/kaggle/input/prometeo23-kaggle/test_absa.csv


In [2]:
import sys
!{sys.executable} -m pip install tensorflow-addons
import tensorflow_addons as tfa

[0m

In [3]:
#general purpose packages
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
import emoji
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

#keras
import tensorflow as tf
from tensorflow import keras


#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

#set seed for reproducibility
seed=42

#set style for plots
sns.set_style("whitegrid")
sns.despine()
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)


<Figure size 432x288 with 0 Axes>

In [4]:
df = pd.read_csv('/kaggle/input/prometeo23-kaggle/train_absa.csv')
df_test = pd.read_csv('/kaggle/input/prometeo23-kaggle/test_absa.csv')

In [5]:
def conf_matrix(y, y_pred, title):
    fig, ax =plt.subplots(figsize=(5,5))
    labels=['Negative', 'Neutral', 'Positive']
    ax=sns.heatmap(confusion_matrix(y, y_pred), annot=True, cmap="Blues", fmt='g', cbar=False, annot_kws={"size":25})
    plt.title(title, fontsize=20)
    ax.xaxis.set_ticklabels(labels, fontsize=17) 
    ax.yaxis.set_ticklabels(labels, fontsize=17)
    ax.set_ylabel('Test', fontsize=20)
    ax.set_xlabel('Predicted', fontsize=20)
    plt.show()

In [6]:
tokenizer_roberta = RobertaTokenizerFast.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [7]:
X=df['text'].values
y=df['label'].values


In [8]:
x=df_test['text'].values

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, stratify=y, random_state=seed)

In [10]:
X_train

array(['but when filtering, notion still thinks the relation exists.',
       'please add abilty to use fractions in the number property.',
       'i have the s6 lite and I am struggling with doing simple things like deleting lines as the menu and keyboard',
       ...,
       'the app itself is amazing and extremely useful, probably the best note taking app right now.',
       '@simonw @joshu move those recipes over to @notionhq, @evernote has gotten horrible over the past couple years.',
       'i love notion as a whole but the quality of the app, ease of use needs to significantly improve.'],
      dtype=object)

In [11]:
y_train_le = y_train.copy()
y_valid_le = y_valid.copy()
#y_test_le = y_test.copy()

In [12]:
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_valid = ohe.fit_transform(np.array(y_valid).reshape(-1, 1)).toarray()

In [13]:
token_lens = []

for txt in X_train:
    tokens = tokenizer_roberta.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
max_length=np.max(token_lens)
max_length

333

In [14]:
MAX_LEN=333

In [15]:
def encode_examples(ds, limit=-1):
    # Prepare Input list
    input_ids_list = []
    attention_mask_list = []
    label_list = []

    if (limit > 0):
        ds = ds.take(limit)

    for review, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(review.decode())
        input_ids_list.append(bert_input['input_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])

    return tf.data.Dataset.from_tensor_slices((input_ids_list,
                                               attention_mask_list,
                                               label_list)).map(map_example_to_dict)

In [16]:
def tokenize_roberta(data,max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer_roberta.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [17]:
train_input_ids, train_attention_masks = tokenize_roberta(X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize_roberta(X_valid, MAX_LEN)
test_input_ids, test_attention_masks = tokenize_roberta(df_test['text'].values, MAX_LEN)

In [18]:
train_input_ids.shape

(3150, 333)

In [19]:
test_input_ids.shape


(500, 333)

In [20]:
test_input_ids, test_attention_masks = tokenize_roberta(x, MAX_LEN)

In [21]:
metric1=tfa.metrics.F1Score(num_classes=3,threshold=0.5)
metric2=tfa.metrics.FBetaScore(num_classes=3,threshold=0.5,beta=2.0)

In [22]:
def create_model(bert_model, max_len=MAX_LEN):
    
    opt = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=0.05)
    loss = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()

    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    output = bert_model([input_ids,attention_masks])
    output = output[1]
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(output)
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    model.compile(opt, loss=loss, metrics = [metric1,metric2])
    return model

In [23]:
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/627M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [24]:
model = create_model(roberta_model, MAX_LEN)
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 333)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 333)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 124645632   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 3)            2307        tf_roberta_model[0][1]       

In [25]:
history_2 = model.fit([train_input_ids,train_attention_masks], y_train, validation_data=([val_input_ids,val_attention_masks], y_valid), epochs=6, batch_size=4)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [26]:
model.save('/kaggle/working/roberta.h5')

In [27]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
from tensorflow.keras.models import load_model


In [29]:
tf.keras.utils.custom_object_scope

keras.utils.generic_utils.CustomObjectScope

In [30]:
# results=list()
sentiment = model.predict([test_input_ids,test_attention_masks],batch_size=1,verbose = 2)

# results.append(np.argmax(sentiment))

500/500 - 11s


In [31]:
sentiment

array([[0.33993492, 0.4857308 , 0.17433423],
       [0.49560902, 0.26559886, 0.2387921 ],
       [0.43300033, 0.27980173, 0.287198  ],
       ...,
       [0.3985415 , 0.40416217, 0.19729638],
       [0.55648446, 0.22199453, 0.22152103],
       [0.42193648, 0.2803675 , 0.29769602]], dtype=float32)

In [32]:
results=list()

In [33]:
results = list()
for i in range(0,len(sentiment)):
    a=sentiment[i]
    results.append(np.argmax(a))
#    if(np.argmax(sentiment) == 0):
#        print("negative")
#    elif (np.argmax(sentiment) == 1):
#        print("neutral")
#    elif(np.argmax(sentiment) == 2):
#        print("positive")

In [35]:
print(a)

[0.42193648 0.2803675  0.29769602]


In [36]:
df = pd.DataFrame(columns = ['Id','Predicted'])

df['Predicted'] = results

for i in range(0,len(results)) :
  df['Id'][i]=(i)

df.to_csv('/kaggle/working/sub.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
