In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
# Runtime configuration passed through environment variables.
# NOTE(review): these are presumably meant to be set before TensorFlow/Keras
# is first imported — keep this cell ahead of the TensorFlow imports.
# NOTE(review): XLA_PYTHON_CLIENT_MEM_FRACTION is normally read by JAX —
# confirm it has any effect with the TensorFlow backend.
os.environ.update({
    "KERAS_BACKEND": "tensorflow",
    "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    "TF_GPU_ALLOCATOR": "cuda_malloc_async",
})

In [2]:
import requests

# Download the NLTK data index and store it next to the notebook as index.xml.
url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"

try:
    # A timeout keeps the cell from hanging indefinitely on a stalled
    # connection (requests has no default timeout).
    response = requests.get(url, timeout=30)
except requests.RequestException as exc:
    # Network-level failures are reported the same best-effort way as a
    # non-200 status, instead of aborting the notebook run.
    print(f"Failed to download. Request error: {exc}")
else:
    if response.status_code == 200:
        with open("index.xml", "w", encoding="utf-8") as file:
            file.write(response.text)
        print("Download successful: index.xml saved.")
    else:
        print(f"Failed to download. HTTP Status Code: {response.status_code}")

Download successful: index.xml saved.


In [3]:
import gc
import tensorflow as tf
# Start from a clean slate: run a garbage-collection pass, then reset the
# global Keras state so earlier models/graphs in this kernel are dropped.
gc.collect()
tf.keras.backend.clear_session()

In [4]:
import tensorflow as tf
from tensorflow.keras.mixed_precision import Policy,set_global_policy
# Use float16 compute with float32 variables for every Keras layer created
# after this point.
policy = Policy("mixed_float16")
set_global_policy(policy)

# Turn on XLA JIT compilation through the TensorFlow optimizer config.
tf.config.optimizer.set_jit(True)

In [5]:
import os
import nltk
import zipfile

nltk_data_dir = "/usr/share/nltk_data/"
wordnet_path = "/usr/share/nltk_data/corpora/wordnet.zip"
wordnet_dir = "/usr/share/nltk_data/corpora/wordnet"

# Unzip the WordNet data next to its archive, but only when the extracted
# directory is missing. A missing archive is reported instead of crashing the
# cell with FileNotFoundError.
if not os.path.exists(wordnet_dir):
    if os.path.exists(wordnet_path):
        with zipfile.ZipFile(wordnet_path, 'r') as z:
            z.extractall(os.path.dirname(wordnet_path))
    else:
        print(f"WordNet archive not found at {wordnet_path}; skipping extraction.")

# Register the data directory once; re-running the cell must not keep
# appending duplicate entries to NLTK's search path.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

In [6]:
import numpy as np
import pandas as pd
# Read cleaned_hm.csv from the Kaggle input directory into a DataFrame.
HAPPYDB_CSV = "/kaggle/input/tiktoken-happydb/cleaned_hm.csv"
data = pd.read_csv(HAPPYDB_CSV)

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re

# Fetch every NLTK resource the preprocessing step relies on.
for resource in ('stopwords', 'punkt', 'omw-1.4', 'wordnet'):
    nltk.download(resource)

# Shared, build-once helpers used by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one piece of raw text for modelling.

    Steps, in order: strip ``<...>`` tags, lowercase, drop every character
    that is not ``a-z`` or whitespace, tokenize, remove English stop words,
    and lemmatize what is left.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or a pandas NaN for a missing row) is treated as empty.

    Returns:
        The remaining lemmatized tokens joined by single spaces; ``''`` when
        the input is not a string.
    """
    # Missing values would otherwise raise a TypeError inside re.sub.
    if not isinstance(text, str):
        return ''
    text = re.sub(r'<.*?>', '', text)
    text = text.lower()
    # Note: characters are deleted, not replaced by a space, so "well-being"
    # becomes "wellbeing".
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Clean every happy-moment text with preprocess_text.
# NOTE: the column is overwritten in place, so re-running this cell
# preprocesses text that has already been preprocessed.
data['cleaned_hm'] = data['cleaned_hm'].map(preprocess_text)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns. Every column gets its OWN encoder:
# re-fitting a single encoder on a second column discards the first mapping,
# which would make the ground-truth codes impossible to inverse-transform.
reflection_encoder = LabelEncoder()
data['reflection_period_encoded'] = reflection_encoder.fit_transform(data['reflection_period'])

ground_truth_encoder = LabelEncoder()
data['ground_truth_category_encoded'] = ground_truth_encoder.fit_transform(data['ground_truth_category'])

# `category_encoder` keeps its previous final meaning: the encoder fitted on
# the predicted categories.
category_encoder = LabelEncoder()
data['predicted_category_encoded'] = category_encoder.fit_transform(data['predicted_category'])

# The two category columns are encoded independently, so equal integers in
# them do not necessarily denote the same category.
encoded_columns = {
    column: data[column].unique()
    for column in (
        "reflection_period_encoded",
        "ground_truth_category_encoded",
        "predicted_category_encoded",
    )
}
encoded_columns

{'reflection_period_encoded': array([0, 1]),
 'ground_truth_category_encoded': array([7, 2, 5, 1, 3, 0, 6, 4]),
 'predicted_category_encoded': array([1, 4, 2, 5, 0, 3, 6])}

In [10]:
from sklearn.model_selection import train_test_split
# Features are the cleaned texts, the target is the predicted category.
X, y = data['cleaned_hm'], data['predicted_category']

# Hold out 25% as the test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [11]:
# Carve a stratified 10% validation set out of the training portion.
# NOTE: X_train / y_train are rebound here, so re-running this cell alone
# shrinks the training set again — re-run the previous split cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.10,
    random_state=42,
    stratify=y_train,
)

In [12]:
import tensorflow as tf
from transformers import RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [13]:
# Load the pretrained tokenizer for the "roberta-base" checkpoint.
ROBERTA_CHECKPOINT = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [14]:
# Same tokenizer settings for every split: truncate to at most 128 tokens and
# pad each split to its own longest sequence.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(list(X_train), **tokenize_kwargs)
val_encodings = tokenizer(list(X_val), **tokenize_kwargs)
test_encodings = tokenizer(list(X_test), **tokenize_kwargs)

In [15]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping on the training targets only, then apply
# that same mapping to every split.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded, y_val_encoded = (
    label_encoder.transform(split_labels)
    for split_labels in (y_train, y_test, y_val)
)

In [16]:
batch_size = 16


def make_dataset(encodings, labels, shuffle=False):
    """Build a batched tf.data.Dataset from tokenizer output and labels.

    Args:
        encodings: Tokenizer output providing the "input_ids" and
            "attention_mask" entries.
        labels: Integer-encoded targets aligned with ``encodings``.
        shuffle: When True, reshuffle the examples on every pass. Keras does
            not shuffle tf.data inputs itself, so this is the only per-epoch
            shuffling the training data gets.

    Returns:
        A dataset of ``({"input_ids", "attention_mask"}, label)`` batches of
        size ``batch_size``.
    """
    features = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
    }
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # Buffer covers the whole split for a full shuffle; the seed matches
        # the random_state used for the splits.
        dataset = dataset.shuffle(len(labels), seed=42)
    return dataset.batch(batch_size)


# Only the training data is shuffled; validation and test keep their order so
# predictions stay aligned with y_val_encoded / y_test_encoded.
train_dataset = make_dataset(train_encodings, y_train_encoded, shuffle=True)
val_dataset = make_dataset(val_encodings, y_val_encoded)
test_dataset = make_dataset(test_encodings, y_test_encoded)

In [17]:
from transformers import RobertaTokenizer,TFRobertaForSequenceClassification
# Size the classification head from the fitted label encoder instead of a
# hard-coded 7, so it always matches the labels actually present in y_train.
num_classes = len(label_encoder.classes_)
model = TFRobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=num_classes
)
model.summary()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  124055040 
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  595975    
 ificationHead)                                                  
                                                                 
Total params: 124651015 (475.51 MB)
Trainable params: 124651015 (475.51 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
from transformers import AdamWeightDecay
# Adam with decoupled weight decay; the loss takes raw logits and integer
# class ids.
optimizer = AdamWeightDecay(learning_rate=5e-5, weight_decay_rate=0.01)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=["accuracy"],
    jit_compile=True,
)

In [19]:
# Fine-tune one epoch at a time, checkpointing the weights with the lowest
# validation loss and stopping early when it stops improving.
NUM_EPOCHS = 5
# Must be smaller than NUM_EPOCHS: the first epoch always improves on +inf,
# so a patience equal to the epoch count can never trigger.
PATIENCE = 2
BEST_WEIGHTS_PATH = "best_model_weights.h5"

best_val_loss = float('inf')
patience_counter = 0

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    model.fit(train_dataset, epochs=1)

    val_loss, val_accuracy = model.evaluate(val_dataset)

    if val_loss < best_val_loss:
        # New best: remember it and checkpoint the weights.
        best_val_loss = val_loss
        model.save_weights(BEST_WEIGHTS_PATH)
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= PATIENCE:
        print("\nEarly stopping triggered.")
        break

# Continue with the best checkpoint rather than the last epoch's weights.
model.load_weights(BEST_WEIGHTS_PATH)


Epoch 1/5
Cause: for/else statement not yet supported

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5


In [20]:
# Loss and accuracy on the held-out test set (model.evaluate returns them in
# the order [loss, accuracy]).
results = model.evaluate(test_dataset)
test_loss, test_accuracy = results[0], results[1]
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.3611977994441986
Test Accuracy: 0.8753879070281982


In [21]:
# Loss and accuracy on the validation set. These were previously printed as
# "Train" metrics even though they are computed on val_dataset.
results1 = model.evaluate(val_dataset)
print("Validation Loss:", results1[0])
print("Validation Accuracy:", results1[1])

Train Loss: 0.3590981364250183
Train Accuracy: 0.8778676390647888


In [22]:
from sklearn.metrics import classification_report
# Predict on the test set, take the highest-scoring class per example and
# print a per-class report using the original label names.
test_predictions = model.predict(test_dataset)
y_pred_logits = test_predictions["logits"]
y_pred_classes = np.argmax(y_pred_logits, axis=1)

report = classification_report(
    y_test_encoded,
    y_pred_classes,
    target_names=label_encoder.classes_,
)
print(report)

                  precision    recall  f1-score   support

     achievement       0.88      0.88      0.88      8498
       affection       0.95      0.93      0.94      8542
         bonding       0.91      0.94      0.92      2682
enjoy_the_moment       0.70      0.75      0.73      2786
        exercise       0.71      0.91      0.80       300
         leisure       0.80      0.73      0.77      1865
          nature       0.70      0.80      0.75       461

        accuracy                           0.88     25134
       macro avg       0.81      0.85      0.83     25134
    weighted avg       0.88      0.88      0.88     25134



In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score

# Overall test-set metrics; precision, recall and F1 are averaged over the
# classes weighted by their support.
weighted_avg = {"average": "weighted"}

accuracy = accuracy_score(y_test_encoded, y_pred_classes)
precision = precision_score(y_test_encoded, y_pred_classes, **weighted_avg)
recall = recall_score(y_test_encoded, y_pred_classes, **weighted_avg)
f1 = f1_score(y_test_encoded, y_pred_classes, **weighted_avg)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8754
Precision : 0.8777
Recall: 0.8754
F1 Score: 0.8760
