In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import requests

url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml"
response=requests.get(url)

if response.status_code==200:
    with open("index.xml","w",encoding="utf-8") as file:
        file.write(response.text)
    print("Download successful: index.xml saved.")
else:
    print(f"Failed to download. HTTP Status Code: {response.status_code}")

Download successful: index.xml saved.


In [2]:
import gc
import tensorflow as tf
gc.collect()
tf.keras.backend.clear_session()

In [3]:
import tensorflow as tf
from tensorflow.keras.mixed_precision import Policy,set_global_policy
policy=Policy('mixed_float16')
set_global_policy(policy)

In [4]:
import os
import nltk
import zipfile

wordnet_path = "/usr/share/nltk_data/corpora/wordnet.zip"
wordnet_dir = "/usr/share/nltk_data/corpora/wordnet"

# Unzipping the WordNet data to the target directory
if not os.path.exists(wordnet_dir):
    with zipfile.ZipFile(wordnet_path, 'r') as z:
        z.extractall("/usr/share/nltk_data/corpora/")

nltk.data.path.append("/usr/share/nltk_data/")

In [5]:
import numpy as np
import pandas as pd
data=pd.read_csv("/kaggle/input/tiktoken-happydb/cleaned_hm.csv")

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

stop_words=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()

def preprocess_text(text):
    text=re.sub(r'<.*?>', '', text)
    text=text.lower()
    text=re.sub(r'[^a-z\s]', '', text)
    tokens=word_tokenize(text)
    tokens=[lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
data['cleaned_hm']=data['cleaned_hm'].apply(preprocess_text)

In [8]:
import os
os.environ["KERAS_BACKEND"]="jax"
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"]="1.00"

In [9]:
from sklearn.model_selection import train_test_split
X=data['cleaned_hm']  
y=data['predicted_category']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42,stratify=y)

In [10]:
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.10,random_state=42,stratify=y_train)

In [11]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
y_train_encoded=label_encoder.fit_transform(y_train)
y_test_encoded=label_encoder.transform(y_test)
y_val_encoded=label_encoder.transform(y_val)

In [12]:
import tensorflow as tf
from transformers import DistilBertTokenizer

In [13]:
tokenizer=DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [14]:
train_encodings=tokenizer(list(X_train),truncation=True,padding=True,max_length=128)
val_encodings=tokenizer(list(X_val),truncation=True,padding=True,max_length=128)
test_encodings=tokenizer(list(X_test),truncation=True,padding=True,max_length=128)

In [15]:
batch_size=16

train_dataset=tf.data.Dataset.from_tensor_slices((
    {"input_ids": train_encodings["input_ids"],"attention_mask": train_encodings["attention_mask"]}, 
    y_train_encoded
)).batch(batch_size)

val_dataset=tf.data.Dataset.from_tensor_slices((
    {"input_ids": val_encodings["input_ids"],"attention_mask": val_encodings["attention_mask"]}, 
    y_val_encoded
)).batch(batch_size)

test_dataset=tf.data.Dataset.from_tensor_slices((
    {"input_ids": test_encodings["input_ids"],"attention_mask": test_encodings["attention_mask"]}, 
    y_test_encoded

)).batch(batch_size)

In [16]:
from transformers import DistilBertTokenizer,TFDistilBertForSequenceClassification
num_classes=7
model=TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",num_labels=num_classes
)
model.summary()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  5383      
                                                                 
 dropout_19 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 66958855 (255.43 MB)
Trainable params: 66958855 (255.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
from transformers import AdamWeightDecay
optimizer=AdamWeightDecay(learning_rate=5e-5,weight_decay_rate=0.01)
model.compile(optimizer=optimizer,loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=["accuracy"])

In [18]:
best_val_loss=float('inf')
patience_counter=0

for epoch in range(5):
    print(f"\nEpoch {epoch + 1}/{5}")
    model.fit(train_dataset, epochs=1)
    
    val_loss,val_accuracy = model.evaluate(val_dataset)  
    
    if val_loss < best_val_loss:  
        best_val_loss = val_loss
        model.save_weights("best_model_weights.h5")
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= 5:
        print("\nEarly stopping triggered.")
        break

model.load_weights("best_model_weights.h5")


Epoch 1/5

Epoch 2/5

Epoch 3/5

Epoch 4/5

Epoch 5/5


In [19]:
results=model.evaluate(test_dataset)
print("Test Loss:",results[0])
print("Test Accuracy:",results[1])

Test Loss: 0.33725854754447937
Test Accuracy: 0.8780138492584229


In [20]:
results1=model.evaluate(val_dataset)
print("Validation Loss:",results1[0])
print("Validation Accuracy:",results1[1])

Validation Loss: 0.34555062651634216
Validation Accuracy: 0.8768067955970764


In [21]:
from sklearn.metrics import classification_report
y_pred_logits=model.predict(test_dataset)["logits"]
y_pred_classes=np.argmax(y_pred_logits,axis=1)
print(classification_report(y_test_encoded,y_pred_classes,target_names=label_encoder.classes_))

                  precision    recall  f1-score   support

     achievement       0.90      0.87      0.89      8498
       affection       0.95      0.92      0.94      8542
         bonding       0.90      0.95      0.93      2682
enjoy_the_moment       0.66      0.81      0.73      2786
        exercise       0.76      0.88      0.82       300
         leisure       0.84      0.73      0.78      1865
          nature       0.83      0.66      0.73       461

        accuracy                           0.88     25134
       macro avg       0.83      0.83      0.83     25134
    weighted avg       0.88      0.88      0.88     25134



In [22]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

accuracy=accuracy_score(y_test_encoded,y_pred_classes)
precision=precision_score(y_test_encoded,y_pred_classes,average='weighted')
recall=recall_score(y_test_encoded,y_pred_classes,average='weighted')
f1=f1_score(y_test_encoded,y_pred_classes,average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8780
Precision : 0.8839
Recall: 0.8780
F1 Score: 0.8794
