In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Load the labelled transaction descriptions from the Kaggle input mount.
# NOTE(review): this is an absolute Kaggle path; the notebook will not run in
# another environment without editing it.
uri = "/kaggle/input/transaction-dataset/newdatasett - newdatasett (1).csv"
df = pd.read_csv(uri)

In [3]:
# Discard rows that have no target label. `df` is overwritten, and the original
# row index is kept (it has gaps afterwards), so later cells slice by position.
df = df.dropna(subset=['new_category'])

In [4]:
# Preview the first rows to confirm the `text` / `new_category` layout.
df.head()

Unnamed: 0,text,new_category
0,Arby's,Food
1,Burger King,Food
2,Carl's Jr.,Food
3,Chick-fil-A,Food
4,Chipotle Mexican Grill,Food


# Exploratory Data Analysis

In [5]:
# Check dtypes and confirm that no nulls remain in either column after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6685 entries, 0 to 6745
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          6685 non-null   object
 1   new_category  6685 non-null   object
dtypes: object(2)
memory usage: 156.7+ KB


In [6]:
# Distinct values per column. Fewer unique texts than rows means some
# descriptions repeat.
# NOTE(review): repeated texts can end up on both sides of the later train/test
# split; consider de-duplicating before splitting.
df.nunique()

text            6022
new_category       5
dtype: int64

In [7]:
# Inspect the class balance; the train/test split later is stratified on these labels.
print(df['new_category'].value_counts())

new_category
Other             2112
Food              1296
Home              1132
Health            1105
Transportation    1040
Name: count, dtype: int64


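Additional information: category labels as defined in the source Hugging Face dataset. Note that these eight labels differ from the five `new_category` values (Other, Food, Home, Health, Transportation) used in this notebook: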
- '0': Shopping
- '1': Dining Out
- '2': Entertainment
- '3': Transportation
- '4': Housing
- '5': Payments/Credits
- '6': Utilities
- '7': Service Subscriptions

# Word Embedding Using BERT

In [8]:
from transformers import AutoModel, AutoTokenizer
import torch

# Load the pretrained encoder and its matching tokenizer from one checkpoint name
# so the two can never drift apart.
BERT_CHECKPOINT = "bert-base-uncased"
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [9]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
bert_model = bert_model.to(device)

In [10]:
from tqdm import tqdm
batch_size = 32
embeddings = []

# Encode the text column in fixed-size chunks and keep, for every row, the hidden
# state of the first token ([CLS]) as its sentence embedding.
for start in tqdm(range(0, len(df), batch_size)):
    texts = df['text'].iloc[start:start + batch_size].tolist()
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # The tokenizer returns CPU tensors; move them next to the model.
    encoded = {key: value.to(device) for key, value in encoded.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        cls_vectors = bert_model(**encoded).last_hidden_state[:, 0, :]

    embeddings.extend(cls_vectors.cpu().numpy())

# Stack the per-row vectors into a single (n_rows, hidden_size) array.
embeddings = np.array(embeddings)

100%|██████████| 209/209 [00:05<00:00, 41.15it/s]


In [11]:
from sklearn.preprocessing import LabelEncoder

# Map category strings to integer ids. The fitted encoder is kept in `le` so the
# ids can be decoded back to names at prediction time.
le = LabelEncoder()
le.fit(df['new_category'])
labels = le.transform(df['new_category'])

In [12]:
# Sanity check: list the distinct integer ids produced by the encoder.
print(np.unique(labels))

[0 1 2 3 4]


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features are the BERT embeddings; targets are the one-hot encoded label ids,
# as required by the categorical cross-entropy loss used below.
X = embeddings
y = tf.keras.utils.to_categorical(labels)

# Hold out 20% for testing. Stratify on the integer ids (not the one-hot matrix)
# so each category keeps its share in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [14]:
# Shape check: (training rows, embedding width).
X_train.shape

(5348, 768)

In [15]:
# Shape check: (training rows, number of one-hot classes).
y_train.shape

(5348, 5)

In [16]:
# Spot-check one target row to confirm it is one-hot encoded.
y_train[0]

array([0., 1., 0., 0., 0.])

In [17]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Derive the layer sizes from the data instead of hard-coding them, so the cell
# keeps working if the encoder or the label set changes.
n_features = X_train.shape[1]  # embedding width (768 for the embeddings above)
n_classes = y_train.shape[1]   # one-hot width (5 categories above)

# 1. Build the classification head: two dropout-regularised hidden layers
#    followed by a softmax over the categories.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(n_classes, activation='softmax'),
])

# 2. Compile the model (targets are one-hot, hence categorical cross-entropy).
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# 3. Train the model; 20% of the training partition is held out for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# 4. Evaluate on the untouched test partition.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy*100:.2f}%")

Epoch 1/20


I0000 00:00:1733478151.120517     125 service.cc:145] XLA service 0x78fe50005d60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733478151.120577     125 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1733478151.120581     125 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m115/134[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 1ms/step - accuracy: 0.5209 - loss: 1.1997

I0000 00:00:1733478154.708434     125 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 35ms/step - accuracy: 0.5367 - loss: 1.1660 - val_accuracy: 0.7477 - val_loss: 0.6659
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7229 - loss: 0.7526 - val_accuracy: 0.7935 - val_loss: 0.5719
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7669 - loss: 0.6127 - val_accuracy: 0.8028 - val_loss: 0.5360
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7912 - loss: 0.5479 - val_accuracy: 0.8056 - val_loss: 0.5180
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8297 - loss: 0.4865 - val_accuracy: 0.8299 - val_loss: 0.4786
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8409 - loss: 0.4576 - val_accuracy: 0.8140 - val_loss: 0.5167
Epoch 7/20
[1m134/134[0m [32m━━━━━━

In [18]:
# Persist the trained classifier to a single HDF5 file.
# NOTE(review): `.h5` is the legacy Keras format; recent Keras versions recommend
# the native `.keras` format. Confirm which one downstream consumers expect.
model.save("full_model.h5")

In [19]:
# Retrain a fresh classifier with the same architecture and hyper-parameters.
# This instance (not the one written to full_model.h5) is the model that gets
# exported and used for prediction further down.
# Seed Python, NumPy and TensorFlow so the retraining run is repeatable.
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(768,)),  # BERT embeddings dimension
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(5, activation='softmax')  # 5 categories
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model; 20% of the training partition is held out for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Evaluate the retrained model on the test partition so the exported artifact
# has a known test accuracy of its own.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy*100:.2f}%")


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.5323 - loss: 1.1693 - val_accuracy: 0.7505 - val_loss: 0.6974
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7481 - loss: 0.7053 - val_accuracy: 0.7925 - val_loss: 0.5801
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7906 - loss: 0.5823 - val_accuracy: 0.7963 - val_loss: 0.5624
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8018 - loss: 0.5252 - val_accuracy: 0.7981 - val_loss: 0.5449
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8132 - loss: 0.4812 - val_accuracy: 0.8056 - val_loss: 0.5328
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8237 - loss: 0.4741 - val_accuracy: 0.8112 - val_loss: 0.5083
Epoch 7/20
[1m134/134[0m 

In [20]:
# Export the classifier to the ./final_model directory; the output below lists
# the `serve` endpoint and its (None, 768) -> (None, 5) signature.
model.export("final_model")

Saved artifact at 'final_model'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 768), dtype=tf.float32, name='keras_tensor_6')
Output Type:
  TensorSpec(shape=(None, 5), dtype=tf.float32, name=None)
Captures:
  133035508259424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133035508265936: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133035508640656: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133035505504800: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133035505500576: TensorSpec(shape=(), dtype=tf.resource, name=None)
  133035505506560: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [21]:
# Bundle the exported directory into a single archive for download.
# The tokenizers warning in the output appears because the shell command forks
# the process after the tokenizer has already used parallelism.
!zip -r final_model.zip final_model/

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: final_model/ (stored 0%)
  adding: final_model/variables/ (stored 0%)
  adding: final_model/variables/variables.data-00000-of-00001 (deflated 8%)
  adding: final_model/variables/variables.index (deflated 59%)
  adding: final_model/saved_model.pb (deflated 85%)
  adding: final_model/fingerprint.pb (stored 0%)
  adding: final_model/assets/ (stored 0%)


In [22]:
def predict_single_text(text, bert_model, tokenizer, classifier_model, label_encoder):
    """Classify one transaction description with the BERT + dense-head pipeline.

    Args:
        text: The description to classify (a single string).
        bert_model: Encoder whose ``last_hidden_state[:, 0, :]`` is used as the
            embedding.
        tokenizer: Tokenizer matching ``bert_model``; called with
            ``return_tensors="pt"``.
        classifier_model: Model exposing ``predict(embedding)`` that returns one
            row of class probabilities per input row.
        label_encoder: Encoder whose ``inverse_transform`` maps class indices
            back to category names.

    Returns:
        A ``(predicted_class, confidence)`` tuple, where ``confidence`` is the
        highest class probability expressed in percent.
    """
    # Tokenize the text
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True
    )

    # Run on the device the encoder already lives on. Re-deriving the device
    # from CUDA availability would mismatch a CPU-resident model on a GPU host.
    first_param = next(bert_model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Get BERT embedding (first-token hidden state), without autograd tracking.
    with torch.no_grad():
        outputs = bert_model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Get prediction: take the single row explicitly rather than reducing over
    # the flattened 2-D array.
    probabilities = classifier_model.predict(embedding)[0]
    best_index = int(np.argmax(probabilities))
    predicted_class = label_encoder.inverse_transform([best_index])[0]
    confidence = probabilities[best_index] * 100

    return predicted_class, confidence

# Smoke-test the whole pipeline (tokenizer -> BERT -> classifier -> label
# decoding) on one short query.
test_text = "spicy"
predicted_category, confidence = predict_single_text(
    text=test_text,
    bert_model=bert_model,
    tokenizer=tokenizer,
    classifier_model=model,
    label_encoder=le,
)

print("\nTest Prediction:")
print(f"Text: {test_text}")
print(f"Predicted Category: {predicted_category}")
print(f"Confidence: {confidence:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step

Test Prediction:
Text: spicy
Predicted Category: Food
Confidence: 82.49%


In [23]:
import joblib
# Persist the classifier and the fitted label encoder; the encoder is needed at
# inference time to turn predicted class indices back into category names.
# NOTE(review): joblib pickles the Keras model object, and such files must only
# be loaded from trusted sources. The model is already saved/exported above, so
# confirm this extra copy is actually consumed.
joblib.dump(model, 'classifier_model.joblib')
joblib.dump(le, 'label_encoder.joblib')

['label_encoder.joblib']