In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('data.json') as f:
    data = json.load(f)

In [3]:
df = pd.DataFrame(data)

In [4]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [5]:
# Function to convert a description to its BERT embedding
def description_to_bert_embedding(description):
    inputs = tokenizer(description, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

In [6]:
# Convert descriptions to BERT embeddings
X = np.array([description_to_bert_embedding(desc) for desc in df["description"]])
X = X.reshape(X.shape[0], -1)  # Flatten the embeddings
y = df["category"]

In [7]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Define the parameter grid for Random Forest
param_grid = {
    "n_estimators": [50, 100, 200],  # Number of trees in the forest
    "max_depth": [None, 10, 20, 30],  # Maximum depth of the tree
    "min_samples_split": [2, 5, 10],  # Minimum number of samples required to split a node
    "min_samples_leaf": [1, 2, 4],  # Minimum number of samples required at each leaf node
    "max_features": ["sqrt", "log2"],  # Number of features to consider at each split
}

In [9]:
# Create the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring="accuracy",  # Metric to optimize
    n_jobs=-1,  # Use all available CPU cores
    verbose=2,  # Print progress
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the model with the best hyperparameters
rf_model = grid_search.best_estimator_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, max_f

In [10]:
# Evaluate
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

      Clothing       1.00      1.00      1.00         2
        Dining       1.00      1.00      1.00         4
     Education       1.00      1.00      1.00         1
 Entertainment       1.00      1.00      1.00         2
     Groceries       0.75      1.00      0.86         3
        Health       0.00      0.00      0.00         1
Other Expenses       0.67      1.00      0.80         2
 Rent/Mortgage       1.00      1.00      1.00         2
Transportation       1.00      0.67      0.80         3

      accuracy                           0.90        20
     macro avg       0.82      0.85      0.83        20
  weighted avg       0.88      0.90      0.88        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 90.00%


In [12]:
import joblib

# Save the Random Forest model
joblib.dump(rf_model, 'random_forest_model.pkl')

# Save the BERT tokenizer and model
tokenizer.save_pretrained('bert_tokenizer')
model.save_pretrained('bert_model')

In [13]:
# Predict new description
new_description = "pathao fare"
new_description_embedding = description_to_bert_embedding(new_description)
predicted_category = rf_model.predict(new_description_embedding)
print(f"Predicted Category: {predicted_category[0]}")

Predicted Category: Transportation
