In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Load data from Excel file
def load_data_from_excel(file_path):
    df = pd.read_excel(file_path)  # Read Excel file
    texts = df['text'].tolist()    # Extract text column
    labels = df['label'].tolist()  # Extract label column
    return texts, labels

# Load data
file_path = r"T:\My Drive\000 ihsan\02 Courses\003GIT\Neural Networks and Deep Learning\10.Artificial Neural Networks with Keras\data.xlsx"  # Path to your Excel file
texts, labels = load_data_from_excel(file_path)

# Convert text labels to numeric labels , LabelEncoder() is a class from sklearn.preprocessing
label_encoder = LabelEncoder() 
# Converts the numerical labels into a PyTorch tensor with long data type 
y = torch.tensor(label_encoder.fit_transform(labels), dtype=torch.long)

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X = torch.tensor(vectorizer.fit_transform(texts).toarray(), dtype=torch.float32)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define MLP Model
class MLPTextClassifier(nn.Module):
    def __init__(self, input_size, hidden_size=16):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.5),  # Add dropout for regularization
            nn.Linear(hidden_size, len(label_encoder.classes_))  # Output classes dynamically
        )

    def forward(self, x):
        return self.model(x)

# Initialize model
model = MLPTextClassifier(X.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# Evaluate
with torch.no_grad():
    y_pred = torch.argmax(model(X_test), dim=1)
    accuracy = accuracy_score(y_test.numpy(), y_pred.numpy())
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Predict new text with confidence threshold
def predict(text, confidence_threshold=0.7):
    X_new = torch.tensor(vectorizer.transform([text]).toarray(), dtype=torch.float32)
    logits = model(X_new)
    probabilities = torch.softmax(logits, dim=1)
    confidence, pred = torch.max(probabilities, dim=1)
    if confidence.item() < confidence_threshold:
        return "Unknown"
    return label_encoder.inverse_transform([pred.item()])[0]

# Example predictions
print(predict("petrol for grass cutting"))  # Should output "GVIC Cleaning Expenses"
print(predict("random "))  # Should output "Unknown" if confidence is low

Epoch 0, Loss: 1.2448
Epoch 10, Loss: 0.9023
Epoch 20, Loss: 0.7704
Epoch 30, Loss: 0.5674
Epoch 40, Loss: 0.3616
Epoch 50, Loss: 0.1411
Epoch 60, Loss: 0.0911
Epoch 70, Loss: 0.2316
Epoch 80, Loss: 0.7164
Epoch 90, Loss: 0.1879
Test Accuracy: 0.00%
GVIC Cleaning Expenses
GVAC Academic Program expense


In [2]:
texts, labels

(['Remuneration program',
  'Inter net bill',
  'Refreshment MIlk,Snaks&Nuts',
  'petrol for grass cutting'],
 ['GVAC Academic Program expense',
  'GVAC Mobile and internet expenses',
  'GVAC Refreshment Expense',
  'GVIC Cleaning Expenses'])