In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('/Users/jiteshdewangan/Downloads/training.tsv', sep='\t')

In [3]:
df = data.sample(frac=.01, random_state=50)

df.shape

(12000, 3)

In [4]:
df.isna().sum()
df.dropna(inplace=True)
df.duplicated().sum()
df.drop_duplicates(inplace=True)

In [5]:
df['text'] = df['title'] + " " + df['description']

In [6]:
df['tokens'] = df['text'].apply(lambda x: x.lower().split())

In [7]:
X = df['tokens']
y = df['category']

In [8]:
print(X.shape)
print(y.shape)

(12000,)
(12000,)


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [10]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiteshdewangan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

In [12]:
X_train = X_train.apply(remove_stopwords)
X_test = X_test.apply(remove_stopwords)


In [27]:
X_train

185878     [seat, pan, assm, short, 1214w, zgs, numotion,...
604669     [t&m, services, engagement, -, accounting, sys...
1127937    [tee, time:, 1251033887, :, greens, fee, -, co...
800572     [black, /, s/p, line, item:, 1231512895581, va...
968979     [bvi-0627172, optional, maintenance, agreement...
                                 ...                        
691422     [04/08/2018, -, spain_lead_generation_5, maili...
243910     [philo, (order, #, 154711554), movies, -, stre...
701911     [medium, line, item:, 4127969116205, variant:,...
267929     [hbo, monthly, (order, #, 199065334), movies, ...
917767     [uktv, (order, #, 120280811), movies, -, strea...
Name: tokens, Length: 9600, dtype: object

In [13]:
tokenizedtext = X_train.tolist()

In [25]:
print(tokenizedtext)

[['seat', 'pan', 'assm', 'short', '1214w', 'zgs', 'numotion', '-', 'baltimore', 'mobility-enhancing', 'equipment', '-', 'prescription'], ['t&m', 'services', 'engagement', '-', 'accounting', 'system', 'uses', 'blended', 'rates.', 'please', 'see', 'detailed', 'professional', 'services', 'invoices', 'questions.', '-', 'tjhong,', 'sandoz;shin,', 'junghun', 'computer', 'software', 'implementation', '-', 'prewritten', 'software', '-', 'electronically', 'downloaded'], ['tee', 'time:', '1251033887', ':', 'greens', 'fee', '-', 'composite', 'admissions', '-', 'green', 'fees', 'privately', 'owned', 'golf', 'course'], ['black', '/', 's/p', 'line', 'item:', '1231512895581', 'variant:', '8111871688797', 'clothing', '&', 'related', 'products', '(b2c)', '-', 'general'], ['bvi-0627172', 'optional', 'maintenance', 'agreements', 'related', 'sale', 'tangible', 'personal', 'property'], ['zmartbit-ci-14-z7nzge8yuiwg0qn', 'cloud', 'services', '-', 'saas', '-', 'service', 'agreement', '-', 'illinois', 'purpos

In [26]:
from gensim.models import Word2Vec
word2vec_model = Word2Vec(sentences=tokenizedtext,
                           vector_size=100, window=5, min_count=1, workers=4)

In [60]:
# Function to convert tokens to vectors
import numpy as np
def tokens_to_vectors(tokens, model, vector_dim=100, max_length=20):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    # Pad or truncate to fixed length
    if len(vectors) < max_length:
        vectors += [np.zeros(vector_dim)] * (max_length - len(vectors))
    else:
        vectors = vectors[:max_length]
    return np.array(vectors)

In [62]:
max_length = 20  # Maximum sequence length
train_data = X_train.apply(lambda x: tokens_to_vectors(x,word2vec_model, max_length=max_length))
test_data = X_test.apply(lambda x: tokens_to_vectors(x,word2vec_model, max_length=max_length))

In [64]:
train_data.sample(5)

12594      [[-0.0040963138453662395, 0.008078856393694878...
836607     [[-0.0835082158446312, 0.30049842596054077, 0....
478333     [[-0.10615251958370209, 0.3270508944988251, 0....
1123983    [[0.18592992424964905, 0.05961569398641586, 0....
47243      [[0.001862774253822863, 0.016737179830670357, ...
Name: tokens, dtype: object

In [72]:
from sklearn import preprocessing
l = preprocessing.LabelEncoder()
y_train = l.fit_transform(y_train)
y_test = l.transform(y_test)

In [74]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
class TextDataset(Dataset):
    def __init__(self, vectors, targets):
        self.vectors = torch.tensor(np.stack(vectors), dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.long)
    
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, idx):
        return self.vectors[idx], self.targets[idx]

In [75]:
# Create datasets and dataloaders
train_dataset = TextDataset(train_data, y_train)
test_dataset = TextDataset(test_data, y_test)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [81]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden[-1])
        return out

# Model configuration
input_dim = 100  # Embedding dimension
hidden_dim = 64  # LSTM hidden layer size
output_dim = 2   # Number of classes
num_epochs = 1
learning_rate = 0.001


In [82]:
# Initialize model, loss, and optimizer
model = LSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [83]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for vectors, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(vectors)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/1, Loss: 0.1095


In [84]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for vectors, targets in test_loader:
        outputs = model(vectors)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9708
