In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import string
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tqdm.pandas()


In [2]:
data = pd.read_csv('./archive/BooksDataset.csv')

In [3]:
train_data = data[['Title','Category']]
train_data.head()

Unnamed: 0,Title,Category
0,Goat Brothers,"History , General"
1,The Missing Person,"Fiction , General"
2,Don't Eat Your Heart Out Cookbook,"Cooking , Reference"
3,When Your Corporate Umbrella Begins to Leak: A...,
4,Amy Spangler's Breastfeeding : A Parent's Guide,


# Explore data

In [4]:
train_data.isna().sum()

Title           0
Category    26170
dtype: int64

In [5]:
train_data = train_data.dropna(axis=0)

In [6]:
train_data

Unnamed: 0,Title,Category
0,Goat Brothers,"History , General"
1,The Missing Person,"Fiction , General"
2,Don't Eat Your Heart Out Cookbook,"Cooking , Reference"
6,Chicken Soup for the Soul: 101 Stories to Open...,"Self-help , Personal Growth , Self-Esteem"
7,Journey Through Heartsongs,"Poetry , General"
...,...,...
103071,Creating Web Pages Simplified (3-D Visual Series),"Computers , Internet , General"
103072,EVA: The Real Key to Creating Wealth,"Business & Economics , Corporate Finance , Ge..."
103075,The Essentials of Spanish (REA's Language Seri...,"Foreign Language Study , Spanish"
103078,My Land of Israel,"Juvenile Nonfiction , People & Places , Middl..."


# Pré-Process data

In [7]:
eng = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.STOP_WORDS

  _C._set_default_tensor_type(t)


In [8]:
def clean(text):
  tweet = text.lower().strip()
  document = eng(text)

  words = []
  for token in document:
    words.append(token.text)

  words = [word for word in words if word not in stop_words and word not in string.punctuation]
  words = ' '.join([str(element) for element in words])

  return words

In [9]:
text = clean('	Creating Web Pages Simplified (3-D Visual Series)')
text

'\t Creating Web Pages Simplified 3 D Visual Series'

In [10]:
# Step 1: Tokenization
def tokenize_text(text):
    # Use your preferred tokenizer (e.g., SpaCy, NLTK, or a simple whitespace tokenizer)
    tokenizer = get_tokenizer('basic_english')
    tokens = tokenizer(text)
    return tokens

# Step 2: Build Vocabulary
def build_vocab(data, min_freq=1):
    # data is a list of text samples
    def yield_tokens(data):
        for text in data:
            yield tokenize_text(text)

    # Build vocabulary from the tokenized data
    vocab = build_vocab_from_iterator(yield_tokens(data), min_freq=min_freq)
    return vocab

# Step 3: Numericalize Text
def numericalize_text(text, vocab):
    # Convert tokens to numerical indices using the vocabulary
    numericalized_text = [vocab[token] for token in tokenize_text(text)]
    return numericalized_text

# Step 4: Padding Sequences
def pad_sequence(sequence, max_len):
    # Pad or truncate sequences to a fixed length
    if len(sequence) < max_len:
        sequence += [0] * (max_len - len(sequence))
    else:
        sequence = sequence[:max_len]
    return sequence

In [14]:
from sklearn.model_selection import train_test_split
text, _,label , _ = train_test_split(train_data['Title'],train_data['Category'], test_size = 0.85,shuffle=True)

In [21]:
# Step 1: Clean text
text = text.progress_apply(lambda x : clean(x))

100%|██████████| 11536/11536 [00:53<00:00, 216.82it/s]


In [22]:
# Step 2: Tokenization
tokenized_data = text.progress_apply(lambda x : tokenize_text(x))

100%|██████████| 11536/11536 [00:00<00:00, 52348.88it/s]


In [23]:
# Step 3: Build Vocabulary
vocab = build_vocab(text)

In [28]:
# Step 4: Numericalize Text
numericalized_data = text.progress_apply(lambda x: numericalize_text(x, vocab))

100%|██████████| 11536/11536 [00:00<00:00, 68709.97it/s]


In [30]:
# Step 5: Padding Sequences
max_len = max(len(seq) for seq in numericalized_data)
padded_data = [pad_sequence(seq, max_len) for seq in numericalized_data]

In [31]:
# Step 6: Convert to PyTorch tensor
tensor_data = torch.tensor(padded_data)

# pré process label

In [68]:
labels = [clas.split()[0] for clas in label]

In [72]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Step 1: Label Encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [73]:
encoded_labels

array([26,  4,  4, ..., 23, 15, 15], dtype=int64)

# model

In [155]:
class DCNN(nn.Module):
    def __init__(self,num_filters, vocab_size, embedding_dim,hidden_dim, output_dim):
        super(DCNN,self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.bigram = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=2)
        self.trigram = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=3)
        self.fourgram = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=4)
        self.pool = nn.MaxPool1d(2)

        self.dense_1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.last_dense = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
    def forward(self,text):
        x = self.embed(text)
        x_1 = nn.functional.relu(self.bigram(x))
        x_1 = self.pool(x_1)
        x_2 = nn.functional.relu(self.trigram(x_1))
        x_2 = self.pool(x_2)
        x_3 = nn.functional.relu(self.fourgram(x_2))
        x_3 = self.pool(x_3)
        
        merged = torch.cat([x_1, x_2, x_3], dim =1) # (batch_size, 3 * nb_filters)
        x = self.dense_1(merged)
        x = self.relu(x)
        return self.softmax(x)

In [156]:
# Hyperparameters
nb_filters = 128
vocab_size = len(vocab)
embedding_dim = 32
hidden_dim = 64
output_dim = len(set(encoded_labels))

# Create the model
model = DCNN(nb_filters,vocab_size, embedding_dim, hidden_dim, output_dim)


In [157]:
emb = model(tensor_data[0])

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x22 and 32x64)

In [151]:
len(emb[0])

22

In [94]:
nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs) for fs in filter_sizes


48