In [46]:
import re
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import spacy
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
from sklearn.model_selection import train_test_split

# Set script parameters
SEED = 42 # Set for reproducibility
MAX_SEQ_LENGTH = 512  # Maximum sequence length
BATCH_SIZE = 8
spacy_en = spacy.load('en_core_web_sm')
stop_words = nltk.corpus.stopwords.words('english')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.manual_seed(SEED)
# Set random seed for reproducibility on GPU
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

**Example Pytorch NN Classifier Script, built using:**
- https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
- https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/
- https://medium.com/@spandey8312/text-classification-using-custom-data-and-pytorch-d88ba1087045
- https://www.deeplearningwizard.com/deep_learning/intro/

**Ideas for improvement:**
- Use a different tokenizer
- Pytorch (use a different optimizer, scheduler, loss function, learning rate, epochs, batch size, etc.)
- Models (adjust model parameter, layer, activation function, etc.)

# Load Data

In [21]:
df_cleantech = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_cleantech_granted_abstract_metadata.json')
df_cleantech['label'] = 1
df_non_cleantech = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_non_cleantech_granted_abstract_metadata.json')
df_non_cleantech['label'] = 0
df_cleantech = df_cleantech[df_cleantech['appln_abstract'] != '']
df_non_cleantech = df_non_cleantech[df_non_cleantech['appln_abstract'] != '']
df_cleantech.dropna(inplace=True)
df_non_cleantech.dropna(inplace=True)

In [22]:
# df = pd.concat([df_cleantech, df_non_cleantech], ignore_index=True)
df = pd.concat([df_cleantech.sample(50000, random_state=42), df_non_cleantech.sample(50000, random_state=42)], ignore_index=True)
df = df[['appln_id', 'appln_abstract', 'label']]

# Data Cleaning

In [23]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [24]:
df['appln_abstract'] = df['appln_abstract'].astype(str)
df['appln_abstract'] = df['appln_abstract'].progress_apply(preprocess_text)

100%|██████████| 100000/100000 [00:26<00:00, 3746.61it/s]


# Data Prepraration

## Tokenization

In [25]:
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
df['appln_abstract_tokens'] = df['appln_abstract'].progress_apply(lambda x: tokenizer(x))

100%|██████████| 100000/100000 [00:16<00:00, 6100.77it/s]


## Build Vocabulary

In [26]:
vocab = build_vocab_from_iterator(df['appln_abstract_tokens'].progress_apply(lambda x: x), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

100%|██████████| 100000/100000 [00:00<00:00, 1280621.40it/s]


In [27]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

## Create Batches and DataLoader

In [None]:
# Function to collate data into batches for FeedForward Neural Network and Convolutional Neural Network
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=vocab['<pad>'])
    return label_list.to(device), text_list.to(device)

In [37]:
# Function to collate data into batches for Recurrent Neural Network and Long Short-Term Memory Neural Network
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for (_text, _label) in batch:
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        
        # Only add samples where the length is greater than 0
        if len(processed_text) > 0:
            label_list.append(_label)
            text_list.append(processed_text)
            lengths.append(len(processed_text))
    
    # Proceed only if there are samples with non-zero length
    if lengths:
        label_list = torch.tensor(label_list, dtype=torch.int64)
        text_list = pad_sequence(text_list, batch_first=True, padding_value=vocab['<pad>'])
        lengths = torch.tensor(lengths)
        return label_list.to(device), text_list.to(device), lengths
    else:
        # Return None or appropriate default values if all samples had length 0
        return None, None, None

In [38]:
df = df.rename(columns={'appln_abstract': 'text'})
df_torch = df[['text', 'label']].reset_index(drop=True)

In [39]:
class TextClassificationDataset(Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)
    def __getitem__(self, index):
        return tuple(self.df.iloc[index])

In [40]:
train_iter, test_iter = train_test_split(df_torch, test_size=0.1, random_state=42)
train_iter, val_iter = train_test_split(train_iter, test_size=0.1, random_state=42)
train_iter = TextClassificationDataset(train_iter.reset_index(drop=True))
test_iter = TextClassificationDataset(test_iter.reset_index(drop=True))
val_iter = TextClassificationDataset(val_iter.reset_index(drop=True))
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
val_dataset = to_map_style_dataset(val_iter)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

# Define NN Architecture

In [None]:
class FNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(FNN, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc1 = nn.Linear(embed_dim, 128)
        self.fc2 = nn.Linear(128, num_class)

    def forward(self, text):
        embedded = self.embedding(text)
        x = F.relu(self.fc1(embedded))
        return self.fc2(x)
# Might also introduce other non-linearities such as Tanh or Sigmoid
# Could introduce Dropout layers to prevent overfitting
# Could introduce more layers or increase the number of neurons in each layer (embed_dim)

In [41]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)

    def forward(self, text, lengths):
        embedded = self.embedding(text)
        packed_input = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed_input)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        final_feature_map = output[torch.arange(output.size(0)), lengths - 1]

        return self.fc(final_feature_map)

In [47]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)

    def forward(self, text, lengths):
        embedded = self.embedding(text)
        packed_input = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_input)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        final_feature_map = output[torch.arange(output.size(0)), lengths - 1]

        return self.fc(final_feature_map)

In [None]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, num_class):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embed_dim)) for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_class)

    def forward(self, text):
        embedded = self.embedding(text).unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = torch.cat(pooled, dim=1)
        return self.fc(cat)

# Train Model

In [48]:
# Instantiate the model
# model = FNN(len(vocab), 256, 1)
# model = CNN(len(vocab), 256, 100, [3, 4, 5], 1) # Filter sizes correspond to trigrams, 4-grams and 5-grams, Number of filters corresponds to number of patterns per each n-gram
# model = RNN(len(vocab), 256, 128, 1)
model = LSTM(len(vocab), 256, 128, 1)

EPOCHS = 10
LR = 0.1

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

model = model.to(device)

## Training Function

In [None]:
# Training Loop for FeedForward Neural Network and Convolutional Neural Network
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 1000
    for idx, (label, text) in enumerate(dataloader):
        label, text = label.to(device), text.to(device)
        optimizer.zero_grad()
        predicted_logits = model(text) 
        loss = criterion(predicted_logits.squeeze(1), label.float()) # For FNN
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        predicted_labels = (torch.sigmoid(predicted_logits) > 0.5).long()
        total_acc += (predicted_labels.squeeze(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('Epoch: {:03d} | Batch: {:03d}/{:03d} | Loss: {:03f} | Accuracy: {:.3f}'.format(epoch, idx, len(dataloader), loss.item(), total_acc/total_count))
            total_acc, total_count = 0, 0

In [49]:
# Training Loop for Recurrent Neural Network and Long Short-Term Memory Neural Network
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 1000
    for idx, (label, text, lengths) in enumerate(dataloader):
        label, text = label.to(device), text.to(device)
        optimizer.zero_grad()
        predicted_logits = model(text, lengths)
        loss = criterion(predicted_logits.squeeze(1), label.float())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        predicted_labels = (torch.sigmoid(predicted_logits) > 0.5).long()
        total_acc += (predicted_labels.squeeze(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('Epoch: {:03d} | Batch: {:03d}/{:03d} | Loss: {:03f} | Accuracy: {:.3f}'.format(epoch, idx, len(dataloader), loss.item(), total_acc/total_count))
            total_acc, total_count = 0, 0

## Evaluation Function

In [None]:
# Evaluation Loop for FeedForward Neural Network and Convolutional Neural Network
def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            label, text = label.to(device), text.to(device)
            predicted_logits = model(text)
            loss = criterion(predicted_logits.squeeze(1), label.float()) # For FNN
            predicted_labels = (torch.sigmoid(predicted_logits) > 0.5).long()
            total_acc += (predicted_labels.squeeze(1) == label).sum().item()
            total_count += label.size(0)
    
        print('End of epoch: {:03d} | Loss: {:03f} | Accuracy: {:.3f}'.format(epoch, loss.item(), total_acc/total_count))
        return loss.item(), total_acc/total_count

In [50]:
# Evaluation Loop for Recurrent Neural Network and Long Short-Term Memory Neural Network
def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (label, text, lengths) in enumerate(dataloader):
            label, text = label.to(device), text.to(device)
            predicted_logits = model(text, lengths)
            loss = criterion(predicted_logits.squeeze(1), label.float())
            predicted_labels = (torch.sigmoid(predicted_logits) > 0.5).long()
            total_acc += (predicted_labels.squeeze(1) == label).sum().item()
            total_count += label.size(0)
    
        print('End of epoch: {:03d} | Loss: {:03f} | Accuracy: {:.3f}'.format(epoch, loss.item(), total_acc/total_count))
        return loss.item(), total_acc/total_count

## Training Loop

In [51]:
for epoch in range(1, EPOCHS + 1):
    train(train_dataloader)
    print("-"*60)
    accu_val = evaluate(val_dataloader)
    print("-"*60)
    scheduler.step()

Epoch: 001 | Batch: 1000/10125 | Loss: 0.760616 | Accuracy: 0.506
Epoch: 001 | Batch: 2000/10125 | Loss: 1.405729 | Accuracy: 0.507
Epoch: 001 | Batch: 3000/10125 | Loss: 0.858329 | Accuracy: 0.505
Epoch: 001 | Batch: 4000/10125 | Loss: 0.747641 | Accuracy: 0.509
Epoch: 001 | Batch: 5000/10125 | Loss: 0.641245 | Accuracy: 0.510
Epoch: 001 | Batch: 6000/10125 | Loss: 0.931087 | Accuracy: 0.520
Epoch: 001 | Batch: 7000/10125 | Loss: 1.183208 | Accuracy: 0.515
Epoch: 001 | Batch: 8000/10125 | Loss: 1.720895 | Accuracy: 0.525
Epoch: 001 | Batch: 9000/10125 | Loss: 0.760823 | Accuracy: 0.512
Epoch: 001 | Batch: 10000/10125 | Loss: 0.912768 | Accuracy: 0.501
------------------------------------------------------------
End of epoch: 001 | Loss: 1.254563 | Accuracy: 0.506
------------------------------------------------------------
Epoch: 002 | Batch: 1000/10125 | Loss: 0.639118 | Accuracy: 0.515
Epoch: 002 | Batch: 2000/10125 | Loss: 0.743480 | Accuracy: 0.528
Epoch: 002 | Batch: 3000/10125 |