In [38]:
import pandas as pd
import spacy
import torch
from torch import nn
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from torchviz import make_dot

# Load Data

In [None]:
df_cleantech = pd.read_json('data/cleantech.json')
df_cleantech['label'] = 1
df_non_cleantech = pd.read_json('data/non_cleantech.json')
df_non_cleantech['label'] = 0
df = pd.concat([df_cleantech, df_non_cleantech], ignore_index=True)
# Drop all columns except APPLN_ID and APPLN_ABSTRACT
df.drop(df.columns.difference(['APPLN_ID', 'APPLN_ABSTRACT']), 1, inplace=True)

In [9]:
# Create small examples for testing
test_text = "This is a test sentence. This is another test sentence."
test_text2 = "This is a test sentence. This is another test sentence. This is a third test sentence."

df_cleantech = pd.DataFrame({'APPLN_ID': [1], 'APPLN_ABSTRACT': [test_text], 'label': [1]})
df_non_cleantech = pd.DataFrame({'APPLN_ID': [2], 'APPLN_ABSTRACT': [test_text2], 'label': [0]})

df = pd.concat([df_cleantech, df_non_cleantech], ignore_index=True)

# Initialize Torch Model

In [30]:
tokenizer = spacy.load('en_core_web_sm')

abstract_list = df['APPLN_ABSTRACT'].tolist()

def yield_tokens(data_iter):
    for data_sample in data_iter:
        # Process the text to yield list of token strings
        yield [token.text for token in tokenizer(data_sample)]

# Build vocabulary from a list of abstracts
vocab = build_vocab_from_iterator(yield_tokens(abstract_list), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [34]:
# Define text_pipeline function
def text_pipeline(text):
    return [vocab[token.text] for token in tokenizer(text)]

# Define label_pipeline function
def label_pipeline(label):
    return label

In [48]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

train_iter = list(df[['label', 'APPLN_ABSTRACT']].itertuples(index=False, name=None))

dataloader = DataLoader(
    train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch
)

# Define NN Architecture

In [45]:
class FNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, text):
        embedded = text.float()
        hidden = self.relu(self.fc1(embedded))
        output = self.sigmoid(self.fc2(hidden))
        return output

In [11]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # Additional hidden layer
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        embedded = text.float()
        hidden = self.relu(self.fc1(embedded))
        hidden = self.relu(self.fc2(hidden))
        output = self.sigmoid(self.fc3(hidden))
        return output

In [13]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        assert torch.equal(output[-1, :, :], hidden.squeeze(0))
        return self.sigmoid(self.fc(hidden.squeeze(0)))


In [None]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, _) = self.lstm(embedded)
        return self.sigmoid(self.fc(hidden.squeeze(0)))


In [None]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim)) 
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        text = text.permute(1, 0)
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.fc(torch.cat(pooled, dim=1))
        return self.sigmoid(cat)


In [52]:
input_dim = len(vocab)
hidden_dim = 32
output_dim = 1

model = FNN(input_dim, hidden_dim, output_dim)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

# Move model and criterion to GPU, if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

# Train Model

In [53]:
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    
    for batch in dataloader:
        optimizer.zero_grad()
        
        predictions = model(batch[1]).squeeze(1)
        loss = criterion(predictions, batch[0].float())
        
        loss.backward()
        optimizer.step()

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x31 and 9x32)