In [5]:
import pandas as pd
import spacy
import torch
from torch import nn
# from torchtext.data import BucketIterator
from torchtext.vocab import build_vocab_from_iterator
from torchviz import make_dot

# Load Data

In [None]:
# Load both corpora and tag each row with its class:
# 1 = cleantech, 0 = non-cleantech.
df_cleantech = pd.read_json('data/cleantech.json')
df_cleantech['label'] = 1
df_non_cleantech = pd.read_json('data/non_cleantech.json')
df_non_cleantech['label'] = 0
df = pd.concat([df_cleantech, df_non_cleantech], ignore_index=True)

# Keep only the id, the abstract text and the label.
# The previous `drop(columns.difference([...]), 1, inplace=True)` also removed
# the freshly created `label` column (it was not in the keep-list) and relied
# on a positional `axis` argument that pandas >= 2.0 no longer accepts.
# Selecting the columns explicitly avoids both problems and raises a KeyError
# if an expected column is missing from the input files.
df = df[['APPLN_ID', 'APPLN_ABSTRACT', 'label']]

# Initialize Torchtext Model

In [None]:
# Set up fields for text and label
# NOTE(review): `Field` is not imported in the visible import cell; presumably
# it comes from the legacy torchtext data API -- confirm and add the import.
# TEXT is configured with the spaCy tokenizer ('en_core_web_sm') and lowercasing.
TEXT = Field(tokenize='spacy', tokenizer_language='en_core_web_sm', lower=True)
# LABEL is declared non-sequential with no vocabulary, i.e. the label values
# are presumably expected to be numeric already -- TODO confirm.
LABEL = Field(sequential=False, use_vocab=False)

# Define columns in DataFrame
# Each pair maps a column name to the Field that should process it.
fields = [('APPLN_ABSTRACT', TEXT), ('label', LABEL)]

In [None]:
# Create a Dataset
# NOTE(review): `TabularDataset` is not imported in the visible import cell.
# The call passes `path=None` together with a `dataframe=df` keyword; confirm
# that the TabularDataset implementation in use actually accepts a DataFrame
# this way, otherwise the frame must be written to CSV first or wrapped in a
# custom Dataset.
dataset = TabularDataset(path=None, format='csv', fields=fields, skip_header=True, csv_reader_params={'header': None}, dataframe=df)

In [None]:
# Build the vocabulary
# The vocabulary for TEXT is built from the full `dataset`.
TEXT.build_vocab(dataset)

# Create iterators for data loading
# NOTE(review): `BucketIterator` is not imported (its import is commented out
# in the import cell) -- confirm where it should come from.
# NOTE(review): `train_iter` and `test_iter` are built from the *same*
# `dataset` with identical settings, so there is no held-out split here;
# any evaluation on `test_iter` would be on training data.
# Both iterators use batches of 64, sort examples by abstract length, and
# place tensors on 'cuda:0' when CUDA is available, otherwise on the CPU.
train_iter = BucketIterator(dataset, batch_size=64, sort_key=lambda x: len(x.APPLN_ABSTRACT), device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'), sort=True, sort_within_batch=True)
test_iter = BucketIterator(dataset, batch_size=64, sort_key=lambda x: len(x.APPLN_ABSTRACT), device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'), sort=True, sort_within_batch=True)

# Define NN Architecture

In [6]:
class FNN(nn.Module):
    """Feed-forward network with one hidden layer and a sigmoid output.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FNN, self).__init__()
        # Layers are registered in the order they are applied in `forward`.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Return sigmoid activations for `text`.

        The input is cast to float before the first linear layer, so integer
        tensors are accepted. The result has values in (0, 1), i.e. it is
        already passed through a sigmoid and is not a raw logit.
        """
        features = text.float()
        activated = self.relu(self.fc1(features))
        scores = self.fc2(activated)
        return self.sigmoid(scores)

In [11]:
class MLP(nn.Module):
    """Multi-layer perceptron with two hidden layers and a sigmoid output.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output units.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        # Second hidden layer keeps the same width as the first.
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Return sigmoid activations for `text`.

        The input is cast to float, passed through two ReLU-activated linear
        layers, then through the output layer and a sigmoid.
        """
        activations = text.float()
        # Apply the two hidden layers in sequence, each followed by ReLU.
        for layer in (self.fc1, self.fc2):
            activations = self.relu(layer(activations))
        return self.sigmoid(self.fc3(activations))

In [13]:
class RNN(nn.Module):
    """Embedding -> single-layer RNN -> linear -> sigmoid classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of output units.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Classify `text` from the RNN's final hidden state.

        Assumes `text` is a 2-D tensor of token indices laid out as
        (seq_len, batch), since the RNN is built without `batch_first`
        -- TODO confirm against the data iterator.
        """
        token_vectors = self.embedding(text)
        all_states, final_state = self.rnn(token_vectors)
        final_state = final_state.squeeze(0)
        # Sanity check: the last time step of the output sequence must equal
        # the final hidden state returned by the RNN.
        assert torch.equal(all_states[-1, :, :], final_state)
        return self.sigmoid(self.fc(final_state))


In [None]:
class LSTM(nn.Module):
    """Embedding -> single-layer LSTM -> linear -> sigmoid classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Classify `text` from the LSTM's final hidden state.

        Assumes `text` is a 2-D tensor of token indices laid out as
        (seq_len, batch), since the LSTM is built without `batch_first`
        -- TODO confirm against the data iterator.
        """
        token_vectors = self.embedding(text)
        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded.
        _, (final_hidden, _) = self.lstm(token_vectors)
        scores = self.fc(final_hidden.squeeze(0))
        return self.sigmoid(scores)


In [None]:
class CNN(nn.Module):
    """Text CNN: parallel convolutions over token embeddings, max-pooled.

    One `Conv2d` is created per entry in `filter_sizes`; each spans
    `fs` consecutive tokens across the full embedding width. The max-pooled
    feature maps are concatenated and fed to a linear layer plus sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        n_filters: Number of output channels per convolution.
        filter_sizes: Iterable of window heights (in tokens), one per conv.
        output_dim: Number of output units.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Return sigmoid activations for a batch of token-index sequences.

        `text` must be 2-D; its two axes are swapped before embedding, so it
        is presumably (seq_len, batch) on input -- TODO confirm against the
        data iterator. The sequence length has to be at least the largest
        filter size, otherwise the convolution will raise.
        """
        # `F` was never imported in this notebook; use `nn.functional`, which
        # is reachable through the existing `from torch import nn` import.
        functional = nn.functional

        text = text.permute(1, 0)
        embedded = self.embedding(text)
        # Add a singleton channel axis so Conv2d sees a 1-channel "image".
        embedded = embedded.unsqueeze(1)
        # Each conv collapses the embedding axis to width 1, which is squeezed.
        conved = [functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-pool over the whole remaining (token) axis of each feature map.
        pooled = [functional.max_pool1d(fmap, fmap.shape[2]).squeeze(2) for fmap in conved]
        scores = self.fc(torch.cat(pooled, dim=1))
        return self.sigmoid(scores)


In [None]:
# Model hyperparameters: input width is tied to the vocabulary size.
input_dim = len(TEXT.vocab)
hidden_dim = 256
output_dim = 1

model = FNN(input_dim, hidden_dim, output_dim)

optimizer = torch.optim.Adam(model.parameters())
# FNN.forward already ends in a sigmoid, so its outputs are probabilities.
# BCEWithLogitsLoss would apply a second sigmoid internally and distort the
# loss; BCELoss is the criterion that matches probability outputs.
criterion = nn.BCELoss()

# Move model and criterion to GPU, if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

# Train Model

In [None]:
# NOTE(review): `num_epochs` was not defined in any visible cell; 5 is a
# placeholder so the loop runs on a fresh kernel -- tune as needed.
num_epochs = 5

for epoch in range(num_epochs):
    model.train()

    # The iterator is created as `train_iter` (not `train_iterator`), and the
    # text field is registered under the column name 'APPLN_ABSTRACT', so the
    # batch exposes it as `batch.APPLN_ABSTRACT` rather than `batch.text`.
    for batch in train_iter:
        optimizer.zero_grad()

        # Drop the trailing output dimension so predictions line up with the
        # 1-D label tensor.
        predictions = model(batch.APPLN_ABSTRACT).squeeze(1)
        loss = criterion(predictions, batch.label.float())

        loss.backward()
        optimizer.step()