In [1]:
import json
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import DataLoader

In [2]:
class NeuralNetwork(nn.Module):
    """Fully connected network that outputs raw logits for binary classification.

    The stack is ``n_layers`` hidden ``nn.Linear`` layers, each followed by
    ``nn.LeakyReLU``, and a final ``nn.Linear`` projecting to ``n_outputs``.
    With ``n_layers <= 0`` the network is a single linear layer.

    Args:
        n_inputs: Number of input features.
        n_outputs: Number of output logits. Defaults to 1.
        n_layers: Number of hidden layers. Defaults to 1.
        n_neurons_per_layer: Sequence giving the width of each hidden layer.
            Required, with length ``n_layers``, whenever ``n_layers > 0``.
        leaky_slope: Negative slope passed to every ``nn.LeakyReLU``.

    Raises:
        ValueError: If ``n_layers > 0`` and ``n_neurons_per_layer`` is missing,
            has no length, or its length differs from ``n_layers``.
    """

    def __init__(self, n_inputs, n_outputs = 1, n_layers = 1, n_neurons_per_layer = None, leaky_slope = 0.01):
        if n_layers > 0:
            # Explicit checks instead of `assert`: assertions are stripped when
            # Python runs with -O, which would let bad arguments through.
            try:
                widths_ok = (
                    n_neurons_per_layer is not None
                    and len(n_neurons_per_layer) == n_layers
                )
            except Exception as e:
                # e.g. an int was passed, so len() is not defined for it.
                raise ValueError("provide numbers of neurons per layer defined by n_layers") from e
            if not widths_ok:
                raise ValueError("provide numbers of neurons per layer defined by n_layers")

        super().__init__()
        # Not applied in forward(); kept so the registered submodules (and the
        # printed module repr) stay unchanged.
        self.flatten = nn.Flatten()
        if n_layers >= 1:
            stack = [ nn.Linear( n_inputs, n_neurons_per_layer[0] ) ]
            for k in range(n_layers-1):
                stack += [ nn.LeakyReLU(leaky_slope), nn.Linear(n_neurons_per_layer[k], n_neurons_per_layer[k+1]) ]
            stack += [ nn.LeakyReLU(leaky_slope), nn.Linear(n_neurons_per_layer[n_layers-1], n_outputs) ]
        else:
            stack = [ nn.Linear( n_inputs, n_outputs ) ]

        self.linear_relu_stack = nn.Sequential( *stack )
        self.prob_predictor = nn.Sigmoid()
        # Loss works on logits directly, so forward() does not apply the sigmoid.
        self.loss = nn.BCEWithLogitsLoss()
        # Replace PyTorch's default initialisation: every weight and bias is
        # redrawn uniformly from [-0.5, 0.5).
        for param in self.parameters():
            param.data.uniform_(-0.5,0.5)

    def forward(self, x):
        """Return the raw logits for ``x``."""
        return self.linear_relu_stack(x)

    def predict(self, x):
        """Return sigmoid probabilities for ``x``."""
        logits = self.forward(x)
        return self.prob_predictor(logits)

    def loss_fn(self, x, y):
        """Return the binary cross-entropy (with logits) between the model output for ``x`` and targets ``y``."""
        logits = self.forward(x)
        return self.loss(logits, y)

In [3]:
def load_dataset(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed as one JSON document. Integers are parsed
    as ``str`` (``parse_int=str``), so integer values come back as strings.

    Args:
        filepath: Path of the file to read.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        return [ json.loads(line, parse_int = str) for line in f if line.strip() ]

def get_vectorizer( texts: list[str], *, method="countvectorize", **kwargs ) -> CountVectorizer:
    """Build a text vectorizer and fit its vocabulary on ``texts``.

    Args:
        texts (list[str]): One string per document.
        method (str, optional): ``"countvectorize"`` to use ``CountVectorizer``
            or ``"tfidf"`` to use ``TfidfVectorizer``. Defaults to
            ``"countvectorize"``.
        **kwargs: Keyword arguments forwarded to the vectorizer constructor.
            If ``token_pattern`` is not supplied, a pattern that treats every
            run of non-whitespace characters as a token is used.

    Raises:
        ValueError: If ``method`` is not one of the supported values.

    Returns:
        CountVectorizer: The fitted vectorizer (a ``TfidfVectorizer`` instance
        when ``method == "tfidf"``).
    """
    # We want single digits to be tokenized, so by default everything except
    # whitespace is a token. An explicit caller-supplied token_pattern is
    # respected rather than silently overwritten.
    kwargs.setdefault('token_pattern', r'\S+')
    if method == "countvectorize":
        vectorizer = CountVectorizer(**kwargs)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(**kwargs)
    else:
        raise ValueError(f"{method} is not a supported method.")
    # Use texts to initialize the vocabulary of the vectorizer.
    vectorizer.fit(texts)
    return vectorizer

def sentencify(text: list) -> str:
    """Join a list of string tokens into one space-separated string."""
    return " ".join(text)

def loss(clf, X, y):
    """Mean binary cross-entropy of ``clf`` on ``(X, y)``.

    Args:
        clf: Object exposing ``predict_log_proba(X)`` that returns a
            two-column array of log-probabilities.
        X: Features passed straight to ``clf.predict_log_proba``.
        y: Array of 0/1 targets.

    Returns:
        The summed per-sample loss divided by ``y.size``.
    """
    log_probs = clf.predict_log_proba(X)
    # assumes column 0 is class 0 and column 1 is class 1 — TODO confirm against clf.classes_
    log_p_negative, log_p_positive = log_probs[:,0], log_probs[:,1]
    per_sample = -y*log_p_positive - (1-y)*log_p_negative
    return per_sample.sum()/y.size

class DenseTransformer(TransformerMixin):
    """Stateless transformer that returns the dense form of its input via ``toarray()``."""

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.toarray()``; ``y`` and ``fit_params`` are ignored."""
        dense = X.toarray()
        return dense

In [4]:
# Load the domain-2 training records (one JSON document per line of the file).
dataset_2 = load_dataset("domain2_train_data.json")
# Join each record's 'text' token list into one whitespace-separated string for the vectorizer.
datatexts_2 = [ sentencify(instance['text']) for instance in dataset_2 ]

In [5]:
# Fit a TF-IDF vectorizer over unigrams, bigrams and trigrams of the whole dataset.
# NOTE(review): the vectorizer is fitted before any train/test split, so its vocabulary and
# IDF weights also see the documents later used for evaluation — confirm this is acceptable.
tfidf_vectorizer = get_vectorizer( texts = datatexts_2,
                                    method='tfidf',
                                    use_idf=True,
                                    ngram_range=(1,3),
                                    max_df=0.995, # Drop terms present in more than 99.5% of documents (near-ubiquitous tokens carry little signal).
                                    min_df=10, # Drop terms present in fewer than 10 documents, as very rare terms may lead to low prediction accuracy.
                                    )
# Report the size of the fitted vocabulary.
print(f"no features: {tfidf_vectorizer.get_feature_names_out().size}")

no features: 91931


In [6]:
# Dense TF-IDF matrix, one row per document.
# NOTE(review): densifying allocates n_docs x n_features float64 values (several GB for the
# shape recorded below); the later torch conversion needs a dense array, so it is kept here.
X2 = tfidf_vectorizer.transform( datatexts_2 ).toarray()

# Labels are assigned by position: the first 1500 records are class 1, the remaining 11500 class 0.
# NOTE(review): presumably this mirrors the ordering of the data file — verify against the dataset.
y = np.array([1]*1500 + [0]*11500)
if y.size != len(dataset_2):
    raise ValueError(f"expected {y.size} records to match the positional labels, got {len(dataset_2)}")

# Two extra features per document: its token count and the squared token count,
# each divided by its column maximum.
n_tokens = np.array([ len(instance['text']) for instance in dataset_2 ], dtype=float)
lengths = np.column_stack( (n_tokens, n_tokens**2) )
lengths /= lengths.max(axis=0)

# Append the length features as the last two columns.
X2 = np.hstack( (X2, lengths) )
X2.shape

(13000, 91933)

In [7]:
# Fit a logistic regression (inverse regularisation strength C=0.1) on all features;
# its coefficients drive the feature selection below.
# NOTE(review): this is fitted on the full X2, including rows that later end up in the test
# split, so the selection step sees evaluation data — confirm this is acceptable.
selector_clf = LogisticRegression(C=0.1, random_state=0)
selector_clf.fit(X2, y)
# prefit=True tells SelectFromModel to reuse the already fitted estimator above.
selector = SelectFromModel(selector_clf, prefit=True)
# NOTE(review): calling fit() on a prefit selector is handled differently across scikit-learn
# versions — confirm it is needed (e.g. for get_feature_names_out) with the installed version.
selector.fit(X2, y)
# Number of features kept by the selector.
selector.get_feature_names_out().size

27241

In [8]:
# Keep only the columns of X2 retained by the fitted selector.
X2_selected = selector.transform(X2)

In [9]:
# First split: 80% train / 20% test, shuffled with a fixed seed and stratified on y so both
# parts keep the same class ratio. Features and labels are passed as float arrays.
X_train, X_test, y_train, y_test = train_test_split( X2_selected.astype(float), y.astype(float),
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=1,
                                                    stratify=y
                                                   )

In [10]:
# Pick the compute device. The name `mps_device` is kept because later cells refer to it,
# but fall back to CUDA or CPU so the notebook also runs where Apple's MPS backend is unavailable.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
elif torch.cuda.is_available():
    mps_device = torch.device("cuda")
else:
    mps_device = torch.device("cpu")

# Convert the split to float32 tensors on the chosen device; targets become an (n, 1) column.
# NOTE: this cell overwrites the NumPy arrays with tensors, so re-running it without first
# re-running the split cell fails (torch.from_numpy only accepts NumPy arrays).
X_train = torch.from_numpy(X_train).float().to(mps_device)
y_train = torch.from_numpy(y_train).float().reshape(-1,1).to(mps_device)
X_test = torch.from_numpy(X_test).float().to(mps_device)
y_test = torch.from_numpy(y_test).float().reshape(-1,1).to(mps_device)

# Shuffled mini-batches, each one tenth of the training set (integer division).
train_dataloader = DataLoader(list(zip(X_train,y_train)), shuffle=True, batch_size=X_train.shape[0]//10)

In [11]:
# Seed torch's global RNG so the random weight initialisation below is repeatable.
torch.manual_seed(1)
# One hidden layer of 500 units and a single output logit; input width = number of selected features.
model = NeuralNetwork( X_train.shape[1], 1, 1, [500] )
# Move the parameters to the device; as the cell's last expression this also displays the module.
model.to(mps_device)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=27241, out_features=500, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=500, out_features=1, bias=True)
  )
  (prob_predictor): Sigmoid()
  (loss): BCEWithLogitsLoss()
)

In [12]:
def train_epochs(n_epochs, learning_rate, weight_decay):
    """Train the notebook-level ``model`` for ``n_epochs`` epochs with Adam.

    Reads the module-level names ``model``, ``train_dataloader``, ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``. A new optimizer is created on every
    call, so optimizer state does not carry over between calls while the model
    weights do.

    Args:
        n_epochs: Number of passes over ``train_dataloader``.
        learning_rate: Learning rate given to ``torch.optim.Adam``.
        weight_decay: Weight decay given to ``torch.optim.Adam``.

    Returns:
        tuple: ``(train_losses, test_losses, train_accuracy, test_accuracy)``,
        four lists with one entry per epoch. Train loss is the mean of the
        per-batch losses; test loss is computed on the full ``X_test``. The
        accuracies are balanced accuracies of the rounded predicted
        probabilities.
    """
    def _balanced_accuracy(features, targets):
        # Round probabilities to hard 0/1 labels, then score on flat NumPy arrays.
        hard_labels = model.predict(features).round()
        hard_labels = hard_labels.cpu().detach().numpy().reshape(-1)
        return balanced_accuracy_score(targets.cpu().detach().numpy().reshape(-1), hard_labels)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    train_losses, test_losses = [], []
    train_accuracy, test_accuracy = [], []
    for epoch in range(n_epochs):
        num_trainbatches = len(train_dataloader)

        # Optimisation pass over all mini-batches.
        model.train()
        batch_losses = []
        for X_batch, y_batch in train_dataloader:
            optimizer.zero_grad()
            batch_loss = model.loss_fn(X_batch, y_batch)
            batch_losses.append(batch_loss.item())
            batch_loss.backward()
            optimizer.step()
        mean_trainloss = sum(batch_losses) / num_trainbatches
        train_losses.append(mean_trainloss)

        # Evaluation pass: no gradients needed.
        model.eval()
        with torch.no_grad():
            testloss = model.loss_fn(X_test, y_test).item()
            test_losses.append(testloss)
            test_accuracy.append(_balanced_accuracy(X_test, y_test))
            train_accuracy.append(_balanced_accuracy(X_train, y_train))
        print(f'Finished epoch {epoch}, latest trainloss {np.round(mean_trainloss,4)}, testloss {np.round( testloss, 4 )}, test accuracy {np.round(test_accuracy[-1],4)}')
    return train_losses, test_losses, train_accuracy, test_accuracy

In [13]:
# Collects the history tuple returned by each train_epochs call on the first split.
trains = []

In [14]:
# Stage 1: large learning rate, no weight decay.
trains += [ train_epochs(n_epochs=10, learning_rate=0.1, weight_decay=0.0) ]

Finished epoch 0, latest trainloss 3.303, testloss 0.3848, test accuracy 0.6414
Finished epoch 1, latest trainloss 0.1229, testloss 0.1132, test accuracy 0.9474
Finished epoch 2, latest trainloss 0.0197, testloss 0.166, test accuracy 0.8125
Finished epoch 3, latest trainloss 0.0052, testloss 0.1104, test accuracy 0.8912
Finished epoch 4, latest trainloss 0.0017, testloss 0.098, test accuracy 0.9093
Finished epoch 5, latest trainloss 0.0007, testloss 0.1164, test accuracy 0.8801
Finished epoch 6, latest trainloss 0.0006, testloss 0.1116, test accuracy 0.8966
Finished epoch 7, latest trainloss 0.0004, testloss 0.1077, test accuracy 0.9047
Finished epoch 8, latest trainloss 0.0004, testloss 0.1084, test accuracy 0.9047
Finished epoch 9, latest trainloss 0.0003, testloss 0.1104, test accuracy 0.9047


In [15]:
# Stage 2: continue training the same model with a smaller learning rate and weight decay 0.01.
trains += [ train_epochs(n_epochs=10, learning_rate=0.001, weight_decay=0.01) ]

Finished epoch 0, latest trainloss 0.0003, testloss 0.1063, test accuracy 0.903
Finished epoch 1, latest trainloss 0.0003, testloss 0.1025, test accuracy 0.9012
Finished epoch 2, latest trainloss 0.0004, testloss 0.0989, test accuracy 0.9012
Finished epoch 3, latest trainloss 0.0004, testloss 0.0957, test accuracy 0.9012
Finished epoch 4, latest trainloss 0.0005, testloss 0.0927, test accuracy 0.9012
Finished epoch 5, latest trainloss 0.0006, testloss 0.0901, test accuracy 0.9012
Finished epoch 6, latest trainloss 0.0007, testloss 0.0877, test accuracy 0.9012
Finished epoch 7, latest trainloss 0.0009, testloss 0.0857, test accuracy 0.8962
Finished epoch 8, latest trainloss 0.001, testloss 0.0839, test accuracy 0.8962
Finished epoch 9, latest trainloss 0.0013, testloss 0.0825, test accuracy 0.8926


In [16]:
# Stage 3: continue again with a still smaller learning rate and weight decay 0.001.
trains += [train_epochs(n_epochs=10, learning_rate=0.0001, weight_decay=0.001)]

Finished epoch 0, latest trainloss 0.0014, testloss 0.083, test accuracy 0.8928
Finished epoch 1, latest trainloss 0.0014, testloss 0.0834, test accuracy 0.8928
Finished epoch 2, latest trainloss 0.0013, testloss 0.0839, test accuracy 0.8928
Finished epoch 3, latest trainloss 0.0013, testloss 0.0843, test accuracy 0.8928
Finished epoch 4, latest trainloss 0.0013, testloss 0.0848, test accuracy 0.8928
Finished epoch 5, latest trainloss 0.0013, testloss 0.0851, test accuracy 0.8912
Finished epoch 6, latest trainloss 0.0013, testloss 0.0855, test accuracy 0.8912
Finished epoch 7, latest trainloss 0.0013, testloss 0.0858, test accuracy 0.8912
Finished epoch 8, latest trainloss 0.0013, testloss 0.0861, test accuracy 0.8912
Finished epoch 9, latest trainloss 0.0013, testloss 0.0864, test accuracy 0.8914


In [17]:
#Now reshuffle the data and re-train as it is likely overfitting to the current training data
# NOTE(review): `model` is not re-initialised before this new split, so it has already been
# trained on most of the rows that now land in the test set. Test loss/accuracy measured after
# this point are therefore not a clean held-out estimate — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split( X2_selected.astype(float), y.astype(float),
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=2,
                                                    stratify=y
                                                   )

In [18]:
# Convert the second split to float32 tensors on the device; targets become an (n, 1) column.
X_train = torch.from_numpy(X_train).float().to(mps_device)
y_train = torch.from_numpy(y_train).float().reshape(-1,1).to(mps_device)
X_test = torch.from_numpy(X_test).float().to(mps_device)
y_test = torch.from_numpy(y_test).float().reshape(-1,1).to(mps_device)

# Rebuild the shuffled loader over the new training rows, ten batches' worth per epoch.
batch_size = X_train.shape[0]//10
train_dataloader = DataLoader(list(zip(X_train,y_train)), shuffle=True, batch_size=batch_size)

In [19]:
# Collects the history tuples of train_epochs calls made on the second split.
finaltrain = []

In [20]:
# Continue training the existing model on the second split (small learning rate, weight decay 0.001).
finaltrain += [ train_epochs(n_epochs=10, learning_rate=0.0001, weight_decay=0.001) ]

Finished epoch 0, latest trainloss 0.0185, testloss 0.0171, test accuracy 0.9831
Finished epoch 1, latest trainloss 0.0183, testloss 0.0169, test accuracy 0.9831
Finished epoch 2, latest trainloss 0.018, testloss 0.0167, test accuracy 0.9831
Finished epoch 3, latest trainloss 0.0178, testloss 0.0166, test accuracy 0.9831
Finished epoch 4, latest trainloss 0.0177, testloss 0.0164, test accuracy 0.9831
Finished epoch 5, latest trainloss 0.0175, testloss 0.0163, test accuracy 0.9831
Finished epoch 6, latest trainloss 0.0173, testloss 0.0161, test accuracy 0.9831
Finished epoch 7, latest trainloss 0.0171, testloss 0.016, test accuracy 0.9831
Finished epoch 8, latest trainloss 0.017, testloss 0.0159, test accuracy 0.9831
Finished epoch 9, latest trainloss 0.0169, testloss 0.0158, test accuracy 0.9831


In [21]:
# Persist the fitted objects in one pickle, in the order
# [vectorizer, selector's logistic regression, feature selector, neural network].
# `pkl` comes from the import cell at the top of the notebook.
# NOTE(review): `model` is pickled as a whole object while still on its training device, and its
# class is defined in this notebook; loading presumably needs the same class definition and a
# compatible device — confirm with whatever consumes this file.
# Only load this file from a trusted source: unpickling can execute arbitrary code.
with open("domain2_nn.mdl", "wb") as f:
    pkl.dump( [tfidf_vectorizer, selector_clf, selector, model], f)