In [1]:
import json
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.base import TransformerMixin
from torch import nn
from torch.utils.data import DataLoader

In [2]:
class NeuralNetwork(nn.Module):
    def __init__(self, n_inputs, n_outputs = 1, n_layers = 1, n_neurons_per_layer = None, leaky_slope = 0.01):
        if n_layers > 0:
            try:
                assert n_neurons_per_layer is not None
                assert len(n_neurons_per_layer) == n_layers
            except Exception as e:
                raise ValueError("provide numbers of neurons per layer defined by n_layers")
        
        super().__init__()
        self.flatten = nn.Flatten()
        if n_layers >= 1:
            stack = [ nn.Linear( n_inputs, n_neurons_per_layer[0] ) ]
            for k in range(n_layers-1):
                stack += [ nn.LeakyReLU(leaky_slope), nn.Linear(n_neurons_per_layer[k], n_neurons_per_layer[k+1]) ]
            stack += [ nn.LeakyReLU(leaky_slope), nn.Linear(n_neurons_per_layer[n_layers-1], n_outputs) ]
        else:
            stack = [ nn.Linear( n_inputs, n_outputs ) ]
        
        self.linear_relu_stack = nn.Sequential( *stack )
        self.prob_predictor = nn.Sigmoid()
        self.loss = nn.BCEWithLogitsLoss()
        for name, param in self.named_parameters():
            param.data.uniform_(-0.5,0.5)
            
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
    
    def predict(self, x):
        logits = self.forward(x)
        return self.prob_predictor(logits)
    
    def loss_fn(self, x, y):
        logits = self.forward(x)
        return self.loss(logits, y)

In [3]:
def load_dataset(filepath):
    with open(filepath, "r") as f:
        dataset = [ json.loads(line, parse_int = str) for line in f ]
    return dataset

def get_vectorizer( texts: list[str], *, method="countvectorize", **kwargs ) -> CountVectorizer:
    """From a list of texts, output an appropriate vectorizer either using CountVectorizer or TF-IDF depending on method argument. 

     Args:
         texts (list[str]): list of strings, each item corresponding to a text.
         method (str, optional): Method to select features. Defaults to "count-vectorizer".
         **kwargs: kwarg arguments to pass to Vectorizer classes of sklearn.
    Raises:
        ValueError: If passing an non-specified method of text feature extraction

     Returns:
         pd.DataFrame: dataframe of shape (n_samples, n_features)
    """
    #We want single digits to be tokenized. This regex considers everything as a token except whitespace.
    kwargs['token_pattern'] = r'\S+' 
    if method == "countvectorize":
        vectorizer = CountVectorizer(**kwargs)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(**kwargs)
    else:
        raise ValueError(f"{method} is not a supported method.")
    #Use texts to initialize vocabulary of vectorizer
    vectorizer.fit(texts)
    return vectorizer

def sentencify(text: list) -> str:
    sentence = " ".join(text) 
    return sentence

def loss(clf, X, y):
    probs = clf.predict_log_proba(X)
    y0 = probs[:,1]
    y1 = probs[:,0]
    loss = -y*y0 - (1-y)*y1
    loss = loss.sum()/y.size
    return loss

def balanced_acc(clf, X, y):
    y_pred = clf.predict(X)
    return balanced_accuracy_score(y, y_pred, adjusted=True)

class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X, y=None, **fit_params):
        return X.toarray()

In [4]:
dataset_1 = load_dataset("domain1_train_data.json")
datatexts_1 = [ sentencify(instance['text']) for instance in dataset_1 ]

dataset_2 = load_dataset("domain2_train_data.json")
datatexts_2 = [ sentencify(instance['text']) for instance in dataset_2 ]

In [5]:
tfidf_vectorizer = get_vectorizer( texts = datatexts_1 + datatexts_2,
                                    method='tfidf',
                                    use_idf=True,
                                    ngram_range=(1,3),
                                    max_df=0.995, #Ignore vocabulary appearing too frequently, probably words like "is", "are", "and", "this" etc.
                                    min_df=10, #Ignore vocabulary that is too infrequent, as this may lead to low prediction accuracy.
                                    )
print(f"no features: {tfidf_vectorizer.get_feature_names_out().size}")

no features: 110827


In [6]:
labels = {0:"domain1_ai",
          1:"domain1_human",
          2:"domain2_ai",
          3:"domain2_human"}

In [7]:
X = tfidf_vectorizer.transform( datatexts_1 + datatexts_2 ).toarray()
y = np.array([1]*2500 + [0]*2500 + [3]*1500 + [2]*11500)
lengths = np.zeros( (18000, 2) )
for i in range(5000):
    for k in range(2):
        lengths[i,k] = len(dataset_1[i]['text'])**(k+1)
for i in range(13000):
    for k in range(2):
        lengths[5000+i,k] = len(dataset_2[i]['text'])**(k+1)
lengths[:,0] /= lengths.max(axis=0)[0]
lengths[:,1] /= lengths.max(axis=0)[1]
X = np.hstack( (X, lengths) )

In [8]:
y_vector = np.zeros((18000, 4))
y_vector[(np.arange(18000), y)] = 1

In [9]:
import pickle as pkl
with open("temp_selector_clf.mdl", "rb") as f:
    selector_clf = pkl.load(f)

In [10]:
selector = SelectFromModel(selector_clf, prefit=True)
selector.fit(X, y)
selector.get_feature_names_out().size

26799

In [11]:
X = selector.transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split( X.astype(float), y_vector.astype(float),
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=1,
                                                    stratify=y
                                                   )

In [13]:
mps_device = torch.device("mps")
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float()
X_train = X_train.to(mps_device)
y_train = y_train.to(mps_device)
X_test = X_test.to(mps_device)
y_test = y_test.to(mps_device)
train_dataloader = DataLoader(list(zip(X_train,y_train)), shuffle=True, batch_size=X_train.shape[0]//10)

In [14]:
torch.manual_seed(1)
model = NeuralNetwork( X_train.shape[1], 4, 2, [500, 100] )
model.prob_predictor = nn.Softmax(dim=1)
model.loss = nn.CrossEntropyLoss()
model.to(mps_device)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=26799, out_features=500, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=500, out_features=100, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=100, out_features=4, bias=True)
  )
  (prob_predictor): Softmax(dim=1)
  (loss): CrossEntropyLoss()
)

In [15]:
def train_epochs(n_epochs, learning_rate, weight_decay):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    train_losses = []
    test_losses = []
    train_accuracy = []
    test_accuracy = []
    for epoch in range(n_epochs):
        mean_trainloss = 0
        num_trainbatches = len(train_dataloader)
        
        #Begin training
        model.train()
        for X_batch, y_batch in train_dataloader:
            optimizer.zero_grad()
            loss = model.loss_fn(X_batch, y_batch)
            mean_trainloss += loss.item()
            loss.backward()
            optimizer.step()
        mean_trainloss /= num_trainbatches
        train_losses += [ mean_trainloss ]
        
        #Begin evaluation
        model.eval()
        with torch.no_grad():
            testloss = model.loss_fn(X_test, y_test).item()
            test_losses += [testloss]
            y_pred = model.predict(X_test).round()
            y_pred = y_pred.cpu().detach().numpy().reshape(-1)
            accuracy = balanced_accuracy_score(y_test.cpu().detach().numpy().reshape(-1), y_pred)
            test_accuracy += [accuracy]
            
            y_pred = model.predict(X_train).round()
            y_pred = y_pred.cpu().detach().numpy().reshape(-1)
            accuracy = balanced_accuracy_score(y_train.cpu().detach().numpy().reshape(-1), y_pred)
            train_accuracy += [accuracy]
        print(f'Finished epoch {epoch}, latest trainloss {np.round(mean_trainloss,4)}, testloss {np.round( testloss, 4 )}, test accuracy {np.round(test_accuracy[-1],4)}')
    return train_losses, test_losses, train_accuracy, test_accuracy

In [16]:
trains = []

In [17]:
trains += [ train_epochs(n_epochs=10, learning_rate=0.1, weight_decay=0.0) ]

Finished epoch 0, latest trainloss 19.6509, testloss 0.7277, test accuracy 0.8461
Finished epoch 1, latest trainloss 1.5488, testloss 1.2683, test accuracy 0.8863
Finished epoch 2, latest trainloss 0.6891, testloss 0.3336, test accuracy 0.915
Finished epoch 3, latest trainloss 0.151, testloss 0.2387, test accuracy 0.9394
Finished epoch 4, latest trainloss 0.0495, testloss 0.2337, test accuracy 0.9539
Finished epoch 5, latest trainloss 0.0151, testloss 0.2824, test accuracy 0.9475
Finished epoch 6, latest trainloss 0.0058, testloss 0.2942, test accuracy 0.9523
Finished epoch 7, latest trainloss 0.0028, testloss 0.3004, test accuracy 0.9542
Finished epoch 8, latest trainloss 0.0021, testloss 0.3196, test accuracy 0.9547
Finished epoch 9, latest trainloss 0.0013, testloss 0.3378, test accuracy 0.9541


In [18]:
trains += [ train_epochs(n_epochs=20, learning_rate=0.01, weight_decay=0.001) ]

Finished epoch 0, latest trainloss 0.0009, testloss 0.3444, test accuracy 0.9506
Finished epoch 1, latest trainloss 0.001, testloss 0.3567, test accuracy 0.9487
Finished epoch 2, latest trainloss 0.0014, testloss 0.3376, test accuracy 0.9503
Finished epoch 3, latest trainloss 0.0018, testloss 0.3464, test accuracy 0.9474
Finished epoch 4, latest trainloss 0.0022, testloss 0.3429, test accuracy 0.9486
Finished epoch 5, latest trainloss 0.0025, testloss 0.306, test accuracy 0.9494
Finished epoch 6, latest trainloss 0.0029, testloss 0.3147, test accuracy 0.9492
Finished epoch 7, latest trainloss 0.0037, testloss 0.3107, test accuracy 0.948
Finished epoch 8, latest trainloss 0.0041, testloss 0.2815, test accuracy 0.9494
Finished epoch 9, latest trainloss 0.0048, testloss 0.2569, test accuracy 0.9497
Finished epoch 10, latest trainloss 0.0054, testloss 0.2517, test accuracy 0.9507
Finished epoch 11, latest trainloss 0.0062, testloss 0.2649, test accuracy 0.9482
Finished epoch 12, latest tra

In [19]:
trains += [ train_epochs(n_epochs=50, learning_rate=0.001, weight_decay=0.001) ]

Finished epoch 0, latest trainloss 0.0141, testloss 0.2151, test accuracy 0.9481
Finished epoch 1, latest trainloss 0.0132, testloss 0.2081, test accuracy 0.9491
Finished epoch 2, latest trainloss 0.0136, testloss 0.2077, test accuracy 0.9486
Finished epoch 3, latest trainloss 0.014, testloss 0.204, test accuracy 0.9495
Finished epoch 4, latest trainloss 0.0144, testloss 0.2097, test accuracy 0.947
Finished epoch 5, latest trainloss 0.0146, testloss 0.2065, test accuracy 0.9486
Finished epoch 6, latest trainloss 0.0148, testloss 0.2083, test accuracy 0.9484
Finished epoch 7, latest trainloss 0.0148, testloss 0.2112, test accuracy 0.9483
Finished epoch 8, latest trainloss 0.0152, testloss 0.2174, test accuracy 0.9463
Finished epoch 9, latest trainloss 0.015, testloss 0.2037, test accuracy 0.9491
Finished epoch 10, latest trainloss 0.0156, testloss 0.2184, test accuracy 0.9462
Finished epoch 11, latest trainloss 0.0154, testloss 0.2089, test accuracy 0.9483
Finished epoch 12, latest trai

In [20]:
#Now reshuffle the data and re-train as it is likely overfitting to the current training data
X_train, X_test, y_train, y_test = train_test_split( X.astype(float), y_vector.astype(float),
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=2,
                                                    stratify=y
                                                   )

In [21]:
X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test).float()
X_train = X_train.to(mps_device)
y_train = y_train.to(mps_device)
X_test = X_test.to(mps_device)
y_test = y_test.to(mps_device)
train_dataloader = DataLoader(list(zip(X_train,y_train)), shuffle=True, batch_size=X_train.shape[0]//10)

In [22]:
final_train = train_epochs(n_epochs=20, learning_rate=0.0001, weight_decay=0.001) 

Finished epoch 0, latest trainloss 0.0561, testloss 0.0478, test accuracy 0.9916
Finished epoch 1, latest trainloss 0.0505, testloss 0.0489, test accuracy 0.9909
Finished epoch 2, latest trainloss 0.0463, testloss 0.0492, test accuracy 0.9906
Finished epoch 3, latest trainloss 0.043, testloss 0.0495, test accuracy 0.9909
Finished epoch 4, latest trainloss 0.0401, testloss 0.0504, test accuracy 0.9903
Finished epoch 5, latest trainloss 0.0375, testloss 0.0511, test accuracy 0.9901
Finished epoch 6, latest trainloss 0.0354, testloss 0.0516, test accuracy 0.9901
Finished epoch 7, latest trainloss 0.0335, testloss 0.0523, test accuracy 0.9901
Finished epoch 8, latest trainloss 0.0319, testloss 0.0529, test accuracy 0.9903
Finished epoch 9, latest trainloss 0.0305, testloss 0.0537, test accuracy 0.9904
Finished epoch 10, latest trainloss 0.0293, testloss 0.054, test accuracy 0.9903
Finished epoch 11, latest trainloss 0.0282, testloss 0.0553, test accuracy 0.9896
Finished epoch 12, latest tr

In [24]:
import pickle as pkl
with open("domain12_nn.mdl", "wb") as f:
    pkl.dump( [tfidf_vectorizer, selector, model], f )