In [1]:
import numpy as np
import json
import torch.nn as nn
import torch
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn import svm


In [2]:
with open('./g2st.txt') as f:
    data = [l for l in f.readlines()]

In [3]:
datapoints = [point.split(':') for point in data]

In [112]:
labels = [int(point[0]) for point in datapoints]
input_long = [json.loads(point[3]) for point in datapoints]
input_short = [json.loads(point[1]) for point in datapoints]

In [5]:
test_pd = pd.DataFrame(input_short)
long_test_pd = pd.DataFrame(input_long)

In [None]:
# long input does not have any NaN
# short input has some NaN in the last column

In [111]:
labels

tensor([0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
        1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 0, 1], device='cuda:0')

## data preprocessing

In [110]:
#np.count_nonzero(test_pd.isna()[6])
test_pd[6] = test_pd[6].fillna(0)
# normalized_test_pd = preprocessing.normalize(test_pd)
# normalized_test_pd = pd.DataFrame(normalized_test_pd, columns=test_pd.columns)
normalized_test_pd = preprocessing.normalize(long_test_pd)
normalized_test_pd = pd.DataFrame(normalized_test_pd, columns=long_test_pd.columns)


In [113]:
X = normalized_test_pd.to_numpy()

In [8]:
#labels

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)

In [10]:
class PolynomialDataset(Dataset):
    def __init__(self, X, Y):
        self.input_data = X
        self.labels = Y

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.input_data[idx], self.labels[idx]

In [11]:
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
train_dataset = PolynomialDataset(X_train, y_train)
test_dataset = PolynomialDataset(X_test, y_test)

In [12]:
X_train.shape

(800000, 7)

In [13]:
batch_size = 100
n_iters = 160000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

In [105]:
# class FeedforwardNeuralNetModel(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(FeedforwardNeuralNetModel, self).__init__()
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim) 
#         self.sigmoid = nn.Sigmoid()
#         self.fc2 = nn.Linear(hidden_dim, output_dim)  

#     def forward(self, x):
#         out = self.fc1(x)
#         out = self.sigmoid(out)
#         out = self.fc2(out)
#         return out
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim) 
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(hidden_dim, output_dim)  
        
        
        # Define batch norm
        self.batch_norm = nn.BatchNorm1d(hidden_dim)

        # Define proportion or neurons to dropout
        self.dropout = nn.Dropout(0.2)


    def forward(self, x):
        out = self.fc1(x)
        #out = self.dropout(out)
        #out = self.batch_norm(out)
        out = self.relu1(out)
        out = self.fc2(out)
        #out = self.dropout(out)
        #out = self.batch_norm(out)
        out = self.relu2(out)
        out = self.fc3(out)
        #out = self.dropout(out)
        #out = self.batch_norm(out)
        out = self.relu3(out)
        out = self.fc4(out)
        return out
    
class RNN(nn.Module):
    def __init__(self, batch_size, n_inputs, n_neurons):
        super(RNN, self).__init__()
        
        self.rnn = nn.RNNCell(n_inputs, n_neurons)
        self.hx = torch.randn(batch_size, n_neurons) # initialize hidden state
        
    def forward(self, X):
        output = []

        # for each time step
        for i in range(2):
            self.hx = self.rnn(X[i], self.hx)
            output.append(self.hx)
        
        return output, self.hx

class CovNet(nn.Module):   
    def __init__(self):
        super(CovNet, self).__init__()

        self.cnn_layers = nn.Sequential(
            # Defining a 2D convolution layer
            nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Defining another 2D convolution layer
            nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.linear_layers = nn.Sequential(
            nn.Linear(4 * 7 * 7, 2)
        )

    # Defining the forward pass    
    def forward(self, x):
        x = self.cnn_layers(x)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x
    
    

In [102]:
input_dim = 7
hidden_dim = 100
output_dim = 2

model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
learning_rate = 0.2

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 
criterion = nn.CrossEntropyLoss()

In [51]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [52]:
model = model.to(device)

In [53]:
iter = 0

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = torch.tensor(inputs).requires_grad_()
        inputs = inputs.to(device)
        # Load images with gradient accumulation capabilities
        #images = images.view(-1, 28*28).requires_grad_()
        labels = labels.to(device)
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model(inputs)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for inputs, labels in test_loader:
                inputs = torch.tensor(inputs).requires_grad_()
                inputs = inputs.to(device)
                # Load images with gradient accumulation capabilities
                #images = images.view(-1, 28*28).requires_grad_()
                labels = labels.to(device)
                # Forward pass only to get logits/output
                outputs = model(inputs)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted == labels).sum()

            accuracy = 100 * float(correct) / float(total)

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))


  """


Iteration: 500. Loss: 0.6656690239906311. Accuracy: 66.8205
Iteration: 1000. Loss: 0.5926105976104736. Accuracy: 69.6605
Iteration: 1500. Loss: 0.6073428988456726. Accuracy: 71.8865
Iteration: 2000. Loss: 0.5559986233711243. Accuracy: 70.698
Iteration: 2500. Loss: 0.5241569876670837. Accuracy: 73.3515
Iteration: 3000. Loss: 0.4888678789138794. Accuracy: 74.4215
Iteration: 3500. Loss: 0.4747890830039978. Accuracy: 74.5055
Iteration: 4000. Loss: 0.4500879645347595. Accuracy: 74.9905
Iteration: 4500. Loss: 0.45948997139930725. Accuracy: 74.79
Iteration: 5000. Loss: 0.50010085105896. Accuracy: 74.852
Iteration: 5500. Loss: 0.49843043088912964. Accuracy: 75.0315
Iteration: 6000. Loss: 0.5296173095703125. Accuracy: 76.3845
Iteration: 6500. Loss: 0.501549482345581. Accuracy: 76.1815
Iteration: 7000. Loss: 0.49102818965911865. Accuracy: 75.9955
Iteration: 7500. Loss: 0.5147882103919983. Accuracy: 76.179
Iteration: 8000. Loss: 0.49469563364982605. Accuracy: 76.9395
Iteration: 8500. Loss: 0.5069

Iteration: 67500. Loss: 0.42426803708076477. Accuracy: 80.398
Iteration: 68000. Loss: 0.491338849067688. Accuracy: 80.201
Iteration: 68500. Loss: 0.3471423089504242. Accuracy: 80.366
Iteration: 69000. Loss: 0.44140616059303284. Accuracy: 80.275
Iteration: 69500. Loss: 0.4585306942462921. Accuracy: 80.138
Iteration: 70000. Loss: 0.4553932845592499. Accuracy: 80.114
Iteration: 70500. Loss: 0.34115487337112427. Accuracy: 80.555
Iteration: 71000. Loss: 0.4786321222782135. Accuracy: 80.316
Iteration: 71500. Loss: 0.4173159897327423. Accuracy: 80.721
Iteration: 72000. Loss: 0.41921550035476685. Accuracy: 80.488
Iteration: 72500. Loss: 0.4515882730484009. Accuracy: 80.623
Iteration: 73000. Loss: 0.42797496914863586. Accuracy: 80.834
Iteration: 73500. Loss: 0.44930049777030945. Accuracy: 80.4655
Iteration: 74000. Loss: 0.3240950107574463. Accuracy: 80.5695
Iteration: 74500. Loss: 0.327170193195343. Accuracy: 80.431
Iteration: 75000. Loss: 0.3896833062171936. Accuracy: 80.554
Iteration: 75500. 

Iteration: 133500. Loss: 0.3756645917892456. Accuracy: 80.736
Iteration: 134000. Loss: 0.41236329078674316. Accuracy: 81.1145
Iteration: 134500. Loss: 0.29130420088768005. Accuracy: 80.8855
Iteration: 135000. Loss: 0.4134744703769684. Accuracy: 81.358
Iteration: 135500. Loss: 0.3195992708206177. Accuracy: 81.1705
Iteration: 136000. Loss: 0.253841370344162. Accuracy: 81.079
Iteration: 136500. Loss: 0.4218124449253082. Accuracy: 80.5195
Iteration: 137000. Loss: 0.3713158369064331. Accuracy: 81.372
Iteration: 137500. Loss: 0.48323681950569153. Accuracy: 80.3315
Iteration: 138000. Loss: 0.3269413709640503. Accuracy: 81.1955
Iteration: 138500. Loss: 0.5085546970367432. Accuracy: 81.27
Iteration: 139000. Loss: 0.49210822582244873. Accuracy: 79.646
Iteration: 139500. Loss: 0.3796399235725403. Accuracy: 80.6245
Iteration: 140000. Loss: 0.4190775752067566. Accuracy: 81.323
Iteration: 140500. Loss: 0.34134727716445923. Accuracy: 80.947
Iteration: 141000. Loss: 0.4400840997695923. Accuracy: 81.07

In [None]:
# 1 hidden layer 50 unit, sigmoid act 
#Iteration: 82000. Loss: 0.5352512001991272. Accuracy: 68.227
# 3 hidden layer 100 unit, sigmoid act 
#Iteration: 82500. Loss: 0.4631064236164093. Accuracy: 79.512

In [109]:
## CovNet


# defining the model
covModel = CovNet()
# defining the optimizer
optimizer = torch.optim.Adam(covModel.parameters(), lr=0.07)
# defining the loss function
criterion = nn.CrossEntropyLoss()
# checking if GPU is available
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()
    
print(model)

FeedforwardNeuralNetModel(
  (fc1): Linear(in_features=7, out_features=100, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=100, out_features=100, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=100, out_features=2, bias=True)
  (batch_norm): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [127]:

def train(epoch):
    model.train()
    tr_loss = 0
    # getting the training set
    x_train, y_train = torch.autograd.Variable(train_x), torch.autograd.Variable(train_y)
    # getting the validation set
    x_val, y_val = torch.autograd.Variable(val_x), torch.autograd.Variable(val_y)
    # converting the data into GPU format
    if torch.cuda.is_available():
        x_train = x_train.cuda()
        y_train = y_train.cuda()
        x_val = x_val.cuda()
        y_val = y_val.cuda()

    # clearing the Gradients of the model parameters
    optimizer.zero_grad()
    
    # prediction for training and validation set
    output_train = model(x_train)
    output_val = model(x_val)

    # computing the training and validation loss
    loss_train = criterion(output_train, y_train)
    loss_val = criterion(output_val, y_val)
    train_losses.append(loss_train)
    val_losses.append(loss_val)

    # computing the updated weights of all the model parameters
    loss_train.backward()
    optimizer.step()
    tr_loss = loss_train.item()
    if epoch%2 == 0:
        # printing the validation loss
        print('Epoch : ',epoch+1, '\t', 'loss :', loss_val)

In [128]:
train_x = torch.from_numpy(X_train)
train_y = torch.from_numpy(np.array(y_train))
val_x = torch.from_numpy(X_test)
val_y = torch.from_numpy(np.array(y_test))
# defining the number of epochs
n_epochs = 10
# empty list to store training losses
train_losses = []
# empty list to store validation losses
val_losses = []
# training the model
for epoch in range(n_epochs):
    train(epoch)

RuntimeError: mat1 dim 1 must match mat2 dim 0

## SVM

In [95]:
clf = svm.SVC(kernel='rbf') # Linear Kernel


In [96]:
chunk_X = X_train[:100000,:]
chunk_Y = y_train[:100000]

In [97]:
#Train the model using the training sets
clf.fit(chunk_X, chunk_Y)

SVC()

In [99]:
X_test_chunk = X_test[:40000,:]
y_pred = clf.predict(X_test_chunk)


In [83]:
len(y_test)

200000

In [100]:
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test[:40000], y_pred))


Accuracy: 0.723625


In [142]:
tokenizer = AutoTokenizer.from_pretrained("tbs17/MathBERT-custom")

model = AutoModelForMaskedLM.from_pretrained("tbs17/MathBERT-custom")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=569.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440514422.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at tbs17/MathBERT-custom were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [129]:
#train model taking both inputs 
#combine short AND long.
#histogram of data as input
# polynomial SVM