# Model Evaluation

### Setting up Necessary Information to Import the Trained Model

Before importing the trained model, we will need to make sure this notebook is ready to run the model.  Running the following cells will do so.

In [613]:
import pandas as pd

import argparse
import json
import logging
import os
import itertools

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed

from typing import List
import numpy as np 

## Set up logger to get details of errors
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

Make sure to make the following parameters match those from when the specified model was trained.

In [644]:
# k is for number of nodes in each hidden layer of NN
k = 5000

# For number of inputs (32 binary digits)
input_length = 32
output_length = input_length

# Model Parameters
epochs = 50
batch_size = 256
lr = 0.001
momentum = 0.9
var_weight = 0.0
layers = 4

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = 'noisy' + str(layers) + 'layers_' + str(epochs) + 'epochs_' + str(k) + 'nodes_' + str(batch_size) + 'batch_size_' + str(lr) + 'lr_' + str(var_weight) + 'var_weight'

In [645]:
if layers == 3:
    class Generator(nn.Module):
        def __init__(self, output_length: int):
            super(Generator, self).__init__()
            self.dense_layer = nn.Linear(output_length, k)
            self.dense_layer2 = nn.Linear(k, k)
            self.dense_layer3 = nn.Linear(k, output_length)

        def forward(self, x):
            l1 = self.dense_layer(x)
            l2 = self.dense_layer2(F.relu(l1))
            l3 = self.dense_layer3(F.relu(l2))
            return F.sigmoid(l3)
elif layers == 4:
    class Generator(nn.Module):
        def __init__(self, output_length: int):
            super(Generator, self).__init__()
            self.dense_layer = nn.Linear(output_length, k)
            self.dense_layer2 = nn.Linear(k, k)
            self.dense_layer3 = nn.Linear(k, k)
            self.dense_layer4 = nn.Linear(k, output_length)

        def forward(self, x):
            l1 = self.dense_layer(x)
            l2 = self.dense_layer2(F.relu(l1))
            l3 = self.dense_layer3(F.relu(l2))
            l4 = self.dense_layer4(F.relu(l3))
            return F.sigmoid(l4)
elif layers == 5:
        class Generator(nn.Module):
            def __init__(self, output_length: int):
                super(Generator, self).__init__()
                self.dense_layer = nn.Linear(output_length, k)
                self.dense_layer2 = nn.Linear(k, k)
                self.dense_layer3 = nn.Linear(k, k)
                self.dense_layer4 = nn.Linear(k, k)
                self.dense_layer5 = nn.Linear(k, output_length)

            def forward(self, x):
                l1 = self.dense_layer(x)
                l2 = self.dense_layer2(F.relu(l1))
                l3 = self.dense_layer3(F.relu(l2))
                l4 = self.dense_layer4(F.relu(l3))
                l5 = self.dense_layer5(F.relu(l4))
                return F.sigmoid(l5)

In [646]:
generator = Generator(output_length)
generator = generator.to(device)

### Load in Trained Model

In this step we have to specify the correct model for the 'model_name' variable below.  A list of available models should be available in the 'Trained_Models' folder.

In [647]:
# Make sure to select the correct model
## Model Name defined above with parameters
#model_name = '4layers_100epochs_1000nodes_256batch_size_0.001lr_0.1var_weight'
model_path = './Trained_Models/generator_' + model_name + '.pt'
#model_path = './Trained_Models/generator.pt'
curves_path = './Generated_Curves/' + model_name

# Load in model
generator.load_state_dict(torch.load(model_path)["generator_state_dict"])

<All keys matched successfully>

### Create Noise

This will be the input for the trained generator model.  Having a noisy input will insure that the model has opportunities to generate a variety of different curves.

In [666]:
noise = torch.randint(0, 2, size=(batch_size, output_length)).float().to(device)

### Define function that converts to decimal

The raw output of the generator is list of binary-represented numbers.  This function is able to take in that list and convert the numbers from binary to decimal representation.  

In [667]:
# This function takes in decimal numbers between 0 and 1
def extract(G_of_noise):

    G_numpy = G_of_noise.detach()   

    curves = []

    for i in range(len(G_numpy)):
        c1 = int(G_numpy[i][0].round())
        c2 = ((-1)**(int(G_numpy[i][1].round())))*(int(G_numpy[i][2].round()))
        c3 = int(G_numpy[i][3].round())
        c4 = (-1)**(int(G_numpy[i][4].round()))*(int("".join([str(int(y)) for y in G_numpy[i][5:18].round()]), 2))
        c6 = (-1)**(int(G_numpy[i][12].round()))*(int("".join([str(int(y)) for y in G_numpy[i][19:].round()]), 2))
    
        coef = [c1,c2,c3,c4,c6]
        curves.append(coef)
    return curves

# This function takes in binary digits (0 or 1)
def extract_whole_numbers(G_of_noise):

    G_numpy = G_of_noise  

    curves = []

    for i in range(len(G_numpy)):
        c1 = int(G_numpy[i][0])
        c2 = ((-1)**(int(G_numpy[i][1])))*(int(G_numpy[i][2]))
        c3 = int(G_numpy[i][3])
        c4 = (-1)**(int(G_numpy[i][4]))*(int("".join([str(int(y)) for y in G_numpy[i][5:18]]), 2))
        c6 = (-1)**(int(G_numpy[i][12]))*(int("".join([str(int(y)) for y in G_numpy[i][19:]]), 2))
    
        coef = [c1,c2,c3,c4,c6]
        curves.append(coef)
    return curves

### Generate Results

Now we can input the noisy data into the generator to get a sample of curves.  This is saved under the variable 'listcurves'.

In [668]:
listcurves = extract(generator(noise))



We are mainly interested in unique curves that the generator produces, thus we can filter this list to just contain unique curves.  This is saved under the variable 'unique_curves'.  It is also interesting to know how many of the curves are unique and similarly what percent of the curves are unique.  In this cell we also print out these statistics.

In [669]:
listcurves.sort()
unique_curves = list(listcurves for listcurves,_ in itertools.groupby(listcurves))
print("Number of unique curves generated: {}".format(len(unique_curves)))
print("Percent of generated curves that are unique: {}".format(str(round(len(unique_curves)/batch_size * 100, 2)) + '%'))

Number of unique curves generated: 256
Percent of generated curves that are unique: 100.0%


In a later code cell we will have to call back to these curves as a list inside a txt file.  Therefore, it is important to save these curves.  Saving the curves as a txt file also allows us to revisit the results of this model down the line if we choose to do so.

In [670]:
# Create file that holds the list of unique curves
with open(curves_path + '.txt', "w") as output:
    output.write(str(unique_curves))
    
# Create a list file that is magma compatable
listcurves_magma = "listcurves := " + str(unique_curves) + ";"
with open(curves_path + '_magma.txt', "w") as output:
    output.write(str(listcurves_magma))

### Are the curves in the original dataset?

Another interesting statistic is whether these curves appear in the original curves dataset that we trained the model on.  

In the following cell we import that very dataset.

In [671]:
coef_df_binary = pd.read_csv("https://raw.githubusercontent.com/jcox22/Sagemaker_practice_gan/main/rank_1_curves.csv")
coef_df_binary = coef_df_binary.drop(columns = ['Unnamed: 0'])

Now we can cross check our generated curves to see if they appear in that dataset.  The output of this cell will tell us that information.

In [672]:
all_curves = extract_whole_numbers(coef_df_binary.to_numpy().tolist())

repeated_curves = 0
new_curves = 0
total_curves = len(unique_curves)


for curve in unique_curves:
    if curve in all_curves:
        repeated_curves += 1
    else:
        new_curves +=1
print("Number of repeated curves from original dataset: {}".format(repeated_curves))
print("Number of unique curves in the generated dataset: {}".format(new_curves))
print("Percent of generated curves that are new: {}".format((new_curves/total_curves)*100))

Number of repeated curves from original dataset: 0
Number of unique curves in the generated dataset: 256
Percent of generated curves that are new: 100.0


### Check Rank of Curves

Finally, we can take a look at the most important feature of our model: the rank of the curves it generates. The following code uses Magma Calculator to calculate the rank of the elliptic curves.  This step may take some time.

# MAKE SURE YOU CHANGE CURVES PATH IN BELOW CELL

In [676]:
%%capture output
%%bash

magma -b -s ./Generated_Curves/noisy4layers_50epochs_5000nodes_256batch_size_0.001lr_0.0var_weight_magma.txt

for curve in listcurves do

    P<t> := PolynomialRing(Rationals());

    e := EllipticCurve(curve);
    r := RankBounds(e);
    r;

end for;

Once the calculater is finished, we can take a look at the results.  The following code does a few things.
First, it takes in the output of the calculator and filters it to just the intergers (the ranks of the curves).  This is saved as a list called 'ranks'. 
Second, it saves this ranks list as a text file under the 'Generated_Curves' folder. 
Third, it prints out the percentage of curves that were generated that fall into three buckets: rank 0, rank 1, and greater than rank 1.

In [677]:
output = str(output)

ranks = []
for c in output:
    try:
        ranks.append(int(c))
    except ValueError:
        pass
    
## Save Ranks
with open(curves_path + '_ranks.txt', "w") as output:
    output.write(str(ranks))    
    
print("Percent of generated curves that are rank 0: {}".format(str(round(ranks.count(0)/len(ranks)*100, 2)) + '%'))
print("Percent of generated curves that are rank 1: {}".format(str(round(ranks.count(1)/len(ranks)*100, 2)) + '%'))
print("Percent of generated curves that are greater than rank 1: {}".format(str(round((len(ranks) - (ranks.count(0) + ranks.count(1)))/len(ranks)*100, 2)) + '%'))    

Percent of generated curves that are rank 0: 27.73%
Percent of generated curves that are rank 1: 47.27%
Percent of generated curves that are greater than rank 1: 25.0%


##### Now we know the results of our model

###  Checking Archived Results

Run the following code to get results from past models.  Make sure to change the name of the file to the name you wish to see the results of.

In [610]:
## Change the file name below to the desired file

with open('./Generated_Curves/noisy4layers_100epochs_1000nodes_256batch_size_0.001lr_1.0var_weight_ranks.txt') as f:
    curves_string = f.read()   
    
curve_ranks = curves_string.strip('][').split(', ')

print("Percent of generated curves that are rank 0: {}".format(str(round(curve_ranks.count('0')/len(curve_ranks)*100, 2)) + '%'))
print("Percent of generated curves that are rank 1: {}".format(str(round(curve_ranks.count('1')/len(curve_ranks)*100, 2)) + '%'))
print("Percent of generated curves that are greater than rank 1: {}".format(str(round((len(curve_ranks) - (curve_ranks.count('0') + curve_ranks.count('1')))/len(curve_ranks)*100, 2)) + '%'))    

Percent of generated curves that are rank 0: 29.67%
Percent of generated curves that are rank 1: 48.35%
Percent of generated curves that are greater than rank 1: 21.98%
