# Analysis for cfg2vec applied to code workout dataset

This analysis will attempt to train a model that can be
used to predict the grade a student will get on a
programming assignment given only their code

In [3]:
import numpy as np

# Load in the environment variables from the .env file containing DB info
from dotenv import load_dotenv

loaded = load_dotenv('../.env')

In [4]:
# Specify the different types of embedding methods that could have been used
# in embedding the nodes
import itertools

embedding_methods = [
    embedding_method
    for embedding_method
    in itertools.product(
        ['deepwalk', 'node2vec'],
        ['FC', 'VC', 'LC', 'NC']
    )
]

In [5]:
import torch
from torch import nn
import logging

## Graph Embedding Layer

### Input

The input will be a `batch size x max number of graphs x max number of nodes x length of node embedding` tensor

### Output

The output will be a `batch size x max number of graphs x length of node embedding` tensor representing the graph embedding

### Layers

#### Weight Generation

This will be a Linear or Dense Layer

##### Input

The input will be a `batch size x max number of graphs x max number of nodes x length of node embedding` tensor

##### Output

The output will be a `max number of nodes x 1` tensor

##### Description

The weight generation layer will be used to train up an attention matrix, and will have an inner tensor of size `length of node embedding x 1`

#### Softmax Layer

##### Input

The input will be a `max number of nodes x 1` tensor

##### Output

The output will be a `max number of nodes x 1` tensor

##### Description

This will normalize the results from the weight generation layer into a probability distribution.

#### Dot Product Layer

##### Input

The input will be a `batch size x max number of graphs x max number of nodes x length of node embedding` tensor and a `max number of nodes x 1` tensor

##### Operations

1. i1 will be transposed across dim 2 and 3
2. i1 will be matmul'ed with i2
3. i1 will be reshaped to remove the dangling dimension

##### Output

The output will be a `batch size x max number of graphs x length of node embedding` tensor


In [6]:
class GraphEmbedding(nn.Module):
    def __init__(self, node_embedding_length: int):
        super(GraphEmbedding, self).__init__()
        self.get_weights = nn.Linear(node_embedding_length, 1)
        # Chose dim = 2 because we want the prob_dist to sum to 1 for each of the graphs
        self.make_prob_dist = nn.Softmax(dim = 2)

    # batch dims: batch size
    #             x max number of graphs
    #             x max number of nodes
    #             x length of node embedding
    def forward(self, batch):
        # weights dims: max number of nodes x 1
        logging.debug(f'self.get_weights.weight: {self.get_weights.weight}')
        weights = self.get_weights(batch)
        logging.debug(f'weights.shape: {weights.shape}')
        # prob_dist dims: max number of nodes x 1
        prob_dist = self.make_prob_dist(weights)
        logging.debug(f'prob_dist.shape: {prob_dist.shape}')

        # transposed dims: batch size
        #                  x max number of graphs
        #                  x length of node embedding
        #                  x max number of nodes
        num_dimensions = len(batch.shape)
        transposed = torch.transpose(batch, num_dimensions - 2, num_dimensions - 1)
        logging.debug(f'transposed.shape: {transposed.shape}')

        # returned dims: batch size
        #                x max number of graphs
        #                x length of node embedding
        graph_embeddings = torch.matmul(transposed, prob_dist).squeeze()
        logging.debug(f'graph_embeddings.shape: {graph_embeddings.shape}')
        return graph_embeddings


## Program Embedding Layer

### Input

The input will be a `batch size x max number of graphs x max number of nodes x length of node embedding` tensor

### Output

The output will be a `batch size x length of node embedding` tensor

### Layers

#### Graph Embedding Layer

##### Input

The input will be a `batch size x max number of graphs x max number of nodes x length of node embedding` tensor

##### Output

The output will be a `batch size x max number of graphs x length of node embedding` tensor

#### Weight Generation Layer

##### Input

The input will be a `batch size x max number of graphs x length of node embedding` tensor

##### Output

The output will be a `max number of graphs x 1` tensor

##### Description

This layer will generate weights for each programs' graph embeddings so that we can sum them up, and will have an inner matrix of size `length of node embedding x 1`

#### Softmax Layer

##### Input

The input will be a `max number of graphs x 1` tensor

##### Output

The output will be a `max number of graphs x 1` tensor

##### Description

This layer will be used to convert the graph weights to a probability distribution

#### Input dot product

##### Input

The input will be a `batch size x max number of graphs x length of node embedding` tensor and a `max number of graphs x 1` tensor

##### Operations

1. i1 will be transposed across dim 1 and 2
2. i1 will be matmul'ed with i2
3. i1 will be reshaped to remove the dangling dimension

##### Output

The output will be a `batch size x length of node embedding` tensor

In [7]:
class CfgProgramEmbedding(nn.Module):
    def __init__(self, node_embedding_length: int):
        super(CfgProgramEmbedding, self).__init__()
        self.embed_graphs = GraphEmbedding(node_embedding_length)
        self.get_weights = nn.Linear(node_embedding_length, 1)
        # Chose dim = 1 because we want to have the prob_dist sum to 1 for each of the graphs
        self.make_prob_dist = nn.Softmax(dim = 1)

    # batch dims: batch size
    #             x max number of graphs
    #             x max number of nodes
    #             x length of node embedding
    def forward(self, batch):
        # graph_embeddings dims: batch size
        #                        x max number of graphs
        #                        x length of node embedding
        graph_embeddings = self.embed_graphs(batch)
        logging.debug(f'graph_embeddings.shape: {graph_embeddings.shape}')

        # weights dims: max number of graphs x 1
        weights = self.get_weights(graph_embeddings)
        logging.debug(f'weights.shape: {weights.shape}')
        # prob_dist dims: max number of graphs x 1
        prob_dist = self.make_prob_dist(weights)
        logging.debug(f'prob_dist.shape: {prob_dist.shape}')

        # transposed dims: batch size
        #                  x length of node embedding
        #                  x max number of graphs
        num_dimensions = len(graph_embeddings.shape)
        transposed = torch.transpose(graph_embeddings, num_dimensions - 2, num_dimensions - 1)
        logging.debug(f'transposed.shape: {transposed.shape}')

        # program_embeddings dims:  batch size
        #                           x length of node embedding
        program_embeddings = torch.matmul(transposed, prob_dist).squeeze()
        logging.debug(f'program_embeddings.shape: {program_embeddings.shape}')
        return program_embeddings

class Code2VecProgramEmbedding(nn.Module):
    def __init__(self, method_embedding_length):
        super(Code2VecProgramEmbedding, self).__init__()
        self.get_weights = nn.Linear(method_embedding_length, 1)
        # Chose dim = 1 because we want to have the prob_dist sum to 1 for each of the graphs
        self.make_prob_dist = nn.Softmax(dim = 1)

    def forward(self, batch):
        # weights dims: max number of graphs x 1
        weights = self.get_weights(batch)
        logging.debug(f'weights.shape: {weights.shape}')
        # prob_dist dims: max number of graphs x 1
        prob_dist = self.make_prob_dist(weights)
        logging.debug(f'prob_dist.shape: {prob_dist.shape}')

        # transposed dims: batch size
        #                  x length of graph embedding
        #                  x max number of graphs
        num_dimensions = len(batch.shape)
        transposed = torch.transpose(batch, num_dimensions - 2, num_dimensions - 1)
        logging.debug(f'transposed.shape: {transposed.shape}')

        # program_embeddings dims:  batch size
        #                           x length of graph embedding
        program_embeddings = torch.matmul(transposed, prob_dist).squeeze()
        logging.debug(f'program_embeddings.shape: {program_embeddings.shape}')
        return program_embeddings





## Baseline Naive Model

### Input

The input will be a `batch size x max number of graphs x max number of nodes x length of node embedding` tensor

### Output

The output will be a `program batch size x 1` tensor with a predicted score for each program

### Layers

#### Program Embedding Layer

##### Input

The input will be a `batch size x max number of graphs x max number of nodes x length of node embedding` tensor

##### Output

The output will be a `batch size x length of node embedding` tensor

##### Description

This will generate program embeddings for each student program

#### Prediction Layer

##### Inputs

The input will be a `batch size x length of node embedding` tensor

##### Outputs

The output will be a `batch size x 1` tensor


In [8]:
class CodeWorkoutPredictor(nn.Module):
    def __init__(self, node_embedding_length: int):
        super(CodeWorkoutPredictor, self).__init__()
        self.embed_programs = CfgProgramEmbedding(node_embedding_length)
        self.predict_grades = nn.Linear(node_embedding_length, 1)

    # batch dims: batch size
    #             x max number of graphs
    #             x max number of nodes
    #             x length of node embedding
    def forward(self, batch):
        # program_embeddings dims:  batch size
        #                           x length of node embedding
        program_embeddings = self.embed_programs(batch)
        logging.debug(f'program_embeddings.shape: {program_embeddings.shape}')

        # returned dims: batch size
        predictions = self.predict_grades(program_embeddings).squeeze()
        logging.debug(f'predictions.shape: {predictions.shape}')
        return predictions

In [9]:
class CfgMisconceptionClassifier(nn.Module):
    def __init__(self, node_embedding_length: int):
        super(CfgMisconceptionClassifier, self).__init__()
        self.embed_programs = CfgProgramEmbedding(node_embedding_length)
        self.classify_misconceptions = nn.Sequential(nn.Linear(node_embedding_length, 20), nn.ReLU(), nn.Linear(20, 11), nn.Sigmoid())

    # batch dims: batch size
    #             x max number of graphs
    #             x max number of nodes
    #             x length of node embedding
    def forward(self, batch):
        # program_embeddings dims:  batch size
        #                           x length of node embedding
        program_embeddings = self.embed_programs(batch)
        logging.debug(f'program_embeddings.shape: {program_embeddings.shape}')

        # returned dims: batch size
        classifications = self.classify_misconceptions(program_embeddings).squeeze()
        logging.debug(f'predictions.shape: {classifications.shape}')
        return classifications

class Code2VecMisconceptionClassifier(nn.Module):
    def __init__(self, method_embedding_length: int):
        super(Code2VecMisconceptionClassifier, self).__init__()
        self.embed_programs = CfgProgramEmbedding(method_embedding_length)
        self.classify_misconceptions = nn.Sequential(nn.Linear(method_embedding_length, 20), nn.ReLU(), nn.Linear(20, 11), nn.Sigmoid())

    # batch dims: batch size
    #             x max number of graphs
    #             x max number of nodes
    #             x length of node embedding
    def forward(self, batch):
        # program_embeddings dims:  batch size
        #                           x length of node embedding
        program_embeddings = self.embed_programs(batch)
        logging.debug(f'program_embeddings.shape: {program_embeddings.shape}')

        # returned dims: batch size
        classifications = self.classify_misconceptions(program_embeddings).squeeze()
        logging.debug(f'predictions.shape: {classifications.shape}')
        return classifications

# Load Data

In [10]:
from psycopg import Connection, connect as db_connect
from dataclasses import dataclass
from psycopg.rows import namedtuple_row
from gensim.models import Word2Vec
import torch
from typing import Optional
import torch.utils.data
import itertools

device = 'cuda' if torch.cuda.is_available() else 'cpu'
logging.info(f'Using {device} device')

torch.multiprocessing.set_start_method('spawn')

In [11]:
DROP_IF_EXISTS_MAT_VIEW = 'drop materialized view if exists all_program_nodes'
CREATE_MAT_VIEW = '''
create materialized view all_program_nodes as
select p.strat       as strat,
       p.program_id  as program,
       p.repo        as repo,
       p.commit_id   as commit_id,
       g.graph_id    as graph,
       n.contents    as content,
       n.label       as label,
       n.node_type   as ntype,
       p.program_num as program_num,
       p.train       as train
from (select *,
             arr[1]                                                      as repo,
             arr[array_length(arr, 1)]                                   as commit_id,
             rank() over (partition by strat, test order by shuffle_val) as program_num,
             not test                                                    as train
      from (select *,
                   string_to_array(regexp_replace(program_id, '^[VFNL]C-', ''), '_') as arr,
                   substring(program_id for 2)                                       as strat,
                   random()                                                          as shuffle_val,
                   case when random() > .8 then true else false end                  as test
            from programs
            where program_id like '%Lab12%DecimalToBinary.java%') as sp) as p
         join graphs as g on p.id = g.program_id
         join nodes as n on g.id = n.graph_id
'''

CREATE_STRAT_INDEX = 'create index can_strat on all_program_nodes (strat, train)'
CREATE_ROW_INDEX = 'create index node_row_num on all_program_nodes (strat, train, program_num)'
CREATE_PROGRAM_INDEX = 'create index strat_program on all_program_nodes (strat, train, program)'
CREATE_GRAPH_INDEX = 'create index strat_program_graphs on all_program_nodes (strat, train, program, graph)'

def create_materialized_view(conn: Connection):
    conn.execute(DROP_IF_EXISTS_MAT_VIEW)
    conn.execute(CREATE_MAT_VIEW)
    conn.execute(CREATE_STRAT_INDEX)
    conn.execute(CREATE_ROW_INDEX)
    conn.execute(CREATE_PROGRAM_INDEX)
    conn.execute(CREATE_GRAPH_INDEX)

In [12]:
GET_STRAT_DIMENSIONS = '''
-- dimension 0 - programs
with dim0 as (select count(program) as dim,
                     strat,
                     train
              from (select strat, train, program from all_program_nodes group by strat, train, program) as p
              group by strat, train),

-- dimension 1 - max number of graphs
     dim1 as (select max(num_graphs) as dim,
                     strat,
                     train
              from (select count(*) num_graphs,
                           strat,
                           train,
                           program
                    from (select strat,
                                 train,
                                 program,
                                 graph
                          from all_program_nodes
                          group by strat, train, program, graph) graphs
                    group by strat,
                             train,
                             program) graph_counts
              group by strat,
                       train),

-- dimension 2 - max number of nodes (dimension 3 is the node embeddings which are size 50)
     dim2 as (select max(num_graphs) as dim,
                     strat,
                     train
              from (select count(graph) as num_graphs,
                           strat,
                           train
                    from all_program_nodes
                    group by strat,
                             train,
                             program,
                             graph
                    order by num_graphs desc) num_nodes
              group by strat,
                       train),
-- all strat train pairs and their respective shapes
     strat_train_pairs as (select dim0.strat as strat,
                                  dim0.train,
                                  dim0.dim   as dim0,
                                  dim1.dim   as dim1,
                                  dim2.dim   as dim2
                           from dim0
                                    join dim1 on dim0.strat = dim1.strat and dim0.train = dim1.train
                                    join dim2 on dim0.strat = dim2.strat and dim0.train = dim2.train)

select strat,
       train,
       dim0,
       max(dim1) over (partition by strat) as dim1,
       max(dim2) over (partition by strat) as dim2,
       50                                  as dim3
from strat_train_pairs
'''

def set_shapes(conn: Connection, shapes: dict[(str, bool), tuple[int, int, int, int]]):
    cursor = conn.execute(GET_STRAT_DIMENSIONS)
    cursor.row_factory = namedtuple_row

    results = cursor.fetchall()
    for shape in results:
        shapes[(shape.strat, shape.train)] = (shape.dim0, shape.dim1, shape.dim2, shape.dim3)

In [13]:
GET_BATCH = '''
select
    strat,
    program,
    tag_id,
    graph,
    content,
    label,
    ntype,
    program_num
from
    all_program_nodes
where
    all_program_nodes.program_num >= %(lower_bound)s and
    all_program_nodes.program_num < %(upper_bound)s and
    strat = %(canonicalization_strategy)s and
    train = %(train)s
order by
    program_num, strat, program;
'''


import pandas as pd
from pathlib import Path
from typing import Iterable
import csv

data_dir = Path.cwd() / '..' / 'data' / 'Data'

df = pd.read_csv(data_dir / 'tagged_submissions_lab_12.csv')
df.drop(columns=[col for col in df.columns if col.endswith('Output')], inplace=True)
df.set_index(keys='id', inplace=True)
tags_titles = [c for c in df.columns]
df['target'] = [[e for e in x] for _, x in df.iterrows()]
df.drop(columns=df.columns[:-1], inplace=True)
tags = df.to_dict('index')

def get_tags(programs: Iterable[str], ) -> list[int]:
    logging.debug(tags)
    logging.debug(programs)
    return [tags['csc' + tag_id]['target'] for tag_id in programs]


def content_string(node_type: str, label: Optional[str], contents: Optional[str]) -> str:
    if node_type == 'Source':
        return f'Source {contents}'
    if node_type == 'Sink':
        return f'Sink {contents}'
    out_label = f'{label}: ' if label else ''
    out_contents = contents if contents else ''
    return f'{out_label}{out_contents}'

class Node:
    def __init__(self, content: str):
        self.content: str = content

class Graph:
    def __init__(self):
        self.nodes: list[Node] = []


class Program:
    def __init__(self, program_id):
        self.graphs: dict[str, Graph] = {}
        self.program_id: str = program_id


class Batch:
    def __init__(self, strat):
        self.programs: dict[str, Program] = {}
        self.strat = strat + '-'

    def to_tensor(self, w2v: Word2Vec, batch_size: int, max_graphs: int, max_nodes: int) -> tuple[torch.Tensor, torch.Tensor]:
        data = np.zeros((batch_size, max_graphs, max_nodes, 50), dtype=np.float32)
        program_ids = []
        for p_index, (program_id, program) in enumerate(self.programs.items()):
            for g_index, graph in enumerate(program.graphs.values()):
                for n_index, node in enumerate(graph.nodes):
                    if node.content in w2v.wv:
                        data[p_index, g_index, n_index] = w2v.wv[node.content]
            program_ids.append(program_id[len(self.strat):])

        batch_tags = get_tags(program_ids)
        logging.debug(batch_tags)
        return torch.as_tensor(data, dtype = torch.float32, device = torch.device(device)).squeeze(), torch.tensor(batch_tags, dtype = torch.float32, device = torch.device(device))

def get_prog_batch(conn: Connection, lower_bound: int, upper_bound: int, canonicalization_strategy: str, train: bool) -> Batch:
    batch = Batch(canonicalization_strategy)
    cursor = conn.execute(GET_BATCH, {'lower_bound': lower_bound, 'upper_bound': upper_bound, 'canonicalization_strategy': canonicalization_strategy, 'train': train})
    cursor.row_factory = namedtuple_row
    for row in cursor:
        if not row.tag_id in batch.programs:
            batch.programs[row.tag_id] = Program(row.tag_id)
        if not row.graph in batch.programs[row.tag_id].graphs:
            batch.programs[row.tag_id].graphs[row.graph] = Graph()
        batch.programs[row.tag_id].graphs[row.graph].nodes.append(Node(content_string(row.ntype, row.label, row.content)))

    return batch

In [None]:
class BatchedProgramDataset(torch.utils.data.IterableDataset):
    def __init__(
            self,
            canonicalization_strategy: str,
            w2v_model: str,
            batch_size: int,
            num_programs: int,
            max_graphs: int,
            max_nodes: int,
            train: bool
    ):
        super(BatchedProgramDataset, self).__init__()
        self.canonicalization_strategy = canonicalization_strategy
        self.num_programs = num_programs
        self.batch_size = batch_size
        self.w2v_model = w2v_model
        self.max_graphs = max_graphs
        self.max_nodes = max_nodes
        self.train = train

    def _get_worker_bounds(
            self,
            worker_id: int,
            num_workers: int
    ) -> tuple[int, int]:
        total_batches = self.num_programs // self.batch_size
        (num_batches, batchs_leftover) = divmod(total_batches, num_workers)

        def worker_start_bound(num):
            offset = num if num < batchs_leftover else batchs_leftover
            return self.batch_size * (num_batches * num + offset) + 1

        return worker_start_bound(worker_id), worker_start_bound(worker_id + 1)


    def __len__(self):
        return self.num_programs // self.batch_size


    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        (lower_bound, upper_bound) = (1, self.num_programs + 1) if worker_info is None else self._get_worker_bounds(worker_info.id, worker_info.num_workers)
        conn = db_connect()
        w2v = Word2Vec.load(self.w2v_model)
        for lower_bound, upper_bound in itertools.pairwise(range(lower_bound, upper_bound + 1, self.batch_size)):
            yield get_prog_batch(
                conn,
                lower_bound,
                upper_bound,
                self.canonicalization_strategy,
                self.train
            ).to_tensor(
                w2v,
                self.batch_size,
                self.max_graphs,
                self.max_nodes
            )

In [14]:
from enum import Enum, auto
import math


class SplitKind(Enum):
    Train = auto()
    Test = auto()
    Validation = auto()


train = (None, 0)
validation = (None, 0)
test = (None, 0)

def get_data_split(vector_file: Path, batch_size: int, split: SplitKind) -> tuple[dict[str, pd.Series], int]:
    programs = {}
    vectors = pd.read_csv(vector_file)
    max_methods = 0
    for vector in vectors.iterrows():
        program_id = vector[1]['CodeStateID']
        if program_id not in programs:
            programs[program_id] = []
        programs[program_id].append(vector[1].drop('CodeStateID'))
        max_methods = max(max_methods, len(programs[program_id]))
    num_batches = len(programs) // batch_size
    test_batches_len = math.floor(num_batches * .2)
    val_batches_len = math.floor(num_batches * .1)
    train_batches_len = num_batches - test_batches_len - val_batches_len


class BatchedCode2VecDataset(torch.utils.data.IterableDataset):
    def __init__(self, vector_file: Path, batch_size: int, split: SplitKind):
        self.batch_size = batch_size
        self.dataset = get_data_split(vector_file, batch_size, split)
        self.max_methods = max_methods

    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info:
            raise 'You can not use more than one worker on this dataset'


In [14]:

from typing import Generator

save_dir = Path.cwd() / 'save'
w2v_model_dir = Path.cwd() / '..' / 'data' / 'models'

logging.getLogger().setLevel('CRITICAL')
# Add node training dataset

strats = ['VC', 'LC', 'FC', 'NC']
db_shapes = {}
with db_connect() as conn:
    set_shapes(conn, db_shapes)

DatasetType = tuple[BatchedProgramDataset, BatchedProgramDataset, tuple[int, int, int, int], tuple[int, int, int, int], str, str]

def get_graph_datasets(batch_size: int, shapes: dict[(str, bool), tuple[int, int, int, int]]) -> Generator[DatasetType, None, None]:
    for embed_method, strat in embedding_methods:
        train_shape = shapes[(strat, True)]
        test_shape = shapes[(strat, False)]
        train_dataset = BatchedProgramDataset(strat, str(w2v_model_dir / f'{strat}-{embed_method}.model'), batch_size, train_shape[0], train_shape[1], train_shape[2], True)
        test_dataset = BatchedProgramDataset(strat, str(w2v_model_dir / f'{strat}-{embed_method}.model'), batch_size, test_shape[0], test_shape[1], test_shape[2], False)
        yield train_dataset, test_dataset, train_shape, test_shape, strat, embed_method

DS_BATCH_SIZE = 20
datasets = [(train, test, train_shape, test_shape, strat, embed_method) for train, test, train_shape, test_shape, strat, embed_method in get_graph_datasets(DS_BATCH_SIZE, db_shapes)]

# Train Model

In [15]:
len(datasets[0][0])
datasets

[(<__main__.BatchedProgramDataset at 0x7fafa6467760>,
  <__main__.BatchedProgramDataset at 0x7fafa64655a0>,
  (282, 4, 29, 50),
  (70, 4, 29, 50),
  'FC',
  'deepwalk'),
 (<__main__.BatchedProgramDataset at 0x7fafa64653c0>,
  <__main__.BatchedProgramDataset at 0x7fafa64652d0>,
  (273, 4, 29, 50),
  (79, 4, 29, 50),
  'VC',
  'deepwalk'),
 (<__main__.BatchedProgramDataset at 0x7fafa6465390>,
  <__main__.BatchedProgramDataset at 0x7fafa64651e0>,
  (284, 4, 29, 50),
  (68, 4, 29, 50),
  'LC',
  'deepwalk'),
 (<__main__.BatchedProgramDataset at 0x7fafa6465300>,
  <__main__.BatchedProgramDataset at 0x7fafa64651b0>,
  (273, 4, 29, 50),
  (79, 4, 29, 50),
  'NC',
  'deepwalk'),
 (<__main__.BatchedProgramDataset at 0x7fafa6465360>,
  <__main__.BatchedProgramDataset at 0x7fafa6465210>,
  (282, 4, 29, 50),
  (70, 4, 29, 50),
  'FC',
  'node2vec'),
 (<__main__.BatchedProgramDataset at 0x7fafa6464cd0>,
  <__main__.BatchedProgramDataset at 0x7fafa6467820>,
  (273, 4, 29, 50),
  (79, 4, 29, 50),
  '

In [24]:
logging.getLogger().setLevel('CRITICAL')

pth_model_dir = Path.cwd() / 'models'

import math

def train_loop(dataset, model, loss_function, optimizer):
    size = len(dataset)
    batch = 0
    input_tensor = None
    loss = None
    for X, y in torch.utils.data.DataLoader(dataset, num_workers = 0):
        pred = model(X)
        logging.debug(pred)
        logging.debug(y)
        loss = loss_function(pred.squeeze(), y.squeeze())
        input_tensor = X

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        logging.debug(loss)
        if batch % 10 == 0:
            batch_loss, current = loss.item(), batch * len(X)
            logging.info(f'loss: {batch_loss:>7f} [{current:>5d}/{size:>5d}]')
        batch += 1

    loss, current = loss.item(), batch * len(input_tensor)
    logging.info(f'loss: {loss:>7f} [{current:>5d}/{size:>5d}]')

def val_loop(dataset, model, loss_function):
    dataloader = torch.utils.data.DataLoader(dataset, num_workers = 0)
    num_batches = len(dataloader)
    test_loss = 0
    sig = nn.Sigmoid()
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            pred = sig(pred)
            test_loss += loss_function(pred.squeeze(), y.squeeze()).item()

    test_loss /= num_batches
    logging.info(f'Test Error: \nAvg loss: {test_loss:>8f}\n')
    return test_loss

epochs_list = [5, 10, 50, 100, 500, 1000]
learning_rates = [5, 1, 1e-1, 1e-2, 1e-5]
final_scores = {}
for id, (epochs, lr) in enumerate(itertools.product(epochs_list, learning_rates)):
    for train, test, train_shape, test_shape, strat, embed_method in datasets:
        logging.critical(f'{id}. Training {strat}-{embed_method} prediction model w/{epochs} epochs and a {lr} learning rate\n------------------------------')
        model = CfgMisconceptionClassifier(50).to(device)
        loss_function = torch.nn.MultiLabelSoftMarginLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr = lr)
        train = list(train)
        train_len = math.floor(len(train) * .875)
        val_len = len(train) - train_len
        for t in range(epochs):
            logging.info(f'Epoch {t}\n------------------------------')
            train_ds, val = torch.utils.data.random_split(train, [train_len, val_len], generator=torch.Generator().manual_seed(train_len))
            train_loop(train_ds, model, loss_function, optimizer)
            val_loss = val_loop(val, model, loss_function)

        final_scores[(id, epochs, lr, f'{strat}-{embed_method} avg loss')] = val_loop(test, model, loss_function)
        torch.save(model.state_dict(), pth_model_dir / f'{strat}_{embed_method}_e{epochs}_lr{lr:e}_{id}_sigmoid_mi_weights.pth')
        del model

import pprint
pprint.pprint(final_scores)

CRITICAL:root:0. Training FC-deepwalk prediction model w/5 epochs and a 5 learning rate
------------------------------
CRITICAL:root:0. Training VC-deepwalk prediction model w/5 epochs and a 5 learning rate
------------------------------
CRITICAL:root:0. Training LC-deepwalk prediction model w/5 epochs and a 5 learning rate
------------------------------
CRITICAL:root:0. Training NC-deepwalk prediction model w/5 epochs and a 5 learning rate
------------------------------
CRITICAL:root:0. Training FC-node2vec prediction model w/5 epochs and a 5 learning rate
------------------------------
CRITICAL:root:0. Training VC-node2vec prediction model w/5 epochs and a 5 learning rate
------------------------------
CRITICAL:root:0. Training LC-node2vec prediction model w/5 epochs and a 5 learning rate
------------------------------
CRITICAL:root:0. Training NC-node2vec prediction model w/5 epochs and a 5 learning rate
------------------------------
CRITICAL:root:1. Training FC-deepwalk prediction

{(0, 5, 5, 'FC-deepwalk avg loss'): 0.4711681604385376,
 (0, 5, 5, 'FC-node2vec avg loss'): 0.4733625253041585,
 (0, 5, 5, 'LC-deepwalk avg loss'): 0.5048283239205679,
 (0, 5, 5, 'LC-node2vec avg loss'): 0.5062294900417328,
 (0, 5, 5, 'NC-deepwalk avg loss'): 0.4556235869725545,
 (0, 5, 5, 'NC-node2vec avg loss'): 0.4549266993999481,
 (0, 5, 5, 'VC-deepwalk avg loss'): 0.4973299602667491,
 (0, 5, 5, 'VC-node2vec avg loss'): 0.49885615706443787,
 (1, 5, 1, 'FC-deepwalk avg loss'): 0.4712962806224823,
 (1, 5, 1, 'FC-node2vec avg loss'): 0.4885110358397166,
 (1, 5, 1, 'LC-deepwalk avg loss'): 0.5046083430449168,
 (1, 5, 1, 'LC-node2vec avg loss'): 0.5192470848560333,
 (1, 5, 1, 'NC-deepwalk avg loss'): 0.45274584492047626,
 (1, 5, 1, 'NC-node2vec avg loss'): 0.47166235248247784,
 (1, 5, 1, 'VC-deepwalk avg loss'): 0.4977423946062724,
 (1, 5, 1, 'VC-node2vec avg loss'): 0.5144098500410715,
 (2, 5, 0.1, 'FC-deepwalk avg loss'): 0.48950116833051044,
 (2, 5, 0.1, 'FC-node2vec avg loss'): 0.49

In [None]:
{k: v for k, v in sorted(final_scores.items(), key=lambda item: item[1])}

In [65]:
def test_loop(dataset, model, loss_function, most_common_class):
    dataloader = torch.utils.data.DataLoader(dataset, num_workers = 0)
    num_batches = len(dataloader)
    total_programs = num_batches * dataset.batch_size
    test_loss = 0
    c_test_loss = 0
    counts = []
    c_counts = []
    mcc_tensor = torch.as_tensor(most_common_class).to(device)
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            full_prediction = np.around(pred.squeeze().cpu().numpy())
            actual = y.squeeze().cpu().numpy()
            counts.append(np.count_nonzero(np.equal(full_prediction, actual), axis=0))
            c_counts.append(np.count_nonzero(np.equal(most_common_class, actual), axis=0))
            test_loss += loss_function(pred.squeeze(), y.squeeze()).item()
            c_test_loss += loss_function(mcc_tensor, y.squeeze()).item()

    count_arr = np.array(counts)
    count_arr = count_arr.sum(axis=0, dtype=np.float32)
    count_arr /= float(total_programs)
    test_loss /= num_batches

    c_count_arr = np.array(c_counts)
    c_count_arr = c_count_arr.sum(axis=0, dtype=np.float32)
    c_count_arr /= float(total_programs)
    c_test_loss /= num_batches
    logging.info(f'Test Error: \nAvg loss: {test_loss:>8f}\n')
    return np.append(count_arr, test_loss), np.append(c_count_arr, c_test_loss)

epochs_list = [5, 10, 50, 100, 500, 1000]
learning_rates = [5, 1, 1e-1, 1e-2, 1e-5]
final_scores = {}

dataframes = []

ncs = [ds for ds in datasets if ds[4] == 'NC']
data = []
columns = []
for train, test, train_shape, test_shape, strat, embed_method in ncs:
    for id, (epochs, lr) in enumerate(itertools.product(epochs_list, learning_rates)):
        model = CfgMisconceptionClassifier(50).to(device)
        model_dict = torch.load(pth_model_dir / f'{strat}_{embed_method}_e{epochs}_lr{lr:e}_{id}_sigmoid_mi_weights.pth')
        model.load_state_dict(model_dict)
        loss_function = torch.nn.MultiLabelSoftMarginLoss()
        pred_result, common_result = test_loop(test, model, loss_function, np.ones((test.batch_size, 11)))
        data.append(pred_result)
        columns.append(f'{strat}-{embed_method}-E{epochs}-LR{lr}-{id}')

nc_results = pd.DataFrame(data=data + [common_result], columns=tags_titles + ['Loss'], index=columns + ['Assume Most Common Class'])

In [66]:
pd.DataFrame((nc_results.drop(columns=['Loss']).sum(1) / len(nc_results.columns)), columns=['Accuracy']).merge(nc_results['Loss'], left_index=True, right_index=True)

Unnamed: 0,Accuracy,Loss
NC-deepwalk-E5-LR5-0,0.780556,0.416198
NC-deepwalk-E5-LR1-1,0.844444,0.400739
NC-deepwalk-E5-LR0.1-2,0.550000,0.509405
NC-deepwalk-E5-LR0.01-3,0.468056,0.513663
NC-deepwalk-E5-LR1e-05-4,0.516667,0.513796
...,...,...
NC-node2vec-E1000-LR1-26,0.844444,0.398353
NC-node2vec-E1000-LR0.1-27,0.844444,0.398577
NC-node2vec-E1000-LR0.01-28,0.844444,0.453816
NC-node2vec-E1000-LR1e-05-29,0.569444,0.512059


In [56]:
display(nc_results)
mcc_results

Unnamed: 0,Does not calculate the number of digits,Reads positive numbers and continues reading,Can handle INT MAX and get the correct answer,Did not test negative numbers,Output matches correct implementation,Compiles,Reads positive and negative numbers and continues reading until -1,Does not double 0 into 00,Does not use System.exit to leave the program,Can handle INT MAX,Scanner used correctly,Loss
NC-deepwalk-E5-LR5-0,0.95,0.966667,0.95,0.816667,0.9,1.0,0.783333,0.95,0.116667,0.966667,0.966667,0.416198
NC-deepwalk-E5-LR1-1,0.95,0.966667,0.95,0.816667,0.9,1.0,0.783333,0.95,0.883333,0.966667,0.966667,0.400739
NC-deepwalk-E5-LR0.1-2,0.05,0.966667,0.95,0.183333,0.9,0.866667,0.783333,0.95,0.883333,0.033333,0.033333,0.509405
NC-deepwalk-E5-LR0.01-3,0.05,0.033333,0.95,0.816667,0.9,0.0,0.783333,0.85,0.883333,0.316667,0.033333,0.513663
NC-deepwalk-E5-LR1e-05-4,0.95,0.033333,0.05,0.183333,0.9,1.0,0.216667,0.05,0.883333,0.966667,0.966667,0.513796
NC-deepwalk-E10-LR5-5,0.95,0.966667,0.95,0.816667,0.9,1.0,0.783333,0.95,0.883333,0.966667,0.966667,0.39503
NC-deepwalk-E10-LR1-6,0.95,0.966667,0.95,0.816667,0.9,1.0,0.783333,0.95,0.883333,0.966667,0.966667,0.400548
NC-deepwalk-E10-LR0.1-7,0.95,0.966667,0.85,0.466667,0.766667,0.95,0.783333,0.95,0.883333,0.033333,0.966667,0.505446
NC-deepwalk-E10-LR0.01-8,0.05,0.033333,0.95,0.183333,0.116667,1.0,0.216667,0.95,0.883333,0.033333,0.933333,0.512004
NC-deepwalk-E10-LR1e-05-9,0.95,0.033333,0.483333,0.816667,0.1,0.0,0.783333,0.05,0.883333,0.966667,0.033333,0.515393


Unnamed: 0,0
Does not calculate the number of digits,0.95
Reads positive numbers and continues reading,0.966667
Can handle INT MAX and get the correct answer,0.95
Did not test negative numbers,0.816667
Output matches correct implementation,0.9
Compiles,1.0
Reads positive and negative numbers and continues reading until -1,0.783333
Does not double 0 into 00,0.95
Does not use System.exit to leave the program,0.883333
Can handle INT MAX,0.966667


In [None]:
c2b_misconceptions = [
    'Calculates the number of digits in the binary representation',
    'Converts the number to a decimal representation of the binary number',
    'Multiplies up to the number instead of using division',
    'Conflates do-while and while loops',
    'Did not handle negative numbers',
]

main_misconceptions = [
    'Does not continue reading when given a positive number',
    'Does not use proper control flow to finish the program',
    'Uses Scanner.next() instead of Scanner.nextInt()',
    'Does not quit when given -1',
    'Does not reprompt user when given any other negative number',
]

all_misconceptions = [
    (idx + 1, mis, t) for idx, (mis, t) in itertools.chain(enumerate(zip(c2b_misconceptions, itertools.repeat('convertToBinary'))), enumerate(zip(main_misconceptions, itertools.repeat('main'))))
]
tags = pd.DataFrame(all_misconceptions, columns=['Number', 'Misconception', 'Method'])
print(tags.set_index(['Method', 'Number']).style.to_latex())

In [None]:
tests_c2b = [
    ('Does not use the Math.pow() function', ['!1', '!3']),
    ('Can handle INT MAX and get the correct answer', ['!1', '!2', '!3', '!4']),
    ('Did not test negative numbers', ['5']),
    ('Does not double 0 into 00', ['!4']),
    ('Can handle INT MAX', ['!1', '!2', '!3'])
]

tests_main = [
    ('Reads positive numbers and continues reading', ['!1', '!4']),
    ('Reads positive and negative numbers and continues reading until -1', ['!1', '!4', '!5']),
    ('Does not use System.exit to leave the program', ['!2']),
    ('Scanner used correctly', ['!3'])
]

all_tests = [
    (idx + 1, t, method, ', '.join(implications)) for idx, ((t, implications), method) in itertools.chain(enumerate(zip(tests_c2b, itertools.repeat('convertToBinary'))), enumerate(zip(tests_main, itertools.repeat('main'))))
]
tags = pd.DataFrame(all_tests, columns=['Number', 'Test', 'Method', 'Implied Misconceptions'])
print(tags.set_index(['Method', 'Number']).style.to_latex())

In [38]:
tag_df = pd.DataFrame(tags)

pd.DataFrame(pd.DataFrame(tag_df.transpose()['target'].to_list(), columns=tags_titles).sum(0) / len(tags), columns=['% of submissions that passed this test'])

Unnamed: 0,% of submissions that passed this test
Does not calculate the number of digits,0.907821
Reads positive numbers and continues reading,0.874302
Can handle INT MAX and get the correct answer,0.868715
Did not test negative numbers,0.734637
Output matches correct implementation,0.874302
Compiles,0.97486
Reads positive and negative numbers and continues reading until -1,0.712291
Does not double 0 into 00,0.882682
Does not use System.exit to leave the program,0.703911
Can handle INT MAX,0.891061
