In [1067]:
import os
import shutil
import pandas as pandas
import numpy as numpy
from dotenv import load_dotenv
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score
import torch
import torch.nn as nn
load_dotenv()

True

In [1068]:
TEST_SIZE = 0.2
RANDOM_STATE = 42

# These are the columns the program will extract mathematical features from.
# FORMAT: You can list multiple columns by separating the names with a comma
COLUMNS_WITH_EQUATIONS = "lEquations"

# When generating the geq columns, this constant will define the number of geq columns generated for each math feature
NUM_OF_SYMBOL_MAX_VALUE = 8

CORRECT_COLUMN = "is_correct"
VALID_COLUMN = "valid"

# These are just constants which define what sort of values we're looking for and the column which indicates whether a row is valid
CORRECT_COLUMN = "is_correct"
VALID_COLUMN = "valid"

QUESTION_NO = 'question_No'

# Some string consts so the column names can be modified a bit easier
NUM_OF_ADDITION_SUFFIX = "_num_of_addition"
NUM_OF_SUBTRACTION_SUFFIX = "_num_of_subtraction"
NUM_OF_ADDITION_AND_SUBTRACTION_SUFFIX = "_num_of_addition_and_subtraction"
NUM_OF_MULTIPLICATION_SUFFIX = "_num_of_multiplication"
NUM_OF_DIVISION_SUFFIX = "_num_of_division"
NUM_OF_MULTIPLICATION_AND_DIVISION_SUFFIX = "_num_of_multiplication_and_division"
NUM_OF_EQUATIONS_SUFFIX = "_num_of_equations"


COLUMN_TO_CHECK = "result"

MAX_NAME_DIFFERENCE = 3

# Various constants with text defined by Abhinav
# These constants are here so that, if Abhinav changes his mind on what the strings will look like, 
# the information is centralized
ALL_ANSWERS = "has all the answers"
ALL_ANSWERS_ROUNDED = "has all the answers when rounded"
SOME_SOLUTION = "has one or more of the answers, but not all of them"
SOME_SOLUTION_ROUNDED = "has one or more of the answers when rounded, but not all of them"
NO_SOLUTION = "says no solution"
INVALID = "invalid"

In [1069]:
def num_of_symbol(equations, symbol):
    # Calculates the number of '+' symbols in [equations]
    # INPUT: [equations] should be an array of strings which represent equations
    # INPUT: [symbol] should be a character.
    count = 0
    for equation in equations:
        count += equation.count(symbol)
    return count

    """"""
def num_of_addition(equations):
    # Calculates the number of '+' symbols in [equations]
    # INPUT: [equations] should be an array of strings which represent equations
    return num_of_symbol(equations, '+')
    
    """"""
def num_of_subtraction(equations): 
    # Calculates the number of '-' symbols in [equations]
    # INPUT: [equations] should be an array of strings which represent equations
    return num_of_symbol(equations, '-')

    """"""
def num_of_multiplication(equations):
    # Calculates the number of '*' symbols in [equations]
    # INPUT: [equations] should be an array of strings which represent equations
    return num_of_symbol(equations, '*')

    """"""
def num_of_division(equations):
    # Calculates the number of '/' symbols in [equations]
    # INPUT: [equations] should be an array of strings which represent equations
    return num_of_symbol(equations, '/')

    """"""
def num_of_equations(equations):
    # Calculates the number of equations in [equations]
    # INPUT: [equations] should be an array of strings which represent equations
    return len(equations)

    """"""
def generate_geq_columns(data, column, max_value):
    # Generates greater than or equal binary columns of a mathematical feature.
    # Once we calculate the number (amount) of each mathematical feature, 
    # we generate columns of 0s and 1s representing whether the equations have greater than or equal
    # number of a particular feature
    for value in range(1, max_value + 1):
        data[column + "_geq_" + str(value)] = generate_geq_column(data, column, value)
    return data

    """"""
def generate_geq_column(data, column, value):
    # Generates a column of 0s and 1s which represents whether or not the rows in data[column] are >= value. 1 means True and 0 means False 
    return data.apply(lambda row : 1 if row[column] >= value else 0, axis=1)
    
    """"""

In [1070]:
def extract_features(input_file_path, output_file_path):
    # Load data from files
    data = pandas.read_json(input_file_path)

    # Calculate the number (amount) of each particular mathematical features
    # We calculate the number of additions (+), subtractions (-), multiplications (*), divisions (*) and equations.
    columns_with_equations = COLUMNS_WITH_EQUATIONS.split(',')
    for column in columns_with_equations:
        data[column + NUM_OF_ADDITION_SUFFIX] = data.apply(lambda row : num_of_addition(row[column]), axis=1)
        data[column + NUM_OF_SUBTRACTION_SUFFIX] = data.apply(lambda row : num_of_subtraction(row[column]), axis=1)
        data[column + NUM_OF_MULTIPLICATION_SUFFIX] = data.apply(lambda row : num_of_multiplication(row[column]), axis=1)
        data[column + NUM_OF_DIVISION_SUFFIX] = data.apply(lambda row : num_of_division(row[column]), axis=1)
        data[column + NUM_OF_EQUATIONS_SUFFIX] = data.apply(lambda row : num_of_equations(row[column]), axis=1)
        
        data[column + NUM_OF_ADDITION_AND_SUBTRACTION_SUFFIX] = data[column + NUM_OF_ADDITION_SUFFIX] + data[column + NUM_OF_SUBTRACTION_SUFFIX]
        data[column + NUM_OF_MULTIPLICATION_AND_DIVISION_SUFFIX] = data[column + NUM_OF_MULTIPLICATION_SUFFIX] + data[column + NUM_OF_DIVISION_SUFFIX]

        # data = generate_geq_columns(data, column + NUM_OF_ADDITION_SUFFIX, NUM_OF_SYMBOL_MAX_VALUE)
        # data = generate_geq_columns(data, column + NUM_OF_SUBTRACTION_SUFFIX, NUM_OF_SYMBOL_MAX_VALUE)
        data = generate_geq_columns(data, column + NUM_OF_ADDITION_AND_SUBTRACTION_SUFFIX, NUM_OF_SYMBOL_MAX_VALUE)

        # data = generate_geq_columns(data, column + NUM_OF_MULTIPLICATION_SUFFIX, NUM_OF_SYMBOL_MAX_VALUE)
        # data = generate_geq_columns(data, column + NUM_OF_DIVISION_SUFFIX, NUM_OF_SYMBOL_MAX_VALUE)
        data = generate_geq_columns(data, column + NUM_OF_MULTIPLICATION_AND_DIVISION_SUFFIX, NUM_OF_SYMBOL_MAX_VALUE)

        data = generate_geq_columns(data, column + NUM_OF_EQUATIONS_SUFFIX, NUM_OF_SYMBOL_MAX_VALUE)

    # Save data to output file
    data.to_json(output_file_path, orient='records')

In [1071]:
def negation(column):
    # negation -- 
    # OUTPUT: returns a column of 0s and 1s of the negation of [column]. 1s are flipped to 0 and vice versa
    # INPUT: [column] should be a column of 0s and 1s
    return 1 - column
    
    """"""
def conjunction(column_1, column_2):
    # conjunction -- 
    # output: returns a column of 0s and 1s of the conjunction between [column_1] and [column_2].
    # INPUT: [column_1] and [column_2] should be columns of 0s and 1s
    return column_1 * column_2
    
    """"""
def disjunction(column_1, column_2):
    # disjunction -- 
    # OUTPUT: returns a column of 0s and 1s of the disjunction between [column_1] and [column_2].
    # INPUT: [column_1] and [column_2] should be columns of 0s and 1s
    return column_1 | column_2
    
    """"""
def conditional_probability(occurence_column, condition_column):
    # conditional_probability -- 
    # OUTPUT: returns a number which represents the conditional probability p(occurence | condition)
    # INPUT: [occurence_column] and [condition_column] should be columns of 0s and 1s
    return conjunction(occurence_column, condition_column).sum() / condition_column.sum()
    
    """"""
def prior(data):
    # prior -- 
    # OUTPUT: returns a number which represents the prior
    # INPUT: [data] should be a Pandas dataframe with the columns [CORRECT_COLUMN] and [VALID_COLUMN].
    # TODO : Possible optimizations can be made where we cache the result instead of calling this expensive operation again and again
    return conditional_probability(data[CORRECT_COLUMN], data[VALID_COLUMN])
    
    """"""
def is_prima_facie(data, column_name):
    # is_prima_facie -- 
    # OUTPUT: returns a boolean which determines whether the column indicated by [column_name] is a prima facie
    # INPUT: [data] should be a Pandas dataframe with the columns [CORRECT_COLUMN] and [VALID_COLUMN].
    # INPUT: [column_name] should be a valid column in [data]
    # INPUT: The [CORRECT_COLUMN] and [VALID_COLUMN] columns should be columns of 0s and 1s 
    return conditional_probability(data[CORRECT_COLUMN], data[column_name]) > prior(data)
    
    """"""
def is_cooccur(column_1, column_2):
    # is_cooccur -- 
    # OUTPUT: returns a boolean based on if there is at least one row where both [column_1] and [column_2] is equal to 1
    # INPUT: [column_1] and [column_2] should both be columns of 0s and 1s
    return conjunction(column_1, column_2).sum() > 0
    
    """"""
def is_same_category(column_name_1, column_name_2):
    # same_category -- 
    # OUTPUT: Returns a boolean signifying whether the [column_name_1] and [column_name_2] are different by [MAX_NAME_DIFFERENCE]
    #         If the two words are not different by [MAX_NAME_DIFFERENCE], they are in the same category so it returns true
    count = 0
    shortest = min(len(column_name_1), len(column_name_2))
    for i in range(0, shortest):
        if column_name_1[i] == column_name_2[i]:
            count = count + 1
    return count < MAX_NAME_DIFFERENCE
    
    """"""
def rel(data, column_name):
    # rel -- 
    # OUTPUT: returns a list of the names of other columns which cooccur with [column_name] and are prima facie
    # INPUT: [data] should be a Pandas dataframe with the columns [CORRECT_COLUMN] and [VALID_COLUMN].
    # INPUT: [column_name] should be a valid column in [data]
    # INPUT: The [CORRECT_COLUMN] and [VALID_COLUMN] columns should be columns of 0s and 1s 
    # If it is not a prima facie cause, we don't bother to find its rel
    if not is_prima_facie(data,column_name): return[]
    
    name_list = []
    for potential_cause in data.columns:
        # Make sure we are not including the [CORRECT_COLUMN] and [VALID_COLUMN] as part of rel
        if potential_cause == CORRECT_COLUMN or potential_cause == VALID_COLUMN:
            continue

        if is_same_category(potential_cause, column_name): continue

        if is_cooccur(data[column_name], data[potential_cause]) and is_prima_facie(data, potential_cause):
            name_list.append(potential_cause)
    return name_list
    
    """"""
def calculate_causality(data, column_name):
    # calculate_causality -- 
    # OUTPUT: returns a number which represents the causality value of the column indicated by [column_name]
    # INPUT: [data] should be a Pandas dataframe with the columns [CORRECT_COLUMN].
    # INPUT: [column_name] should be a valid column in [data]
    # INPUT: The [CORRECT_COLUMN] and [VALID_COLUMN] columns should be columns of 0s and 1s 

    # If it's not a prima facie cause, we don't bother to calculate its causality value
    if not is_prima_facie(data, column_name):
        return "n/a"

    relateds = rel(data, column_name)

    total_probability = 0
    for related in relateds:
        conj = conjunction(data[column_name], data[related])
        negj = conjunction(negation(data[column_name]), data[related])

        conj = conditional_probability(data[CORRECT_COLUMN], conj)
        negj = conditional_probability(data[CORRECT_COLUMN], negj)

        total_probability += (conj - negj)

    if (len(relateds) > 0): return total_probability / len(relateds)
    else: return 0
    
    """"""
def is_binary_column(data, column_name):
    # is_binary_column --
    # Checks to see if a column is a column of 1s and 0s
    # INPUT: [data] is a dataframe
    # INPUT: [column_name] should be the name of a valid column in [data]
    return data.apply(lambda row : 0 if (isinstance(row[column_name], int) and (row[column_name] <= 1)) else 1, axis=1).sum() <= 0
    
    """"""
def remove_non_binary_columns(data):
    # remove_non_binary_columns --
    # Removes all columns that are not 0s or 1s in the dataset
    # INPUT: [data] is a dataframe
    non_binary = []
    for i in data.columns:
        if not is_binary_column(data, i):
            non_binary.append(i)

    return data.drop(columns=non_binary)
    
    """"""
def generate_row(data, column_name):
    # generate_row --
    # TODO: This is kind of a terrible name but I can't really think of anything more descriptive. If anyone has any ideas, feel free to modify it
    # It basically creates a row, which is actually a data frame with all the data that is needed
    # OUTPUT: It outputs a row with all the required values
    # INPUT: [data] should be a dataframe
    # INPUT: [column_name] should be a string representing a valid column in [data]
    toReturn = pandas.DataFrame({
        "name": [column_name], 
        "support": conjunction(data[column_name], data[VALID_COLUMN]).sum(),
        "causality": calculate_causality(data, column_name),
        "rel": ','.join(rel(data, column_name)),
        "conditional_probability":[conditional_probability(data[CORRECT_COLUMN], data[column_name])], 
        "prior": prior(data),
        "conditional - prior": conditional_probability(data[CORRECT_COLUMN], data[column_name]) - prior(data)
    })
    return toReturn
    
    """"""

In [1072]:
def combine(combined_output_file_path, chatgpt_file_path, output_file_path, is_correct, is_valid):
    problems = pandas.read_json(combined_output_file_path)
    chatgpt = pandas.read_json(chatgpt_file_path)

    for column in chatgpt.columns:
        if column == QUESTION_NO:
            continue

        problems.loc[chatgpt[QUESTION_NO], column] = chatgpt[column]

    problems[CORRECT_COLUMN] = problems.apply(lambda row : is_correct(row),axis=1)
    problems[VALID_COLUMN] = problems.apply(lambda row : is_valid(row),axis=1)

    problems.to_json(output_file_path, orient='records')

In [1073]:
def causality_values(input_file_path, output_file_path):
    # causality_values --
    # Calculates causality values

    # Load data
    data = pandas.read_json(input_file_path)

    # Then remove all the non binary columns
    data = remove_non_binary_columns(data)

    # TODO: I'm not sure if there's another way to do this, so feel free to make modifications
    # Generate a dud data frame with a single so we can append to it.
    to_save = generate_row(data, VALID_COLUMN)
    for column in data.columns:
        if column == VALID_COLUMN or column == CORRECT_COLUMN:
            continue

        to_save = to_save.append(generate_row(data, column))

    # Remove the dud first row
    to_save = to_save[1:]

    to_save.to_json(output_file_path, orient='records')

In [1074]:
def chatgpt_stats(input_file_path, output_file_path):
    data = pandas.read_json(input_file_path)
    count = data['result'].value_counts().rename_axis('value').reset_index(name='count')
    count.to_json(output_file_path, orient='records')

In [1075]:
def split_dataset(input_file_path, output_file_path, percent_correct):
    data = pandas.read_json(input_file_path)

    correct_data = data[data[CORRECT_COLUMN] == 1]
    wrong_data = data[data[CORRECT_COLUMN] == 0]

    correct_len = min(len(correct_data.index), len(wrong_data.index))
    wrong_len = int(correct_len / percent_correct * (1 - percent_correct))

    correct_data = correct_data.head(correct_len)
    wrong_data = wrong_data.head(wrong_len)

    correct_data = correct_data.append(wrong_data)
    correct_data = correct_data.sample(frac = 1, random_state=42)

    correct_data.to_json(output_file_path, orient='records')


In [1076]:
def random_forest(name, input_file_path, output_file_path):
    data = pandas.read_json(input_file_path)
    data = remove_non_binary_columns(data)
    
    data_y = data[CORRECT_COLUMN]
    data_x = data.drop(columns=[CORRECT_COLUMN, VALID_COLUMN])

    output =  pandas.DataFrame({
        "fold": [-1],
        "guessed " + name  + " precision score": [0],
        "guessed " + name  + " recall score": [0],
        "guessed not " + name  + " precision score": [0],
        "guessed not " + name  + " recall score": [0],
    })

    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", RandomForestClassifier())
    ])
    # Split the data set into training and test set using Stratified Sampling
    split = StratifiedKFold(n_splits=5,random_state=RANDOM_STATE, shuffle=True)
    current_fold = 0
    for train_index, test_index in split.split(data_x, data_y):
        strat_train_set_x, strat_train_set_y = data_x.loc[train_index], data_y.loc[train_index]
        strat_test_set_x, strat_test_set_y = data_x.loc[test_index], data_y.loc[test_index]

        # Perform grid search to tune hyperparameters
        param_grid = {
            "clf__n_estimators": [100, 500, 1000],
            "clf__max_depth": [1, 5, 10, 25],
            "clf__max_features": [*numpy.arange(0.1, 1.1, 0.1)],
        }

        pipe = clone(pipeline)

        pipe.fit(strat_train_set_x, strat_train_set_y)
        pipe_predict_y = pipe.predict(strat_test_set_x)

        auc = metrics.roc_auc_score(strat_test_set_y, pipe_predict_y)

        current_fold = current_fold + 1

        output = output.append(pandas.DataFrame({
            "fold": [current_fold],
            "guessed " + name  + " precision score": [precision_score(strat_test_set_y, pipe_predict_y)],
            "guessed " + name  + " recall score": [recall_score(strat_test_set_y, pipe_predict_y)],
            "guessed not " + name  + " precision score": [precision_score(negation(strat_test_set_y), negation(pipe_predict_y))],
            "guessed not " + name  + " recall score": [recall_score(negation(strat_test_set_y), negation(pipe_predict_y))],
        }))
    
    output = output[1:]

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(data, data[CORRECT_COLUMN]):

        pipe = clone(pipeline)
        
        pipe.fit(strat_train_set_x, strat_train_set_y)
        pipe_predict_y = pipe.predict(strat_test_set_x)

        auc = metrics.roc_auc_score(strat_test_set_y, pipe_predict_y)

        current_fold = current_fold + 1

        output = output.append(pandas.DataFrame({
            "fold": ["full dataset" + str(current_fold)],
            "guessed " + name  + " precision score": [precision_score(strat_test_set_y, pipe_predict_y)],
            "guessed " + name  + " recall score": [recall_score(strat_test_set_y, pipe_predict_y)],
            "guessed not " + name  + " precision score": [precision_score(negation(strat_test_set_y), negation(pipe_predict_y))],
            "guessed not " + name  + " recall score": [recall_score(negation(strat_test_set_y), negation(pipe_predict_y))],
        }))

    output.to_json(output_file_path, orient="records")

In [1077]:
def neural_network(name, input_file_path, output_file_path):
        
    data = pandas.read_json(input_file_path)
    output = data[CORRECT_COLUMN]

    total_records = data[data[VALID_COLUMN] == 1][VALID_COLUMN].sum()

    data = remove_non_binary_columns(data)
    data = data.drop(columns=[CORRECT_COLUMN])

    numerical_data = numpy.stack([data[col].values for col in data.columns], 1)
    numerical_data = torch.tensor(numerical_data, dtype=torch.float)

    output = torch.tensor(output.values).flatten()

    test_records = int(total_records * .2)

    numerical_train_data = numerical_data[:total_records-test_records]
    categorical_train_data = pandas.DataFrame()
    numerical_test_data = numerical_data[total_records-test_records:total_records]
    train_outputs = output[:total_records-test_records]
    test_outputs = output[total_records-test_records:total_records]

    class Model(nn.Module):

        def __init__(self, num_numerical_cols, output_size, layers, p=0.4):
            super().__init__()
            self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)

            all_layers = []
            input_size = num_numerical_cols

            for i in layers:
                all_layers.append(nn.Linear(input_size, i))
                all_layers.append(nn.ReLU())
                input_size = i
            all_layers.append(nn.Softmax())
            self.layers = nn.Sequential(*all_layers)

        def forward(self, x_numerical):
            x_numerical = self.batch_norm_num(x_numerical)
            x = torch.tensor(x_numerical)
            x = self.layers(x)
            return x

    model = Model(numerical_data.shape[1], 2, [200,100,50], p=0.4)
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    epochs = 1000
    aggregated_losses = []

    for i in range(epochs):
        y_pred = model(numerical_train_data)
        single_loss = loss_function(y_pred, train_outputs)
        aggregated_losses.append(single_loss)

        # if i%25 == 1:
        #     print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

        optimizer.zero_grad()
        single_loss.backward()
        optimizer.step()

    with torch.no_grad():
        y_val = model(numerical_test_data)
        loss = loss_function(y_val, test_outputs)
    y_val = numpy.argmax(y_val, axis=1)
    print(y_val)

    output = pandas.DataFrame({
            "guessed " + name  + " precision score": [precision_score(y_val, test_outputs)],
            "guessed " + name  + " recall score": [recall_score(y_val, test_outputs)],
            "guessed not " + name  + " precision score": [precision_score(negation(y_val), negation(test_outputs))],
            "guessed not " + name  + " recall score": [recall_score(negation(y_val), negation(test_outputs))],
        })
    output.to_json(output_file_path, orient='records')

In [1078]:
def convert_json_to_csv(input_file_path):
    data = pandas.read_json(input_file_path)
    data.to_csv(os.path.splitext(input_file_path)[0]+'.csv')

In [1079]:
def execute(name, is_correct, is_valid):
    print(name, "*********************")
    folder_path = "../output/" + name.lower() + "/"
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    os.makedirs(folder_path)

    # Define the input file paths of the needed data. These should be in json format
    PROBLEMS_INPUT_FILE_PATH = '../input/draw.json'
    CHATGPT_INPUT_FILE_PATH = '../input/chatgpt.json'
    EXTRACT_FEATURES_OUTPUT_FILE_PATH = folder_path + '1_extract_features.json'
    COMBINE_OUTPUT_FILE_PATH = folder_path + '2_combine.json'
    CAUSALITY_OUTPUT_FILE_PATH = folder_path + '3_causality.json'
    CHATGPT_STATS_OUTPUT_FILE_PATH = folder_path + '4_chatgpt_stats.json'
    SPLIT_DATASET_OUTPUT_FILE_PATH = folder_path + '5_split_dataset.json'
    
    extract_features(PROBLEMS_INPUT_FILE_PATH, EXTRACT_FEATURES_OUTPUT_FILE_PATH)
    combine(EXTRACT_FEATURES_OUTPUT_FILE_PATH, CHATGPT_INPUT_FILE_PATH, COMBINE_OUTPUT_FILE_PATH, is_correct, is_valid)
    causality_values(COMBINE_OUTPUT_FILE_PATH, CAUSALITY_OUTPUT_FILE_PATH)
    chatgpt_stats(CHATGPT_INPUT_FILE_PATH, CHATGPT_STATS_OUTPUT_FILE_PATH)

    convert_json_to_csv(CAUSALITY_OUTPUT_FILE_PATH)
    convert_json_to_csv(CHATGPT_STATS_OUTPUT_FILE_PATH)

    RANDOM_FOREST_OUTPUT_FILE_PATH = folder_path + '6_random_forest_50-50.json'
    NEURAL_NETWORK_OUTPUT_FILE_PATH = folder_path + '7_neural_network_50-50.json'
    split_dataset(COMBINE_OUTPUT_FILE_PATH, SPLIT_DATASET_OUTPUT_FILE_PATH, percent_correct=0.5)
    random_forest(name, SPLIT_DATASET_OUTPUT_FILE_PATH, RANDOM_FOREST_OUTPUT_FILE_PATH)
    neural_network(name, SPLIT_DATASET_OUTPUT_FILE_PATH, NEURAL_NETWORK_OUTPUT_FILE_PATH)
    convert_json_to_csv(RANDOM_FOREST_OUTPUT_FILE_PATH)
    convert_json_to_csv(NEURAL_NETWORK_OUTPUT_FILE_PATH)

    RANDOM_FOREST_OUTPUT_FILE_PATH = folder_path + '6_random_forest_15-85.json'
    NEURAL_NETWORK_OUTPUT_FILE_PATH = folder_path + '7_neural_network_15-85.json'
    split_dataset(COMBINE_OUTPUT_FILE_PATH, SPLIT_DATASET_OUTPUT_FILE_PATH, percent_correct=0.15)
    random_forest(name, SPLIT_DATASET_OUTPUT_FILE_PATH, RANDOM_FOREST_OUTPUT_FILE_PATH)
    neural_network(name, SPLIT_DATASET_OUTPUT_FILE_PATH, NEURAL_NETWORK_OUTPUT_FILE_PATH)
    convert_json_to_csv(RANDOM_FOREST_OUTPUT_FILE_PATH)
    convert_json_to_csv(NEURAL_NETWORK_OUTPUT_FILE_PATH)

    # CLEANUP INTERMEDIATE FILES
    os.remove(EXTRACT_FEATURES_OUTPUT_FILE_PATH)
    os.remove(COMBINE_OUTPUT_FILE_PATH)
    os.remove(SPLIT_DATASET_OUTPUT_FILE_PATH)

In [1080]:
# is_valid --
# Checks to see if a row is "valid". We define what's "valid" here
# INPUT: [row] is a dict
def is_valid(row):
    return row[COLUMN_TO_CHECK] != INVALID

def column_check(correct_solutions, row):
    if row[COLUMN_TO_CHECK] in correct_solutions: return 0
    else: return 1

def not_somewhat_or_fully_correct(row):
    return column_check([ALL_ANSWERS, ALL_ANSWERS_ROUNDED, SOME_SOLUTION, SOME_SOLUTION_ROUNDED], row)
    
def not_fully_correct(row):
    return column_check([ALL_ANSWERS, ALL_ANSWERS_ROUNDED], row)

execute("Not_Somewhat_Or_Fully_Correct", not_somewhat_or_fully_correct, is_valid)
execute("Not_Fully_Correct", not_fully_correct, is_valid)


Not_Somewhat_Or_Fully_Correct *********************


  input = module(input)


tensor([0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
        1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1])


  input = module(input)


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Not_Fully_Correct *********************


  _warn_prf(average, modifier, msg_start, len(result))
  input = module(input)


tensor([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 1, 0, 1, 0, 1])


  input = module(input)


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])


  _warn_prf(average, modifier, msg_start, len(result))
