# Extract features
### 1. Extract data from files into pandas dataframe

In [None]:

import pandas as pd

# Paths used by this notebook: the JSON file that is read as input and the
# JSON file the extracted features are written to
INPUT_FILE_PATH = "../input/draw.json"
OUTPUT_FILE_PATH = "../output/1_extract_features.json"

In [None]:
# Load the JSON file at INPUT_FILE_PATH into a pandas DataFrame
data = pd.read_json(INPUT_FILE_PATH)

# Display the DataFrame to check that the JSON data was loaded correctly
data

### 2. Extract attributes from equations

In [None]:
# Functions to extract various attributes from math equations

def operator_count(row, column, operator):
    """Count how often ``operator`` occurs across all entries of ``row[column]``.

    This is the shared worker; the ``lEquations_count_*`` and
    ``template_count_*`` functions only fix the column and the character.

    Args:
        row: Mapping-like object indexable by ``column`` (e.g. a DataFrame row).
        column: Key whose value is an iterable of strings.
        operator: Substring to count (non-overlapping, as ``str.count`` does).

    Returns:
        Total number of occurrences summed over every entry; 0 when the
        iterable is empty.
    """
    return sum(entry.count(operator) for entry in row[column])
        
def lEquations_count_multiplication(row):
    """Return the number of '*' characters in the row's "lEquations" entries."""
    return operator_count(row, column="lEquations", operator="*")

def lEquations_count_addition(row):
    """Return the number of '+' characters in the row's "lEquations" entries."""
    return operator_count(row, column="lEquations", operator="+")

def lEquations_count_subtraction(row):
    """Return the number of '-' characters in the row's "lEquations" entries.

    Every '-' is counted, whether it is a binary minus or a sign.
    """
    return operator_count(row, column="lEquations", operator="-")

def lEquations_count_division(row):
    """Return the number of '/' characters in the row's "lEquations" entries."""
    return operator_count(row, column="lEquations", operator="/")

def lEquations_count_equals(row):
    """Return the number of '=' characters in the row's "lEquations" entries."""
    return operator_count(row, column="lEquations", operator="=")

def lEquations_count_parens(row):
    """Return the number of '(' characters in the row's "lEquations" entries.

    Only opening parentheses are counted.
    """
    return operator_count(row, column="lEquations", operator="(")


def template_count_multiplication(row):
    """Return the number of '*' characters in the row's "Template" entries."""
    return operator_count(row, column="Template", operator="*")

def template_count_addition(row):
    """Return the number of '+' characters in the row's "Template" entries."""
    return operator_count(row, column="Template", operator="+")

def template_count_subtraction(row):
    """Return the number of '-' characters in the row's "Template" entries.

    Every '-' is counted, whether it is a binary minus or a sign.
    """
    return operator_count(row, column="Template", operator="-")

def template_count_division(row):
    """Return the number of '/' characters in the row's "Template" entries."""
    return operator_count(row, column="Template", operator="/")

def template_count_equals(row):
    """Return the number of '=' characters in the row's "Template" entries."""
    return operator_count(row, column="Template", operator="=")


In [None]:
import re
import numpy as np

def is_unknown(x):
    """Return True when the string ``x`` is not a number, i.e. is an unknown.

    A string counts as a number when it consists only of numeric characters,
    optionally with a single decimal point ("12", "0.5", ".5", "5."). Anything
    else is treated as an unknown, including the empty string and a lone ".".

    Signs are not handled here: count_number_of_unknowns splits on '+' and
    '-' before calling this, so tokens never carry a sign.
    """
    # Dropping at most one '.' lets decimal literals pass the numeric check
    # while "1.2.3" or "x.1" are still rejected as non-numbers.
    return not x.replace(".", "", 1).isnumeric()

def is_not_empty(x):
    """Return True when ``x`` has a non-zero length, False otherwise."""
    return len(x) != 0

def count_number_of_unknowns(row, column):
    """Count the distinct unknowns that appear in the equations of ``row[column]``.

    Each equation is split on operator characters, the pieces are stripped of
    surrounding whitespace, and every non-empty piece that ``is_unknown``
    accepts is collected. Duplicates across equations are counted once.

    Args:
        row: Mapping-like object indexable by ``column`` (e.g. a DataFrame row).
        column: Key whose value is an iterable of equation strings.

    Returns:
        Number of distinct unknown tokens; 0 when there are no equations.
    """
    # Separator characters are listed one by one. An unescaped "+-/" inside a
    # character class is a range that also covers ',' and '.', so both are
    # written out here. Splitting on '.' breaks a decimal literal such as
    # "0.5" into digit runs, which are then discarded as numbers.
    separators = re.compile(r'[+,\-./*()=]+')

    distinct_unknowns = set()
    for equation in row[column]:
        tokens = (piece.strip() for piece in separators.split(equation))
        distinct_unknowns.update(
            token for token in tokens
            if is_not_empty(token) and is_unknown(token)
        )
    return len(distinct_unknowns)

def lEquations_count_number_of_unknowns(row):
    """Return the number of distinct unknowns in the row's "lEquations" entries."""
    return count_number_of_unknowns(row, column="lEquations")

def template_count_number_of_unknowns(row):
    """Return the number of distinct unknowns in the row's "Template" entries."""
    return count_number_of_unknowns(row, column="Template")

In [None]:
# Per-row features derived from the lEquations column. The per-operator
# counts are only needed to build the combined totals and are dropped below.
data["lEquations_num_of_equations"] = data["lEquations"].str.len()
for feature_name, row_counter in [
    ("lEquations_num_of_additions", lEquations_count_addition),
    ("lEquations_num_of_subtractions", lEquations_count_subtraction),
    ("lEquations_num_of_multiplications", lEquations_count_multiplication),
    ("lEquations_num_of_divisions", lEquations_count_division),
    ("lEquations_num_of_parens", lEquations_count_parens),
    ("lEquations_num_of_unknowns", lEquations_count_number_of_unknowns),
]:
    data[feature_name] = data.apply(row_counter, axis=1)

data["lEquations_num_of_subtractions_and_addition"] = (
    data["lEquations_num_of_additions"] + data["lEquations_num_of_subtractions"]
)
data["lEquations_num_of_division_and_multiplication"] = (
    data["lEquations_num_of_multiplications"] + data["lEquations_num_of_divisions"]
)

# Per-row features derived from the Template column
data["template_num_of_equations"] = data["Template"].str.len()
for feature_name, row_counter in [
    ("template_num_of_additions", template_count_addition),
    ("template_num_of_subtractions", template_count_subtraction),
    ("template_num_of_multiplications", template_count_multiplication),
    ("template_num_of_divisions", template_count_division),
    ("template_num_of_unknowns", template_count_number_of_unknowns),
]:
    data[feature_name] = data.apply(row_counter, axis=1)

data["template_num_of_subtractions_and_addition"] = (
    data["template_num_of_additions"] + data["template_num_of_subtractions"]
)
data["template_num_of_division_and_multiplication"] = (
    data["template_num_of_multiplications"] + data["template_num_of_divisions"]
)

# Remove the individual operator counts now that the combined totals exist
to_drop = [
    "template_num_of_additions",
    "template_num_of_subtractions",
    "template_num_of_multiplications",
    "template_num_of_divisions",
    "lEquations_num_of_additions",
    "lEquations_num_of_subtractions",
    "lEquations_num_of_multiplications",
    "lEquations_num_of_divisions",
]
data.drop(columns=to_drop, inplace=True)

# Display the resulting DataFrame
data

### 3. Save data to a JSON file

In [None]:
# Write the DataFrame, including the extracted feature columns, as JSON to OUTPUT_FILE_PATH
data.to_json(OUTPUT_FILE_PATH)