## 1. Execute imports

In [1]:
import pandas as pd
import numpy as np
from patsy import ModelDesc, dmatrices, dmatrix, demo_data
import re
import pprint
import json

## 2. Create complex operations dict

In [37]:
# TODO: add more complex operations from numpy
# Lookup table: name as written in a user formula -> numpy expression that is
# substituted for it when the formula is turned into executable code.
COMPLEX_OPERATIONS = dict(
    cos='np.cos',
    tan='np.tan',
    log='np.log',
    log10='np.log10',
    log2='np.log2',
    min='np.min',
    max='np.max',
    pi='np.pi',
)

class bcolors:
    """ANSI escape sequences for styling text printed to a terminal.

    Put one of the style constants in front of a message and ``ENDC`` after
    it, e.g. ``f"{bcolors.WARNING}message{bcolors.ENDC}"``, so the style does
    not leak into whatever is printed next.
    """
    HEADER = '\033[95m'     # bright magenta foreground
    OKBLUE = '\033[94m'     # bright blue foreground
    OKCYAN = '\033[96m'     # bright cyan foreground
    OKGREEN = '\033[92m'    # bright green foreground
    WARNING = '\033[93m'    # bright yellow foreground
    FAIL = '\033[91m'       # bright red foreground
    ENDC = '\033[0m'        # reset: turns every style/colour off again
    BOLD = '\033[1m'        # bold / increased intensity
    UNDERLINE = '\033[4m'   # underline

## 3. Execute functions

In [36]:
def add_blank_spaces_to_formula(formula: str) -> str:
    """Append a blank after every character that closes a token.

    A character gets a trailing blank when the character that follows it is
    not a word character (letter, digit or underscore), or when it is the
    last character of the formula. Characters followed by a word character
    are copied unchanged.

    Example: ``"a+b"`` becomes ``"a +b "``.

    Args:
        formula: formula text to pad.

    Returns:
        The padded formula; an empty input yields an empty string.
    """
    pieces = []
    last_index = len(formula) - 1
    for index, element in enumerate(formula):
        # Raw string: '\w' in a normal string literal is an invalid escape
        # sequence and triggers a SyntaxWarning on recent Python versions.
        if index == last_index or not re.match(r'\w', formula[index + 1]):
            pieces.append(element + ' ')
        else:
            pieces.append(element)
    # join once instead of growing a string with += inside the loop
    return ''.join(pieces)

In [71]:
def clean_formula(formula: str) -> str:
    """Strip every COMPLEX_OPERATIONS name out of the formula text.

    Names are removed as plain substrings, in the iteration order of
    COMPLEX_OPERATIONS. A name is only removed when it occurs in the
    formula as it was passed in; text that merely appears after an earlier
    removal is left alone.
    """
    stripped = formula
    for name in COMPLEX_OPERATIONS.keys():
        # membership is tested against the untouched input on purpose
        if name not in formula:
            continue
        stripped = stripped.replace(name, "")
    return stripped

def get_formula_variables(formula: str):
  '''
  Returns the sorted list of distinct letters left in the formula after
  clean_formula() has removed the known operation names.
  Every single letter counts as one variable.
  '''
  letter_runs = re.findall("[a-zA-Z]+", clean_formula(formula))
  distinct_letters = set()
  for run in letter_runs:
    # updating a set with a string adds its characters one by one
    distinct_letters.update(run)
  return sorted(distinct_letters)

def group_columns(formula: str, data: pd.DataFrame):
  '''
  Builds the groups of column names the formula will be applied to.

  Every group holds as many columns as the formula has variables (as reported
  by get_formula_variables). A group is made of one leading column followed by
  a chunk of the columns to its right; chunks are taken in consecutive,
  non-overlapping slices, and chunks that come out too short are dropped.
  For two variables this yields every pair of columns (left column first).

  Special cases:
    * no variable  -> empty list (there is nothing to map)
    * one variable -> one single-column group per column
  '''
  group_size = len(get_formula_variables(formula=formula))
  column_names = list(data.columns)

  if group_size == 0:
    return []
  if group_size == 1:
    # The chunk width below would be 0 here, and range() rejects a step of 0
    # with ValueError, so single-variable formulas are handled explicitly.
    return [[name] for name in column_names]

  # the leading column already fills one slot of the group
  chunk_width = group_size - 1
  grouped = []
  for lead_index, lead_name in enumerate(column_names):
    for start in range(lead_index + 1, len(column_names), chunk_width):
      candidate = [lead_name]
      candidate.extend(column_names[start:start + chunk_width])
      # the last slice may run out of columns; keep complete groups only
      if len(candidate) == group_size:
        grouped.append(candidate)
  return grouped

In [14]:
def get_formula_by_columns(formula: str, columns: list) -> dict:
  '''
  Pairs the formula's variables with the columns of every column group.

  The i-th variable (in the order returned by get_formula_variables) is paired
  with the i-th column of the group. The result is keyed by the position of
  the group inside `columns`:
      {0: {variable: column, ...}, 1: {...}, ...}
  '''
  variables = get_formula_variables(formula=formula)
  return {
    group_position: {
      name: group[slot] for slot, name in enumerate(variables)
    }
    for group_position, group in enumerate(columns)
  }

In [68]:
def parse_formula(formula: str, formula_columns: dict) -> list:
  '''
  Turns the generic formula into one concrete formula per column group.

  For every variable -> column mapping in `formula_columns` the formula is
  scanned once, word token by word token (a token is a run of letters, digits
  and underscores):
    * a token that is a variable of the mapping is replaced by its column,
    * a token that is a COMPLEX_OPERATIONS name is replaced by its numpy
      counterpart (e.g. cos -> np.cos),
    * any other token (numbers, unknown names) is kept as it is.
  Blanks are removed from the result afterwards.

  Whole tokens are replaced in a single pass, so text that has just been
  inserted is never scanned again. This keeps a variable letter from being
  replaced inside a function name (the "s" of "cos"), keeps inserted column
  names from being modified, and keeps "log10"/"log2" from being prefixed
  twice.

  Args:
    formula: formula text using variables and COMPLEX_OPERATIONS names.
    formula_columns: {group key: {variable: column name}}.

  Returns:
    List with one formula string per entry of `formula_columns`, in the
    same order.
  '''
  result = []

  for variables_paired in formula_columns.values():
    def swap_token(match):
      token = match.group(0)
      # variables take precedence over operation names
      if token in variables_paired:
        return variables_paired[token]
      return COMPLEX_OPERATIONS.get(token, token)

    # A function replacement also means column names are inserted verbatim
    # (no backslash/group-reference processing by re.sub).
    new_formula = re.sub(r'\w+', swap_token, formula)
    result.append(new_formula.replace(" ", ""))

  return result

In [44]:
def execute_formula(formula_by_columns: list, data: pd.DataFrame) -> pd.DataFrame:
  '''
  Evaluates every concrete formula with patsy's dmatrix and stores each result
  as a new column of a copy of `data`.

  Args:
    formula_by_columns: formula strings written in terms of `data` columns.
    data: frame the formulas are evaluated against; it is not modified.

  Returns:
    Copy of `data` with one extra column per formula. The column label is the
    formula text with every "np." removed. A later formula that produces the
    same label overwrites the earlier column.

  Raises:
    Whatever dmatrix raises is propagated to the caller. dmatrix is called
    with NA_action='raise', so missing values are expected to end up here
    rather than being dropped silently.
  '''
  new_df = data.copy()

  for formula_columns in formula_by_columns:
    # I(...) has patsy evaluate the expression as ordinary Python arithmetic
    # and the trailing "-1" drops the intercept column, leaving one column.
    design = dmatrix("I(" + formula_columns + ")-1", data, NA_action='raise')
    # each row of the design matrix is a one-element array
    values = [row.item() for row in design]
    # str.replace leaves the text untouched when it has no "np." in it
    new_df[formula_columns.replace('np.', '')] = values

  return new_df

In [70]:
def execute(formula_input: str, data: pd.DataFrame) -> pd.DataFrame:
    """Apply a formula to the column groups of `data`.

    Steps: lower-case and pad the formula, group the columns of `data`,
    pair the formula variables with each group, build one concrete formula
    per group and evaluate them all.

    Returns the frame produced by execute_formula.
    """
    spaced_formula = add_blank_spaces_to_formula(formula_input.lower())
    variable_to_column = get_formula_by_columns(
        spaced_formula,
        group_columns(spaced_formula, data),
    )
    concrete_formulas = parse_formula(spaced_formula, variable_to_column)
    return execute_formula(concrete_formulas, data)

# NOTE(review): `x` is only assigned further down, in the "4. Play around"
# section (x = df.drop(['quality'], axis=1)). On a fresh kernel that cell has
# to run before this call, otherwise it fails with NameError.
execute(formula_input="cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))", data=x).head(10)

## 4. Play around
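Call the function `execute` with a formula string and a pandas DataFrame. It returns a copy of the DataFrame with one extra column for every group of columns the formula was applied to.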

In [19]:
# Load the ';'-separated CSV file.
df = pd.read_csv('winequality-white.csv', sep=';')
# Working frame: all columns except 'quality', with blanks in the column
# names replaced by underscores (presumably so the names can be used inside
# formulas - confirm).
x = df.drop(columns=['quality'])
x.columns = [name.replace(' ', '_') for name in x.columns]

# execute(formula_input="(a*cos(b))/c", data=x).head(10)

In [34]:
# Scratch check. Only the value of the last expression (np.pi) is displayed;
# the result of the min() call on the line before it is computed and discarded.
x['sulphates'].min()
np.pi

3.141592653589793

In [40]:
# execute(formula_input="(pi*a*(b))/min(c)", data=x).head(10)


Grouped columns => [[['fixed_acidity', 'volatile_acidity'], ['fixed_acidity', 'citric_acid'], ['fixed_acidity', 'residual_sugar'], ['fixed_acidity', 'chlorides'], ['fixed_acidity', 'free_sulfur_dioxide'], ['fixed_acidity', 'total_sulfur_dioxide'], ['fixed_acidity', 'density'], ['fixed_acidity', 'pH'], ['fixed_acidity', 'sulphates'], ['fixed_acidity', 'alcohol'], ['volatile_acidity', 'citric_acid'], ['volatile_acidity', 'residual_sugar'], ['volatile_acidity', 'chlorides'], ['volatile_acidity', 'free_sulfur_dioxide'], ['volatile_acidity', 'total_sulfur_dioxide'], ['volatile_acidity', 'density'], ['volatile_acidity', 'pH'], ['volatile_acidity', 'sulphates'], ['volatile_acidity', 'alcohol'], ['citric_acid', 'residual_sugar'], ['citric_acid', 'chlorides'], ['citric_acid', 'free_sulfur_dioxide'], ['citric_acid', 'total_sulfur_dioxide'], ['citric_acid', 'density'], ['citric_acid', 'pH'], ['citric_acid', 'sulphates'], ['citric_acid', 'alcohol'], ['residual_sugar', 'chlorides'], ['residual_suga

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
5,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6
7,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
8,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
9,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0


In [42]:
# Small 4x4 integer frame for trying formulas on values that are easy to
# check by hand.
df2 = pd.DataFrame(
    data=np.array([
        [0, 2, 3, 6],
        [-1, -1, 6, 5],
        [7, 8, 9, 3],
        [7, 8, 9, 2],
    ]),
    columns=['c1', 'c2', 'c3', 'c4'],
)
df2

Unnamed: 0,c1,c2,c3,c4
0,0,2,3,6
1,-1,-1,6,5
2,7,8,9,3
3,7,8,9,2


In [72]:
# Apply the two-variable formula to df2 and show at most the first 10 rows.
# The stored output below has one result column per pair of df2 columns.
# NOTE(review): the "GOT FORMULA" lines in the stored output are not printed
# by any code visible in this notebook - presumably left over from an earlier
# version of the functions; re-run to refresh the output.
execute(formula_input="cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))", data=df2).head(10)

GOT FORMULA => [cos(2*pi*(c1-min(c1)+c2-min(c2)))/(max(c1)+max(c2)-min(c1)-min(c2))]
GOT FORMULA COMPLEX=> [np.cos(2*np.pi*(c1-np.min(c1)+c2-np.min(c2)))/(np.max(c1)+np.max(c2)-np.min(c1)-np.min(c2))]





GOT FORMULA => [cos(2*pi*(c1-min(c1)+c3-min(c3)))/(max(c1)+max(c3)-min(c1)-min(c3))]
GOT FORMULA COMPLEX=> [np.cos(2*np.pi*(c1-np.min(c1)+c3-np.min(c3)))/(np.max(c1)+np.max(c3)-np.min(c1)-np.min(c3))]





GOT FORMULA => [cos(2*pi*(c1-min(c1)+c4-min(c4)))/(max(c1)+max(c4)-min(c1)-min(c4))]
GOT FORMULA COMPLEX=> [np.cos(2*np.pi*(c1-np.min(c1)+c4-np.min(c4)))/(np.max(c1)+np.max(c4)-np.min(c1)-np.min(c4))]





GOT FORMULA => [cos(2*pi*(c2-min(c2)+c3-min(c3)))/(max(c2)+max(c3)-min(c2)-min(c3))]
GOT FORMULA COMPLEX=> [np.cos(2*np.pi*(c2-np.min(c2)+c3-np.min(c3)))/(np.max(c2)+np.max(c3)-np.min(c2)-np.min(c3))]





GOT FORMULA => [cos(2*pi*(c2-min(c2)+c4-min(c4)))/(max(c2)+max(c4)-min(c2)-min(c4))]
GOT FORMULA COMPLEX=> [np.cos(2*np.pi*(c2-np.min(c2)+c4-np.min(c4)))/(np.max(c2)+np.max(c4)

Unnamed: 0,c1,c2,c3,c4,cos(2*pi*(c1-min(c1)+c2-min(c2)))/(max(c1)+max(c2)-min(c1)-min(c2)),cos(2*pi*(c1-min(c1)+c3-min(c3)))/(max(c1)+max(c3)-min(c1)-min(c3)),cos(2*pi*(c1-min(c1)+c4-min(c4)))/(max(c1)+max(c4)-min(c1)-min(c4)),cos(2*pi*(c2-min(c2)+c3-min(c3)))/(max(c2)+max(c3)-min(c2)-min(c3)),cos(2*pi*(c2-min(c2)+c4-min(c4)))/(max(c2)+max(c4)-min(c2)-min(c4)),cos(2*pi*(c3-min(c3)+c4-min(c4)))/(max(c3)+max(c4)-min(c3)-min(c4))
0,0,2,3,6,0.058824,0.071429,0.083333,0.066667,0.076923,0.1
1,-1,-1,6,5,0.058824,0.071429,0.083333,0.066667,0.076923,0.1
2,7,8,9,3,0.058824,0.071429,0.083333,0.066667,0.076923,0.1
3,7,8,9,2,0.058824,0.071429,0.083333,0.066667,0.076923,0.1


In [9]:
# Scratch check: display the value of numpy's pi constant.
np.pi

3.141592653589793

In [None]:
dma