In [28]:
import pandas as pd
import numpy as np
from patsy import ModelDesc, dmatrices, dmatrix, demo_data
import re
import pprint
import json

In [67]:
# Load the semicolon-separated white-wine dataset.
df = pd.read_csv('winequality-white.csv', sep=';')

# Feature frame: everything except the target, with spaces in the column
# names turned into underscores so each name is a valid formula identifier.
x = (
  df
  .drop(columns=['quality'])
  .rename(columns=lambda name: name.replace(' ', '_'))
)
x.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [69]:
def get_formula_variables(formula: str):
  '''
  Returns a sorted list of every distinct variable name in the formula.

  A variable is a maximal run of ASCII letters, so both single-letter names
  ('a') and longer ones ('alpha') are returned as whole identifiers instead
  of being broken up into individual characters.

  Parameters:
    formula: formula template, e.g. '(a * b) / c'.

  Returns:
    Alphabetically sorted list of unique variable names; empty when the
    formula contains no letters.
  '''
  # set() removes repeated variables; sorted() makes the order deterministic
  # so variables can be paired with columns by position.
  return sorted(set(re.findall("[a-zA-Z]+", formula)))

def group_columns(formula: str, data: pd.DataFrame):
  '''
  Groups the dataframe's column names into lists whose length equals the
  number of variables in the formula.

  Every column is used in turn as the leading element. The columns after it
  are cut into consecutive, non-overlapping chunks of (variables - 1) names
  and each full chunk is appended to the leading column to form one group.
  A trailing chunk that is too short to fill a group is discarded.

  Parameters:
    formula: formula template whose variables determine the group size.
    data: dataframe whose column names are grouped.

  Returns:
    List of groups, each a list of column names. Empty when the formula has
    no variables. When the formula has a single variable every column forms
    its own one-element group.
  '''
  group_size = len(get_formula_variables(formula=formula))
  column_names = list(data.columns)

  if group_size == 0:
    return []
  if group_size == 1:
    # No companion columns are needed; without this guard the chunk size
    # below would be 0 and range() would raise on a zero step.
    return [[name] for name in column_names]

  # number of columns that accompany the leading one
  companions = group_size - 1
  total = len(column_names)

  result = []
  for lead_idx, lead in enumerate(column_names):
    # walk the remaining columns in chunks of `companions`
    for start in range(lead_idx + 1, total, companions):
      chunk = column_names[start:start + companions]
      # keep only complete groups
      if len(chunk) == companions:
        result.append([lead, *chunk])
  return result # grouped columns

# Formula template; each letter is a placeholder that will be bound to a column.
formula = '(a * b) / c'
grouped_columns = group_columns(formula=formula, data=x)

In [71]:
def get_formula_by_columns(formula: str, columns: list) -> dict:
  '''
  Pairs the formula's variables with the column names of every group.

  Variables (as returned by get_formula_variables) are matched to the
  entries of each group by position: first variable -> first column, etc.

  Parameters:
    formula: formula template whose variables are being bound.
    columns: list of column groups, each a list of column names holding at
      least as many entries as the formula has variables.

  Returns:
    Dict keyed by the group's position in `columns`; each value is a dict
    mapping variable name -> column name for that group.
  '''
  names = get_formula_variables(formula=formula)
  return {
    position: {name: group[slot] for slot, name in enumerate(names)}
    for position, group in enumerate(columns)
  }

# One variable -> column mapping per column group.
replaceable_result = get_formula_by_columns(formula=formula, columns=grouped_columns)

In [73]:
def parse_formula(formula: str, formula_columns: dict) -> list:
  '''
  Turns the formula template into one concrete formula per column group by
  replacing every variable with the column it is paired to.

  Replacement works on whole identifiers (maximal runs of ASCII letters), so
  a variable is only substituted when the complete name matches a key of the
  mapping. Identifiers without a mapping and all non-letter characters
  (operators, parentheses, digits, spaces) are copied through unchanged.

  Parameters:
    formula: formula template, e.g. '(a * b) / c'.
    formula_columns: dict whose values are dicts mapping
      variable name -> column name (the keys of the outer dict are ignored).

  Returns:
    List of formulas, one per mapping, in the iteration order of
    `formula_columns`.
  '''
  result = []

  for columns in formula_columns.values():
    # swap each identifier for its column, or keep it when it is not mapped
    result.append(
      re.sub(
        "[a-zA-Z]+",
        lambda match: columns.get(match.group(0), match.group(0)),
        formula,
      )
    )

  return result

# Concrete formulas with real column names substituted for the placeholders.
executable_formulas = parse_formula(formula=formula, formula_columns=replaceable_result)

In [83]:
def execute_formula(formula_by_columns: list, data: pd.DataFrame) -> pd.DataFrame:
  '''
  Evaluates every concrete formula with patsy's dmatrix and stores each
  result as a new column of a copy of the dataframe.

  Parameters:
    formula_by_columns: list of formula strings written in terms of the
      column names of `data`, e.g. '(pH * density) / alcohol'.
    data: dataframe the formulas are evaluated against; it is not modified.

  Returns:
    Copy of `data` with one extra column per formula, named after the
    formula string. An existing column with the same name is overwritten.

  Raises:
    ValueError: when a formula does not evaluate to exactly one value per
      row of `data`.
  '''
  new_df = data.copy()
  for formula_columns in formula_by_columns:
    # I(...) tells patsy to evaluate the expression as plain arithmetic and
    # the trailing "-1" removes the intercept column, leaving one column.
    formula = "I("+formula_columns+")-1"
    result = dmatrix(formula, data)
    # Flatten the (rows x 1) design matrix in a single vectorized step rather
    # than converting it cell by cell in Python.
    new_df[formula_columns] = np.asarray(result).ravel()

  return new_df

# Remove pandas' cap on the number of rows shown when a frame is displayed.
pd.set_option('display.max_rows', None)

# Append one derived column per concrete formula to a copy of the features.
new_data = execute_formula(executable_formulas, x)
new_data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,...,(residual_sugar * pH) / sulphates,(chlorides * free_sulfur_dioxide) / total_sulfur_dioxide,(chlorides * density) / pH,(chlorides * sulphates) / alcohol,(free_sulfur_dioxide * total_sulfur_dioxide) / density,(free_sulfur_dioxide * pH) / sulphates,(total_sulfur_dioxide * density) / pH,(total_sulfur_dioxide * sulphates) / alcohol,(density * pH) / sulphates,(pH * sulphates) / alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,...,138.0,0.011912,0.015015,0.002301,7642.357642,300.0,56.723333,8.693182,6.673333,0.153409
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,...,10.77551,0.005197,0.014759,0.002527,1859.15493,94.285714,39.76,6.808421,6.694286,0.170211
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,...,51.122727,0.015464,0.015262,0.002178,2924.329213,222.272727,29.608804,4.225743,7.372786,0.14202
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,...,67.7875,0.014656,0.018102,0.002343,8780.634793,374.825,58.050658,7.515152,7.93991,0.128889
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,...,67.7875,0.014656,0.018102,0.002343,8780.634793,374.825,58.050658,7.515152,7.93991,0.128889
