## 1. Execute imports

In [66]:
import pandas as pd
import numpy as np
from patsy import ModelDesc, dmatrices, dmatrix, demo_data
import re
import pprint
import json

## 2. Create complex operations dict

In [67]:
# TODO: add more complex operations from numpy
# Maps each name usable inside a formula to the numpy expression it is
# rewritten to. The listed order is kept because this dict is iterated by
# the formula-cleaning and formula-parsing functions.
COMPLEX_OPERATIONS = {
    name: f'np.{name}'
    for name in ('cos', 'tan', 'log', 'log10', 'log2', 'min', 'max', 'pi')
}

class bcolors:
    """ANSI escape sequences for styling text printed to a terminal.

    Prefix a message with one of the styles and append ``ENDC`` so the
    styling does not leak into later output, e.g.
    ``f"{bcolors.WARNING}message{bcolors.ENDC}"``.
    """
    # Foreground colours (bright variants, SGR codes 91-96).
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Resets every attribute set by the codes in this class.
    ENDC = '\033[0m'
    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

## 3. Execute functions

In [73]:
def add_blank_spaces_to_formula(formula: str) -> str:
    """Insert a space after every character that is not followed by a word character.

    A character is emitted as-is only when the next character exists and
    matches ``\\w``; otherwise (next character is punctuation/whitespace, or
    there is no next character) a single space is appended after it. The
    effect is that every run of word characters, and every operator or
    bracket not directly followed by a word character, ends with a space.

    Args:
        formula: Formula text, e.g. ``"cos(a+b)"``.

    Returns:
        The spaced formula, e.g. ``"cos (a +b ) "``. An empty input yields
        an empty string.
    """
    last_index = len(formula) - 1
    pieces = []
    for index, element in enumerate(formula):
        followed_by_word_char = (
            index < last_index and re.match(r'\w', formula[index + 1])
        )
        pieces.append(element if followed_by_word_char else element + ' ')
    return ''.join(pieces)

def matched_words(s, pat):
    """Return every word-character run in ``s`` that contains a match of ``pat``.

    ``pat`` is interpolated into the regular expression unescaped, so any
    regex syntax it contains is interpreted as such. Matching is by
    substring: searching "cos" also returns words such as "Glucose".
    """
    word_containing_pat = re.compile(r'(\w*%s\w*)' % pat)
    return word_containing_pat.findall(s)

In [4]:
def clean_formula(formula: str) -> str:
    """Return ``formula`` with every known operation name removed.

    Each key of ``COMPLEX_OPERATIONS`` that occurs in the original formula
    is deleted wherever it appears as a substring, in the dict's order.
    """
    # Membership is decided against the untouched input, not the partially
    # cleaned text, so removals never make a new name "appear".
    present = [name for name in COMPLEX_OPERATIONS if name in formula]
    cleaned = formula
    for name in present:
        cleaned = cleaned.replace(name, "")
    return cleaned

def get_formula_variables(formula: str):
    """Return the sorted, de-duplicated letters used as variables in ``formula``.

    Operation names are stripped with ``clean_formula`` first. Every ASCII
    letter that remains is reported on its own, i.e. variables are treated
    as single characters.
    """
    stripped = clean_formula(formula)
    letters = set()
    for run in re.findall("[a-zA-Z]+", stripped):
        # Updating a set with a string adds its individual characters.
        letters.update(run)
    return sorted(letters)

def group_columns(formula: str, data: pd.DataFrame):
    """Group the columns of ``data`` into lists sized to the formula's variable count.

    Every group starts with a lead column and is completed with a run of
    ``n_variables - 1`` consecutive columns taken, in non-overlapping steps,
    from the columns that follow the lead. For a two-variable formula this
    yields every pair ``[columns[i], columns[j]]`` with ``i < j``.

    Args:
        formula: Formula text; its variables come from ``get_formula_variables``.
        data: DataFrame whose column labels are grouped.

    Returns:
        A list of column-label lists, one label per formula variable. Empty
        when the formula has no variables; one single-column group per
        column when it has exactly one variable.
    """
    n_variables = len(get_formula_variables(formula=formula))
    column_names = list(data.columns)

    if n_variables == 0:
        return []
    if n_variables == 1:
        # A step of n_variables - 1 == 0 would make range() raise ValueError;
        # with a single variable each column is simply its own group.
        return [[name] for name in column_names]

    # Number of columns that accompany the lead column in each group.
    companions = n_variables - 1
    groups = []
    for lead_index, lead in enumerate(column_names):
        for start in range(lead_index + 1, len(column_names), companions):
            group = [lead, *column_names[start:start + companions]]
            # The last slice may run past the end and come up short; drop it.
            if len(group) == n_variables:
                groups.append(group)
    return groups

In [5]:
def get_formula_by_columns(formula: str, columns: list) -> dict:
    """Pair each formula variable with a column, once per column group.

    Args:
        formula: Formula text; its variables come from ``get_formula_variables``.
        columns: Column groups; the i-th variable (in sorted order) is
            paired with the i-th entry of each group.

    Returns:
        ``{group_index: {variable: column_name}}`` with one entry per group.
    """
    names = get_formula_variables(formula=formula)
    return {
        group_index: {name: group[position] for position, name in enumerate(names)}
        for group_index, group in enumerate(columns)
    }

In [80]:
def parse_formula(formula: str, formula_columns: dict) -> list:
    """Turn the symbolic formula into one concrete formula per column group.

    Each whole word-character token of ``formula`` is looked up exactly
    once: a variable is replaced by its paired column name, an operation
    name by its ``COMPLEX_OPERATIONS`` entry, and anything else is left
    untouched. Working on whole tokens in a single pass means that

    * a variable never matches the tail of a longer word (``s`` in ``cos``),
    * ``log10`` is not rewritten twice through its ``log`` prefix, and
    * inserted column names are never rescanned for operation names.

    Args:
        formula: Formula text, with or without spaces between tokens.
        formula_columns: ``{group_index: {variable: column_name}}`` as built
            by ``get_formula_by_columns``.

    Returns:
        One formula string per group, in the dict's order, with all spaces
        removed.
    """
    result = []
    for variables_paired in formula_columns.values():
        # Variables take precedence over operation names on a key clash.
        lookup = {**COMPLEX_OPERATIONS, **variables_paired}
        new_formula = re.sub(
            r'\w+',
            lambda match: lookup.get(match.group(0), match.group(0)),
            formula,
        )
        result.append(new_formula.replace(" ", ""))
    return result

In [7]:
def execute_formula(formula_by_columns: list, data: pd.DataFrame) -> pd.DataFrame:
    """Evaluate every concrete formula with patsy and store it as a new column.

    Args:
        formula_by_columns: Formula strings written in terms of the column
            names of ``data`` (as produced by ``parse_formula``).
        data: Source DataFrame; it is copied, not modified.

    Returns:
        A copy of ``data`` with one extra column per formula. The column is
        named after the formula with every ``np.`` removed.

    Any error raised by ``dmatrix`` propagates to the caller; it is called
    with ``NA_action='raise'``.
    """
    new_df = data.copy()

    for formula_columns in formula_by_columns:
        # Wrapped as "I(<expr>)-1" before being handed to patsy.
        formula = "I(" + formula_columns + ")-1"
        result = dmatrix(formula, data, NA_action='raise')
        # Each row of the result is expected to hold exactly one value.
        result_items = [item.item() for item in result]
        # replace() is a no-op when the formula contains no "np." prefix.
        new_df[formula_columns.replace('np.', '')] = result_items

    return new_df

In [60]:
def execute(formula_input: str, data: pd.DataFrame) -> pd.DataFrame:
    """Run the whole pipeline: expand a symbolic formula over ``data``'s columns.

    The formula is lower-cased and spaced, the columns are grouped to match
    its variables, one concrete formula is built per group, and each one is
    evaluated into a new column.

    Args:
        formula_input: Symbolic formula such as ``"cos(a+b)"``.
        data: DataFrame providing the columns.

    Returns:
        A copy of ``data`` extended with one column per generated formula.
    """
    formula = add_blank_spaces_to_formula(formula_input.lower())
    column_groups = group_columns(formula, data)
    variable_maps = get_formula_by_columns(formula, column_groups)
    print(f'Got formula => {formula}')
    concrete_formulas = parse_formula(formula, variable_maps)
    return execute_formula(concrete_formulas, data)

## 4. Stage 2

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [81]:
# Load the diabetes dataset from the local datasets folder.
data = pd.read_csv("./datasets/diabetes.csv")
# Total number of missing values over all columns. This is not the last
# expression of the cell, so its value is computed but not displayed.
data.isnull().sum().sum()
# Last expression of the cell: displays the DataFrame.
data

Unnamed: 0,Pregnancies,Glucoze,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [82]:
# Feature frame: every column except the target.
X = data.drop('Outcome', axis=1)
# Target: the 'Outcome' column.
y = data['Outcome']
# X.columns = X.columns.str.replace(' ','_')

### Formulas

In [83]:
# Symbolic formulas; the single-letter placeholders (a, b) are substituted
# with dataset column names by execute().
formula_a = "cos(a+b)"
formula_b = "cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))"
formula_c = "a*b"
# Prefix used for the CSV files written by the cells below.
name = 'diabetes'

In [84]:
# Expand formula_a over the feature columns and persist the result.
X_new = execute(formula_input=formula_a, data=X)
# NOTE(review): this filename lacks the "_knowledge" infix used for
# formulas b and c — confirm which naming is intended.
X_new.to_csv(f'./datasets/{name}_formula_a.csv', index=False)

Got formula => cos (a +b ) 
executable_formulas => ['np.cos(Pregnancies+Glucoze)', 'np.cos(Pregnancies+BloodPressure)', 'np.cos(Pregnancies+SkinThickness)', 'np.cos(Pregnancies+Insulin)', 'np.cos(Pregnancies+BMI)', 'np.cos(Pregnancies+DiabetesPedigreeFunction)', 'np.cos(Pregnancies+Age)', 'np.cos(Glucoze+BloodPressure)', 'np.cos(Glucoze+SkinThickness)', 'np.cos(Glucoze+Insulin)', 'np.cos(Glucoze+BMI)', 'np.cos(Glucoze+DiabetesPedigreeFunction)', 'np.cos(Glucoze+Age)', 'np.cos(BloodPressure+SkinThickness)', 'np.cos(BloodPressure+Insulin)', 'np.cos(BloodPressure+BMI)', 'np.cos(BloodPressure+DiabetesPedigreeFunction)', 'np.cos(BloodPressure+Age)', 'np.cos(SkinThickness+Insulin)', 'np.cos(SkinThickness+BMI)', 'np.cos(SkinThickness+DiabetesPedigreeFunction)', 'np.cos(SkinThickness+Age)', 'np.cos(Insulin+BMI)', 'np.cos(Insulin+DiabetesPedigreeFunction)', 'np.cos(Insulin+Age)', 'np.cos(BMI+DiabetesPedigreeFunction)', 'np.cos(BMI+Age)', 'np.cos(DiabetesPedigreeFunction+Age)']


In [85]:
# Expand formula_b over the feature columns and persist the result.
X_new_ = execute(formula_input=formula_b, data=X)
X_new_.to_csv(f'./datasets/{name}_knowledge_formula_b.csv', index=False)

Got formula => cos (2 *pi * (a -min (a ) +b -min (b ) ) ) / (max (a ) +max (b ) -min (a ) -min (b ) ) 
executable_formulas => ['np.cos(2*np.pi*(Pregnancies-np.min(Pregnancies)+Glucoze-np.min(Glucoze)))/(np.max(Pregnancies)+np.max(Glucoze)-np.min(Pregnancies)-np.min(Glucoze))', 'np.cos(2*np.pi*(Pregnancies-np.min(Pregnancies)+BloodPressure-np.min(BloodPressure)))/(np.max(Pregnancies)+np.max(BloodPressure)-np.min(Pregnancies)-np.min(BloodPressure))', 'np.cos(2*np.pi*(Pregnancies-np.min(Pregnancies)+SkinThickness-np.min(SkinThickness)))/(np.max(Pregnancies)+np.max(SkinThickness)-np.min(Pregnancies)-np.min(SkinThickness))', 'np.cos(2*np.pi*(Pregnancies-np.min(Pregnancies)+Insulin-np.min(Insulin)))/(np.max(Pregnancies)+np.max(Insulin)-np.min(Pregnancies)-np.min(Insulin))', 'np.cos(2*np.pi*(Pregnancies-np.min(Pregnancies)+BMI-np.min(BMI)))/(np.max(Pregnancies)+np.max(BMI)-np.min(Pregnancies)-np.min(BMI))', 'np.cos(2*np.pi*(Pregnancies-np.min(Pregnancies)+DiabetesPedigreeFunction-np.min(Diabe

In [86]:
# Expand formula_c over the feature columns and persist the result.
X_new_2 = execute(formula_input=formula_c, data=X)
X_new_2.to_csv(f'./datasets/{name}_knowledge_formula_c.csv', index=False)

Got formula => a *b 
executable_formulas => ['Pregnancies*Glucoze', 'Pregnancies*BloodPressure', 'Pregnancies*SkinThickness', 'Pregnancies*Insulin', 'Pregnancies*BMI', 'Pregnancies*DiabetesPedigreeFunction', 'Pregnancies*Age', 'Glucoze*BloodPressure', 'Glucoze*SkinThickness', 'Glucoze*Insulin', 'Glucoze*BMI', 'Glucoze*DiabetesPedigreeFunction', 'Glucoze*Age', 'BloodPressure*SkinThickness', 'BloodPressure*Insulin', 'BloodPressure*BMI', 'BloodPressure*DiabetesPedigreeFunction', 'BloodPressure*Age', 'SkinThickness*Insulin', 'SkinThickness*BMI', 'SkinThickness*DiabetesPedigreeFunction', 'SkinThickness*Age', 'Insulin*BMI', 'Insulin*DiabetesPedigreeFunction', 'Insulin*Age', 'BMI*DiabetesPedigreeFunction', 'BMI*Age', 'DiabetesPedigreeFunction*Age']


# Stage 2

In [35]:
import warnings
# Suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
# Hold out 20% of the rows for testing, with a fixed seed.
# NOTE(review): X_full is not defined in any cell visible here — confirm
# which feature frame it is meant to be before re-running.
X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size = 0.2, random_state = 42)

## KNN

In [36]:
# k-nearest-neighbours classifier with default settings, fitted on the
# training split and scored on the held-out split.
reg_knn = KNeighborsClassifier()
reg_knn.fit(X_train, y_train)
y_pred = reg_knn.predict(X_test)

print(metrics.classification_report(y_test, y_pred))
print("####### CROSS VAL SCORE #######")
# Mean score of 10-fold cross-validation over the full feature frame.
print(cross_val_score(reg_knn, X_full, y, cv=10).mean())

              precision    recall  f1-score   support

           1       0.67      0.80      0.73         5
           2       0.86      0.75      0.80         8
           3       0.00      0.00      0.00         0
           5       1.00      1.00      1.00         2
           6       0.50      0.33      0.40         3
           7       0.75      0.75      0.75         4

    accuracy                           0.73        22
   macro avg       0.63      0.61      0.61        22
weighted avg       0.76      0.73      0.74        22

####### CROSS VAL SCORE #######
0.6363636363636365


## Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
# Logistic regression with default settings, fitted on the training split
# and scored on the held-out split.
reg_log = LogisticRegression()
reg_log.fit(X_train, y_train)
y_pred = reg_log.predict(X_test)

print(metrics.classification_report(y_test, y_pred))
print("####### CROSS VAL SCORE #######")
# Mean score of 10-fold cross-validation over the full feature frame.
print(cross_val_score(reg_log, X_full, y, cv=10).mean())

              precision    recall  f1-score   support

           1       0.80      0.80      0.80         5
           2       0.78      0.88      0.82         8
           5       1.00      1.00      1.00         2
           6       1.00      0.67      0.80         3
           7       0.75      0.75      0.75         4

    accuracy                           0.82        22
   macro avg       0.87      0.82      0.83        22
weighted avg       0.83      0.82      0.82        22

####### CROSS VAL SCORE #######
0.6021645021645022


## SVM

In [28]:
from sklearn.svm import SVC

In [29]:
# Support vector classifier with a linear kernel, fitted on the training
# split and scored on the held-out split.
reg_svc = SVC(kernel='linear')
reg_svc.fit(X_train, y_train)
y_pred = reg_svc.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("####### CROSS VAL SCORE #######")
# Mean score of 10-fold cross-validation over the full feature frame.
print(cross_val_score(reg_svc, X_full, y, cv=10).mean())

              precision    recall  f1-score   support

           1       0.60      0.82      0.69        11
           2       0.92      0.79      0.85        14
           3       0.00      0.00      0.00         3
           5       0.80      1.00      0.89         4
           6       1.00      0.67      0.80         3
           7       1.00      1.00      1.00         8

    accuracy                           0.79        43
   macro avg       0.72      0.71      0.70        43
weighted avg       0.78      0.79      0.78        43

####### CROSS VAL SCORE #######
0.6649350649350649


## Naive Bayes

In [30]:
from sklearn.naive_bayes import GaussianNB

In [31]:
# Gaussian naive Bayes classifier, fitted on the training split and scored
# on the held-out split.
# NOTE(review): the variable is named reg_rf although it holds a GaussianNB
# model — confirm whether a rename is wanted.
reg_rf = GaussianNB()
reg_rf.fit(X_train, y_train)
y_pred = reg_rf.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("####### CROSS VAL SCORE #######")
# Mean score of 10-fold cross-validation over the full feature frame.
print(cross_val_score(reg_rf, X_full, y, cv=10).mean())

              precision    recall  f1-score   support

           1       0.47      0.64      0.54        11
           2       0.80      0.29      0.42        14
           3       0.50      1.00      0.67         3
           5       0.80      1.00      0.89         4
           6       1.00      0.67      0.80         3
           7       0.80      1.00      0.89         8

    accuracy                           0.65        43
   macro avg       0.73      0.76      0.70        43
weighted avg       0.71      0.65      0.63        43

####### CROSS VAL SCORE #######
0.42034632034632036


In [72]:
import re



# Ad-hoc check of matched_words: "Glucose" is returned as well because it
# contains the substring "cos".
matched_words("cos(Pregnancies+Glucose)", "cos")

['cos', 'Glucose']