In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import random
from sklearn.model_selection import KFold
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from random import randrange

In [40]:
import pandas as pd
import numpy as np
from patsy import ModelDesc, dmatrices, dmatrix, demo_data
import re
import pprint
import json

# Names that may appear in a user formula, each mapped to the NumPy expression
# that replaces it. Insertion order is the order in which the names are visited.
# TODO: add more complex operations from numpy
COMPLEX_OPERATIONS = {
    name: f'np.{name}'
    for name in ('cos', 'tan', 'log', 'log10', 'log2', 'min', 'max', 'pi')
}

class bcolors:
    """ANSI escape sequences for styling text written to a terminal or notebook output.

    Wrap a message as ``f"{bcolors.WARNING}text{bcolors.ENDC}"``; ``ENDC`` resets
    the styling so that following output is not affected.
    """
    # Foreground colours (ANSI SGR codes 91-96).
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # SGR 0: reset all attributes.
    ENDC = '\033[0m'
    # Text attributes: SGR 1 (bold) and SGR 4 (underline).
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


def add_blank_spaces_to_formula(formula: str) -> str:
    """Insert a blank after every character that is not followed by a word character.

    A character gets a trailing space when it is the last character of the
    formula or when the next character does not match ``\\w``. In practice this
    puts a space after every run of word characters and after every operator
    or bracket, e.g. ``'a* b'`` becomes ``'a *  b '``.

    Args:
        formula: the formula text to pad.

    Returns:
        The padded formula; an empty string is returned unchanged.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    word_char = re.compile(r'\w')
    last_index = len(formula) - 1

    pieces = []
    for index, element in enumerate(formula):
        if index == last_index or not word_char.match(formula[index + 1]):
            pieces.append(element + ' ')
        else:
            pieces.append(element)
    # Single join instead of repeated string concatenation.
    return ''.join(pieces)

def matched_words(s, pat):
    """Return every run of word characters in ``s`` that contains a match of ``pat``.

    ``pat`` is spliced into the regular expression without escaping, so it is
    interpreted as a regex fragment rather than as literal text.
    """
    word_containing_pat = re.compile(r'(\w*%s\w*)' % pat)
    return word_containing_pat.findall(s)

def clean_formula(formula: str) -> str:
    """Strip every COMPLEX_OPERATIONS name that occurs in ``formula``.

    Which names to strip is decided against the formula as it was passed in;
    the removals themselves are then applied one after another, in the
    dictionary's iteration order.
    """
    present = [operation for operation in COMPLEX_OPERATIONS if operation in formula]

    stripped = formula
    for operation in present:
        stripped = stripped.replace(operation, "")
    return stripped

def get_formula_variables(formula: str):
  '''
  Returns the sorted list of distinct letters that remain in the formula once
  the operation names have been stripped by clean_formula.
  Each single letter counts as one variable.
  '''
  letter_runs = re.findall("[a-zA-Z]+", clean_formula(formula))
  # Joining the runs and building a set splits them into individual letters.
  distinct_letters = set("".join(letter_runs))
  return sorted(distinct_letters)

def group_columns(formula: str, data: pd.DataFrame):
  '''
  Groups the dataset's columns into lists whose length equals the number of
  variables in the formula.

  For every column (the "head"), the columns that follow it are cut into
  consecutive chunks of ``n_variables - 1`` columns; each full chunk, prefixed
  with the head, becomes one group. With two variables this yields every pair
  of columns in their original order.

  Args:
    formula: formula text; its variables are found with get_formula_variables.
    data: DataFrame whose column names are grouped.

  Returns:
    A list of column-name lists. Empty when the formula has no variables or
    when there are not enough columns to fill a single group.
  '''
  variable_count = len(get_formula_variables(formula=formula))
  column_names = list(data.columns)

  if variable_count == 0:
    # Nothing to map onto the columns.
    return []
  if variable_count == 1:
    # A chunk size of zero would make range() raise ValueError (step of 0);
    # a single-variable formula simply applies to every column on its own.
    return [[name] for name in column_names]

  # The head column fills one slot, so the remaining slots come from the chunk.
  chunk_size = variable_count - 1
  groups = []
  for head_index, head in enumerate(column_names):
    for start in range(head_index + 1, len(column_names), chunk_size):
      candidate = [head] + column_names[start:start + chunk_size]
      # The last chunk may be short; only complete groups are kept.
      if len(candidate) == variable_count:
        groups.append(candidate)
  return groups

def get_formula_by_columns(formula: str, columns: list) -> dict:
  '''
  Pairs the formula's variables with the columns of every column group.

  The result maps the position of each group in ``columns`` to a dict of
  ``{variable: column name}``, where the n-th variable (in sorted order) is
  paired with the n-th column of that group.
  '''
  variables = get_formula_variables(formula=formula)
  return {
    group_index: {
      variable: column_group[position]
      for position, variable in enumerate(variables)
    }
    for group_index, column_group in enumerate(columns)
  }

def _resolve_formula_token(token: str, variables_paired: dict) -> str:
  '''Returns the text that replaces one word token of the formula.'''
  if token in variables_paired:
    return variables_paired[token]
  # Operation names become their NumPy expression; anything else is kept as is.
  return COMPLEX_OPERATIONS.get(token, token)

def parse_formula(formula: str, formula_columns: dict) -> list:
  '''
  Turns the formula template into one concrete formula per column group.

  Every word token (a run of ``\\w`` characters) of the formula is looked at
  exactly once: a token that is a variable of the current group is replaced by
  its paired column name, a token that is a COMPLEX_OPERATIONS name is replaced
  by the matching NumPy expression, and any other token is left untouched.
  Blank spaces are removed from the result.

  Working on whole tokens in a single pass means that:
    * a name such as ``log10`` is rewritten once (``np.log10``) instead of being
      hit again by the shorter ``log`` entry;
    * letters or operation names that merely occur inside a longer token or
      inside an already substituted column name are never rewritten;
    * the formula does not need to be space-padded beforehand.

  Args:
    formula: formula template, e.g. ``'a * b'``.
    formula_columns: mapping of group id to ``{variable: column name}``.

  Returns:
    A list with one executable formula string per entry of ``formula_columns``,
    in the mapping's iteration order.
  '''
  token_pattern = re.compile(r'\w+')
  result = []

  for variables_paired in formula_columns.values():
    new_formula = token_pattern.sub(
      lambda match: _resolve_formula_token(match.group(0), variables_paired),
      formula,
    )
    result.append(new_formula.replace(" ", ""))

  return result

def execute_formula(formula_by_columns: list, data: pd.DataFrame) -> pd.DataFrame:
  '''
  Evaluates every concrete formula with patsy's dmatrix and stores each result
  as a new column of a copy of ``data``.

  The new column is named after the formula: formulas containing ``np.`` have
  that prefix removed, all other formulas have ``*`` replaced by ``x``.

  Errors are not swallowed: dmatrix is called with ``NA_action='raise'`` and
  any exception it raises propagates to the caller.

  Args:
    formula_by_columns: formula strings whose names refer to columns of ``data``.
    data: source DataFrame; it is not modified.

  Returns:
    A copy of ``data`` with one extra column per formula.
  '''
  new_df = data.copy()

  for formula_columns in formula_by_columns:
    # I(...) asks patsy to evaluate the expression as ordinary Python arithmetic
    # and "-1" removes the intercept column from the design matrix.
    design = dmatrix("I(" + formula_columns + ")-1", data, NA_action='raise')
    # Each row holds a single value; .item() unwraps it into a Python scalar.
    values = [row.item() for row in design]

    if "np." in formula_columns:
      column_name = formula_columns.replace('np.', '')
    else:
      column_name = formula_columns.replace('*', 'x')
    new_df[column_name] = values

  return new_df

def execute(formula_input: str, data: pd.DataFrame, class_column: str = None):
    """Generate new feature columns by applying a formula to groups of columns.

    The formula is lower-cased and space-padded, the columns of ``data`` are
    grouped to match the number of formula variables, the formula is
    instantiated for every group and each instance is evaluated into a new
    column.

    Args:
        formula_input: formula template whose letters stand for columns,
            e.g. ``'a*b'``.
        data: input DataFrame. It is never modified; renaming is done on a copy.
        class_column: optional name of the target column. When given, it is
            removed from the features before generation.

    Returns:
        The DataFrame with the generated columns when ``class_column`` is None,
        otherwise a ``(DataFrame, class column values)`` tuple.
    """
    class_column_values = None
    if class_column is not None:
        class_column_values = data[class_column]
        # drop() returns a new frame, so the caller's data stays intact.
        data = data.drop(class_column, axis=1)
    else:
        # Work on a copy so the rename below does not leak into the caller's frame.
        data = data.copy()

    # Column names are used as identifiers inside the formulas.
    data.columns = data.columns.str.replace(' ','_')

    formula = add_blank_spaces_to_formula(formula_input.lower())
    grouped_columns = group_columns(formula, data)
    replaceable_result = get_formula_by_columns(formula, grouped_columns)

    executable_formulas = parse_formula(formula, replaceable_result)
    new_data = execute_formula(executable_formulas, data)

    if class_column_values is None:
        return new_data
    return new_data, class_column_values

## Variable definitions

In [75]:
# Loop control: the iteration cell keeps running while this flag is True.
continue_iter = True
# Zero-based counter of the current iteration.
idx_iter = 0
# Input dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./datasets/user_knowledge.csv')
# Formula template applied to groups of columns; each letter stands for one column.
formula = 'a* b'
# Name of the target (class) column in df.
class_name = 'UNS'
# NOTE(review): not referenced in the cells visible here - confirm it is still needed.
class_df = {}

# Score obtained at each iteration, in iteration order.
iterations_result = []
# DataFrame of selected features kept at each iteration, aligned with iterations_result.
iterations_df = []

# Feature matrix; starts as an alias of df and is rebound by the iteration cell.
X = df
# Placeholder for the class column values; assigned by the first iteration.
y = {}

# Placeholder for the DataFrame restricted to the features selected in iteration n-1.
last_selected_X = {}

## Iteration

In [76]:
# Reset the loop state in this cell so that re-running it on its own starts a
# fresh search instead of appending to (or being skipped by) a previous run.
RANDOM_STATE = 1
random.seed(RANDOM_STATE)  # makes the number of selected features reproducible
continue_iter = True
idx_iter = 0
iterations_result = []
iterations_df = []

while continue_iter:
    print(f'ITERATION {idx_iter}')
    # The first iteration generates features from the original dataset; later
    # iterations start from the features selected in the previous one.
    if idx_iter == 0:
        X, y = execute(formula_input=formula, data=df, class_column=class_name)
    else:
        X = execute(formula_input=formula, data=last_selected_X)

    print(f'Feature Generation - Columns => {list(X.columns)}')

    number_of_columns = len(X.columns)

    # Split the dataset in 3 parts: 20% test, then 30% of the remaining 80%
    # for validation (24% of all rows), which leaves 56% for training.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.3, random_state=RANDOM_STATE)

    # Pick how many features to keep: random, but never more than 10.
    random_n = min(random.randint(2, number_of_columns), 10)
    plain_sfs = SFS(KNeighborsClassifier(),
          k_features=random_n,
          forward=True,
          floating=False, n_jobs=-1)
    # Forward feature selection on the training split.
    plain_sfs.fit(X_train, y_train)
    selected_features = X.columns[list(plain_sfs.k_feature_idx_)]
    print(f'Selected Features => {list(selected_features)}')

    # NOTE(review): the classifier is fitted on the validation split and the
    # stopping rule below is driven by the test score. Confirm this is the
    # intended protocol; the usual one fits on train, tunes/stops on validation
    # and touches the test split only once at the end.
    clf = KNeighborsClassifier()
    clf.fit(X_val[selected_features], y_val)

    # Score on the test split.
    current_score = accuracy_score(y_test, clf.predict(X_test[selected_features]))
    print(f'Got score = {current_score}')
    last_selected_X = X[selected_features]

    # Save both the score and the DataFrame with the selected features.
    iterations_result.append(current_score)
    iterations_df.append(last_selected_X)

    # Stop as soon as an iteration fails to improve on the previous score.
    continue_iter = not (idx_iter > 0 and current_score <= iterations_result[idx_iter-1])
    idx_iter += 1
    print("")


max_index = iterations_result.index(max(iterations_result))
print("")
print('**** RESULTS ****')
print(f'Iteration with the best result: {max_index}')
print(f'Features for the best result: {list(iterations_df[max_index].columns)}')
print(f'Score for the best result: {iterations_result[max_index]}')


ITERATION 0
Feature Generation - Columns => ['STG', 'SCG', 'STR', 'LPR', 'PEG', 'STGxSCG', 'STGxSTR', 'STGxLPR', 'STGxPEG', 'SCGxSTR', 'SCGxLPR', 'SCGxPEG', 'STRxLPR', 'STRxPEG', 'LPRxPEG']
Selected Features => ['STR', 'LPR', 'PEG', 'STGxSCG', 'STGxSTR', 'STGxLPR', 'STGxPEG', 'SCGxPEG', 'STRxPEG', 'LPRxPEG']
Got score = 0.8024691358024691

ITERATION 1
Feature Generation - Columns => ['STR', 'LPR', 'PEG', 'STGxSCG', 'STGxSTR', 'STGxLPR', 'STGxPEG', 'SCGxPEG', 'STRxPEG', 'LPRxPEG', 'STRxLPR', 'STRxSTGxSCG', 'STRxSTGxSTR', 'STRxSTGxLPR', 'STRxSTGxPEG', 'STRxSCGxPEG', 'STRxSTRxPEG', 'STRxLPRxPEG', 'LPRxSTGxSCG', 'LPRxSTGxSTR', 'LPRxSTGxLPR', 'LPRxSTGxPEG', 'LPRxSCGxPEG', 'LPRxSTRxPEG', 'LPRxLPRxPEG', 'PEGxSTGxSCG', 'PEGxSTGxSTR', 'PEGxSTGxLPR', 'PEGxSTGxPEG', 'PEGxSCGxPEG', 'PEGxSTRxPEG', 'PEGxLPRxPEG', 'STGxSCGxSTGxSTR', 'STGxSCGxSTGxLPR', 'STGxSCGxSTGxPEG', 'STGxSCGxSCGxPEG', 'STGxSCGxSTRxPEG', 'STGxSCGxLPRxPEG', 'STGxSTRxSTGxLPR', 'STGxSTRxSTGxPEG', 'STGxSTRxSCGxPEG', 'STGxSTRxSTRxPEG',