In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


#Models
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import confusion_matrix, classification_report

import random
from sklearn.model_selection import KFold
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from random import randrange

In [9]:
import pandas as pd
import numpy as np
from patsy import ModelDesc, dmatrices, dmatrix, demo_data
import re
import pprint
import json

# TODO: add more complex operations from numpy
COMPLEX_OPERATIONS = {
    'cos': 'np.cos',
    'tan': 'np.tan',
    'log': 'np.log',
    'log10': 'np.log10',
    'log2': 'np.log2',
    'min': 'np.min',
    'max': 'np.max',
    'pi': 'np.pi'
}

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def clean_column_symbols(column_, df, result):
    df[column_.replace('np.cos', '_COZ_')
       .replace(')', 'PAR_C')
       .replace('(', 'PAR_O')
       .replace('np.min', '_MINIM_')
       .replace('np.max', '_MAXIM_')
       .replace('np.pi', '_PII_')
       .replace('**', '_POW_')
       .replace('+', '_PLUS_')
       .replace('*', '_TIMES_')
       .replace('-', '_MINUS_')
      .replace('/', '_DIV_')] = result
    
def add_blank_spaces_to_formula(formula: str) -> str:
    new = ''
    for index, element in enumerate(formula):
        next_idx = index + 1
        if next_idx < len(formula):
            if not re.match('\w', formula[index+1]):
                new += element + ' '
            else:
                new += element
        else:
            new += element + ' '
    return new

def matched_words(s, pat):
    pat = r'(\w*%s\w*)' % pat       # Not thrilled about this line
    return re.findall(pat, s)

def clean_formula(formula: str) -> str:
    result = formula
    for operation in COMPLEX_OPERATIONS:
        if(operation in formula):
            result = result.replace(operation, "")
    return result

def get_formula_variables(formula: str):
  '''
  Returns a list of every variable (non repeated) from the formula
  '''
  cleaned_formula = clean_formula(formula)
  return sorted(list(set("".join(re.findall("[a-zA-Z]+", cleaned_formula)))))

def group_columns(formula: str, data: pd.DataFrame):
  # get number of variables inside formula
  # convert string to set that only holds unique elements
  characters = get_formula_variables(formula=formula)

  # get dataset number of columns
  columns = len(data.columns)
  columns_lst = list(data.columns)
  characters_len = len(characters)

  result = []
  
  # column by column
  for i in range(0, columns):  
    # current column + 1 and substract 1 from characters so we don't count current character
    for j in range(i+1, columns, characters_len-1):
      column_variables = [columns_lst[i]]
      column_variables.extend(columns_lst[j:j+(characters_len-1)])
      # compare numbers and group columns by number of variables inside the formula
      if(len(column_variables) == characters_len):
        result.append(column_variables)
  return result # grouped columns

def get_formula_by_columns(formula: str, columns: list) -> dict:
  '''
  Mapping every single formula's variable to a column.
  '''
  to_replace = {}

  # formula variables
  variables = get_formula_variables(formula=formula)
  # iterate over grouped columns
  for cidx, column_group in enumerate(columns):
    formula_grouped = {}
    # iterate over variables
    for idx, variable in enumerate(variables):
      # variable paired to column name
      formula_grouped[variable] = column_group[idx]
    # every column group represents a key
    to_replace[cidx] = formula_grouped
  return to_replace

def parse_formula(formula: str, formula_columns: dict) -> list:
  '''
  Parses, effectively, every grouped column to a real formula. 
  In simple words, replaces every formula variable for its paired column.
  '''
  result = []
  formula_variables = re.findall(r'\w+', formula)

  for variables_paired in formula_columns.values():
        new_formula = formula
        for variable in formula_variables:
            if variable in variables_paired:
                # we need to put a blank space after a single character, 
                # so we can identify it then with the regex
                replace_regex = f'{variable}(?:[^\w\*\\\+\(\)\-])'
                new_formula = re.sub(replace_regex, variables_paired[variable], new_formula)
#             elif variable in COMPLEX_OPERATIONS:
#                 print(f'Going to replace [{variable} for [{COMPLEX_OPERATIONS[variable]}]')
#                 new_formula = new_formula.replace(variable, COMPLEX_OPERATIONS[variable])
#                 print(f'GOING TO APPEND => [{new_formula}]')
        new_formula = new_formula.replace(" ", "")
        for key, value in COMPLEX_OPERATIONS.items():
            if key in new_formula:
                new_formula = new_formula.replace(key, value)
        
        result.append(new_formula)
  
  return result

def execute_formula(formula_by_columns: list, data: pd.DataFrame) -> pd.DataFrame:
  '''
  Take every real formula and executes it via patsy dmatrix.
  Saves every formula result inside a new dataframe's column.
  '''
  new_df = data.copy()
     
  for formula_columns in formula_by_columns:
    result_items = []
    add_data = True
#     try:
    formula = "I("+formula_columns+")-1"
    result = dmatrix(formula, data, NA_action='raise')
    for item in result:
        result_items.append(item.item())
#     except:
#         # Ignore Patsy error.
#         add_data = False
        
    if add_data:
        clean_column_symbols(formula_columns, new_df, result_items)
    else:
        print_error("Your data has some invalid values. Script will ignore them and their possible result")
        
  return new_df

def execute(formula_input: str, data: pd.DataFrame, class_column: str = None):

    class_column_values = None
    if class_column is not None:
        class_column_values = data[class_column]
        data=data.drop(class_column, axis=1)
    
    data.columns = data.columns.str.replace(' ','_')
    
    formula = add_blank_spaces_to_formula(formula_input.lower())
    grouped_columns = group_columns(formula, data)
    replaceable_result = get_formula_by_columns(formula, grouped_columns)
    
#     print(f'Got formula => {formula}')
    executable_formulas = parse_formula(formula, replaceable_result)
    new_data = execute_formula(executable_formulas, data)

    if class_column_values is None:
        return new_data
    else:
        return new_data, class_column_values
def print_error(error_message: str):
    print(f"{bcolors.WARNING}{error_message}{bcolors.ENDC}")

## Iteration

In [84]:
def get_model_for_selection(model: str):
    models = {
        'KNN': KNeighborsClassifier(),
        'LR': LogisticRegression(),
        'NB': GaussianNB(),
        'SVM': svm.SVC(max_iter=500000),
        'RF': RandomForestClassifier(random_state=0),
        'MLP': MLPClassifier(random_state=1)
    }
    model_ = models.get(model.upper())
#     print(f'Going to return model: [{type(model_)}]')
    return model_

In [121]:
def iterate_data(data: pd.DataFrame, 
                 class_name: str,
                 formula_array: list,
                 model: str, 
                 validation_data_size: float, 
                 test_data_size: float,
                 dataset_name: str,
                 forward_selection = True,
                 use_cross_val = False,
                 verbose_mode = False):
    
    if len(formula_array) == 0:
       print_error("Formula array can't be empty.")
       return
    
    # variable definitions
    continue_iter = True
    idx_iter = 0
    formula_len = len(formula_array)
    last_formula_idx = 0
    class_df = {}

    iterations_result = []
    iterations_df = []
    
    # this variables will handle the dataframe and classes
    X = {}
    y = {}

    # the dataframe with its column filtered just for those that has been selected in the iteration n-1
    last_selected_X = {}
    
    while continue_iter:
        
        formula = ""
        
        if idx_iter < formula_len:
            formula = formula_array[idx_iter]
            last_formula_idx += 1
        elif (last_formula_idx+1) < formula_len:
            formula = formula_array[last_formula_idx + 1]
            last_formula_idx += 1
        else:
            last_formula_idx = 0
            formula = formula_array[last_formula_idx]
        
        if verbose_mode:
            print(f'ITERATION {idx_iter} with formula [{formula}]')
        # for the first iteration, we need to create data from the original dataset
        if idx_iter == 0:
            new_df = df.sample(frac=1)
            X, y = execute(formula_input=formula, data=new_df, class_column=class_name)
        else:
            X = execute(formula_input=formula, data=last_selected_X)

        if verbose_mode:
            print(f'Feature Generation - Columns => {list(X.columns)}')

        number_of_columns = len(X.columns)

        #split the dataset in 3 parts: train, evaluation and test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_data_size, 
                                                            random_state=1)

        # split the dataset in train and validation
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_data_size,
                                                          random_state=1) # 0.25 x 0.8 = 0.2

        random_n = random.randint(2, number_of_columns)
        if random_n > 10:
            random_n = 10
        model_selected = get_model_for_selection(model)
        ## FEATURE SELECTION SECTION
        plain_sfs = SFS(model_selected,
                        scoring='accuracy',
                        cv=0,
                        k_features=random_n, 
                        forward=forward_selection, 
                        floating=(not forward_selection), n_jobs=-1)

        # train
        plain_sfs.fit(X_train, y_train)
        selected_features = X.columns[list(plain_sfs.k_feature_idx_)]
        if verbose_mode:
            print(f'Selected Features => {list(selected_features)}')

        clf = get_model_for_selection(model)
        # validation
        clf.fit(X_val[selected_features], y_val)

        current_score = 0
        # we get score using the test.
        if use_cross_val:
            current_score = cross_val_score(clf, X_test, y_test, cv=5).mean()
        else:
            current_score = accuracy_score(y_test, clf.predict(X_test[selected_features]))
        if verbose_mode:
            print(f'Got score = {current_score}')
            
        last_selected_X = X[selected_features]

        # save both score and df with selected features to a list
        iterations_result.append(current_score)
        iterations_df.append(last_selected_X)

        if idx_iter > 0 and current_score <= iterations_result[idx_iter-1]:
            continue_iter = False
        else:
            continue_iter = True
        idx_iter += 1
#         print("")
        
        max_index = iterations_result.index(max(iterations_result))

    selection_type = "Backward"
    if forward_selection:
        selection_type = "Forward"
        
    print(f"{dataset_name} - {selection_type} Selection - {model.upper()} - Formula {formula_array}")
    
    if verbose_mode:
        print('**** RESULTS ****')
        print(f'Iteration with the best result: {max_index}')
        print(f'Features for the best result: {clean_column_names(list(iterations_df[max_index].columns))}')
    print(f'Score for the best result: {round(iterations_result[max_index], 2)}')
    print("")
    
def clean_column_names(names: list):
    result = []
    for column in names:
        result.append(column.replace("_POW_", " ^ ")
                      .replace("_TIMES_", "x")
                      .replace("_PLUS_", "+")
                      .replace("_MINUS_", "-")
                      .replace("_COZ_", "cos")
                      .replace("PAR_C", ")")
                      .replace("PAR_O", "(")
                      .replace('_MINIM_', 'min')
                      .replace('_MAXIM_', 'max')
                      .replace('_PII_', 'pi')
                     .replace('_DIV_', '/'))
    return result

In [None]:
%%time
df = pd.read_csv('./datasets/Algerian_forest_fires.csv')

# banknote => Class
# speaker => language
# algerian => Classes
class_name = 'Classes'

ds_name = 'Algerian Forest'
# ds_name = 'Banknote Authentication'
# ds_name = 'Speaker Accent Recognition'
# ds_name = 'Pima Indian Diabetes'

## F1
# formulas = ["a * b"]
##F2
# formulas = ["cos(a+b)"]
##F3
# formulas = ["cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))"]
## F12
# formulas = ["a * b", "cos(a+b)"]
## F13
# formulas = ["a * b", "cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))"]
## F123
# formulas = ["a * b", "cos(a+b)", "cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))"]

selection_model = 'SVM'
forward = True

use_cross_validation = True

formulas_list = [["a * b"], ["cos(a+b)"], ["cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))"],
                 ["a * b", "cos(a+b)"], ["a * b", "cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))"],
                 ["a * b", "cos(a+b)", "cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))"]]

for form in formulas_list:
    iterate_data(data=df, class_name=class_name, formula_array=form,
             model=selection_model, validation_data_size=0.10, test_data_size=0.10, 
             forward_selection=forward, dataset_name=ds_name, use_cross_val=use_cross_validation,
             verbose_mode = False)

forward = False
for form_ in formulas_list:
    iterate_data(data=df, class_name=class_name, formula_array=form_,
             model=selection_model, validation_data_size=0.10, test_data_size=0.10, 
             forward_selection=forward, dataset_name=ds_name, use_cross_val=use_cross_validation,
             verbose_mode = False)


Algerian Forest - Forward Selection - SVM - Formula ['a * b']
Score for the best result: 0.88

Algerian Forest - Forward Selection - SVM - Formula ['cos(a+b)']
Score for the best result: 0.56

Algerian Forest - Forward Selection - SVM - Formula ['cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))']
Score for the best result: 0.64

Algerian Forest - Forward Selection - SVM - Formula ['a * b', 'cos(a+b)']
Score for the best result: 0.8

Algerian Forest - Forward Selection - SVM - Formula ['a * b', 'cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))']
Score for the best result: 0.64

Algerian Forest - Forward Selection - SVM - Formula ['a * b', 'cos(a+b)', 'cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))']
Score for the best result: 0.92

Algerian Forest - Backward Selection - SVM - Formula ['a * b']
Score for the best result: 0.8



# TODO
- When we use a formula with one character like (a*a) we have problem building the formula
- We need to take care about the naming for the features that are generated by our method because if we use a math notation, we have problems at the next iteration. For instance: we can't create a feature called A1*A2 because in the next iteration, patsy will separate that into two columns and not just one.