## 1. Execute imports

In [39]:
import pandas as pd
import numpy as np
from patsy import ModelDesc, dmatrices, dmatrix, demo_data
import re
import pprint
import json

## 2. Create complex operations dict

In [40]:
# Maps every operation name accepted inside a user formula to the numpy
# expression that takes its place when the formula is made executable.
# TODO: add more complex operations from numpy
# NOTE(review): 'sin' is not listed even though 'cos' and 'tan' are.
COMPLEX_OPERATIONS = {
    name: f'np.{name}'
    for name in ('cos', 'tan', 'log', 'log10', 'log2', 'min', 'max', 'pi')
}

class bcolors:
    '''ANSI escape sequences for colouring and styling terminal output.

    Wrap a message as ``f"{bcolors.WARNING}text{bcolors.ENDC}"``; ENDC resets
    the terminal to its default style.
    '''
    # Foreground colours.
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Reset all attributes.
    ENDC = '\033[0m'
    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

## 3. Execute functions

In [41]:
def add_blank_spaces_to_formula(formula: str) -> str:
    '''
    Insert a blank after every character that ends a token.

    A blank is appended to a character when the character that follows it is
    not a word character (letter, digit or underscore), and also to the last
    character of the formula. Existing characters are never removed.

    Example: "cos(a+b)" -> "cos (a +b ) "

    Args:
        formula: formula text to space out.

    Returns:
        The spaced formula; an empty string for an empty input.
    '''
    last_index = len(formula) - 1
    pieces = []
    for index, element in enumerate(formula):
        # Raw string: '\w' in a normal literal is an invalid escape sequence.
        if index == last_index or not re.match(r'\w', formula[index + 1]):
            pieces.append(element + ' ')
        else:
            pieces.append(element)
    return ''.join(pieces)

In [42]:
def clean_formula(formula: str) -> str:
    '''
    Strip every known operation name out of the formula.

    Each key of COMPLEX_OPERATIONS that occurs in the given formula is removed
    as a plain substring, in the dict's iteration order. Whatever remains
    (variables, digits, operators, brackets) is returned untouched.
    '''
    stripped = formula
    for name in COMPLEX_OPERATIONS:
        # Presence is tested against the untouched input, not the running result.
        if name not in formula:
            continue
        stripped = stripped.replace(name, "")
    return stripped

def get_formula_variables(formula: str):
    '''
    Return the formula's variables as a sorted list of unique single letters.

    Operation names are removed first (see clean_formula); every ASCII letter
    that is left counts as one variable, so a run such as "ab" yields the two
    variables "a" and "b".
    '''
    letters = set()
    for letter_run in re.findall("[a-zA-Z]+", clean_formula(formula)):
        # Updating a set with a string adds its characters one by one.
        letters.update(letter_run)
    return sorted(letters)

def group_columns(formula: str, data: pd.DataFrame):
    '''
    Build the groups of column names the formula is going to be applied to.

    Every group holds as many columns as the formula has variables. A group is
    made of one leading column followed by a run of consecutive columns located
    after it; the runs start right after the leading column and advance in
    steps of their own length. Runs that are too short to fill a group are
    skipped. For a two-variable formula this yields every pair (i, j), i < j.

    Args:
        formula: formula whose variables determine the group size.
        data: dataframe providing the column names.

    Returns:
        A list of groups, each one a list of column names. A formula without
        variables gives an empty list; a one-variable formula gives one
        single-column group per column.
    '''
    variable_count = len(get_formula_variables(formula=formula))
    column_names = list(data.columns)

    if variable_count == 0:
        return []
    if variable_count == 1:
        # Without this branch the run length below would be 0 and range()
        # would raise "arg 3 must not be zero".
        return [[name] for name in column_names]

    # The leading column takes one slot; the run fills the remaining ones.
    run_length = variable_count - 1
    total_columns = len(column_names)

    result = []
    for lead_index, lead_name in enumerate(column_names):
        for run_start in range(lead_index + 1, total_columns, run_length):
            group = [lead_name, *column_names[run_start:run_start + run_length]]
            # Keep complete groups only.
            if len(group) == variable_count:
                result.append(group)
    return result

In [43]:
def get_formula_by_columns(formula: str, columns: list) -> dict:
    '''
    Pair the formula's variables with the columns of every column group.

    The i-th variable (in the sorted order given by get_formula_variables) is
    paired with the i-th column of the group.

    Returns a dict keyed by the position of the group inside ``columns``; each
    value is a dict of the form {variable: column name}.
    '''
    variables = get_formula_variables(formula=formula)

    pairing_by_group = {}
    for group_index, column_group in enumerate(columns):
        pairing_by_group[group_index] = {
            variable: column_group[position]
            for position, variable in enumerate(variables)
        }
    return pairing_by_group

In [44]:
def parse_formula(formula: str, formula_columns: dict) -> list:
    '''
    Turn the formula into one executable expression per column group.

    The formula is rewritten in a single pass over its tokens (runs of word
    characters): a token that is a variable of the group becomes the paired
    column name, a token that is a key of COMPLEX_OPERATIONS becomes its numpy
    expression, and any other token is kept. Blanks of the formula are dropped.

    Working on whole tokens in one pass means that:
      * overlapping operation names are handled correctly ('log10' becomes
        'np.log10', never 'np.np.log10');
      * a variable letter is never replaced inside an operation name
        (variable 'n' leaves 'tan' alone);
      * text that was just inserted (column names) is not rewritten again.

    Args:
        formula: formula text, with or without separating blanks.
        formula_columns: {group key: {variable: column name}} as built by
            get_formula_by_columns.

    Returns:
        A list of expression strings, one per entry of ``formula_columns`` and
        in the same order.
    '''
    result = []

    for variables_paired in formula_columns.values():
        # A blank maps to nothing; variables take precedence over operation
        # names if a name ever appears in both.
        replacements = {" ": "", **COMPLEX_OPERATIONS, **variables_paired}
        new_formula = re.sub(
            r'\w+| ',
            # A function replacement keeps column names literal (no backslash
            # or group-reference processing).
            lambda match: replacements.get(match.group(0), match.group(0)),
            formula,
        )
        result.append(new_formula)

    return result

In [45]:
def execute_formula(formula_by_columns: list, data: pd.DataFrame) -> pd.DataFrame:
    '''
    Evaluate every expression with patsy and store each result as a new column.

    Args:
        formula_by_columns: executable expressions, e.g. "np.cos(RI+Na)".
        data: dataframe whose columns the expressions refer to; it is not
            modified.

    Returns:
        A copy of ``data`` with one extra column per expression. The column is
        named after the expression with every "np." removed.

    Any error raised by patsy is propagated to the caller; missing values are
    reported as errors because dmatrix is called with NA_action='raise'.
    '''
    new_df = data.copy()

    for formula_columns in formula_by_columns:
        # I(...) has patsy evaluate the expression as ordinary arithmetic, and
        # "-1" leaves the intercept column out of the design matrix.
        formula = "I(" + formula_columns + ")-1"
        result = dmatrix(formula, data, NA_action='raise')

        # Each row of the design matrix holds a single value.
        result_items = [row.item() for row in result]

        new_df[formula_columns.replace('np.', '')] = result_items

    return new_df

In [46]:
def execute(formula_input: str, data: pd.DataFrame) -> pd.DataFrame:
    '''
    Apply a formula to the column groups of ``data``.

    The formula is lower-cased and spaced out, its variables are paired with
    every column group, the resulting expressions are evaluated and their
    values are returned as extra columns on a copy of ``data``. The spaced
    formula is printed along the way.
    '''
    spaced_formula = add_blank_spaces_to_formula(formula_input.lower())
    variables_by_group = get_formula_by_columns(
        spaced_formula,
        group_columns(spaced_formula, data),
    )
    print(f'Got formula => {spaced_formula}')
    return execute_formula(parse_formula(spaced_formula, variables_by_group), data)

## 4. Stage 2

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [10]:
# Load the glass dataset from the working directory.
data = pd.read_csv('glass.csv', delimiter=',')
# NOTE(review): this total of missing values is computed but discarded; a
# notebook cell only displays its last expression. Print it or move it to its
# own cell if the check is meant to be seen.
data.isnull().sum().sum()
# Display the loaded frame.
data

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


In [11]:
# Split the frame: every column except 'Type' is a feature, 'Type' is the target.
X = data.drop('Type', axis=1)
y = data['Type']

### Formulas

In [12]:
# Feature formulas. Each single letter is a placeholder that execute() binds to
# a dataframe column; names listed in COMPLEX_OPERATIONS become numpy calls.
formula_a = "cos(a+b)"
# NOTE(review): as written the division is applied to the result of cos(...).
# If the intent was to normalise the argument of the cosine, the closing
# bracket of cos( has to move to the end - confirm.
formula_b = "cos(2*pi*(a-min(a)+b-min(b)))/(max(a)+max(b)-min(a)-min(b))"
formula_c = "a*b"

In [13]:
# Apply formula_a to every column group of X: the result is a copy of X with
# one extra column per group.
X_new = execute(formula_input=formula_a, data=X)
X_new

Got formula => cos (a +b ) 


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,cos(RI+Na),...,cos(Si+K),cos(Si+Ca),cos(Si+Ba),cos(Si+Fe),cos(K+Ca),cos(K+Ba),cos(K+Fe),cos(Ca+Ba),cos(Ca+Fe),cos(Ba+Fe)
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,-0.854113,...,-0.914457,0.407201,-0.888546,-0.888546,-0.816902,0.998201,0.998201,-0.780846,-0.780846,1.000000
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,-0.955232,...,-0.578939,0.434414,-0.890037,-0.890037,-0.440377,0.886995,0.886995,0.023979,0.023979,1.000000
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,-0.788898,...,-0.432648,0.612633,-0.742924,-0.742924,-0.310785,0.924909,0.924909,0.073914,0.073914,1.000000
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,-0.556771,...,-0.603136,0.658924,-0.938212,-0.938212,-0.805203,0.841901,0.841901,-0.357900,-0.357900,1.000000
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,-0.605388,...,-0.196147,0.862094,-0.679754,-0.679754,-0.693271,0.852525,0.852525,-0.214342,-0.214342,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,-0.998662,...,-0.907556,0.994110,-0.156778,-0.938212,-0.986455,0.417595,0.996802,-0.685707,-0.970191,0.488872
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,-0.745916,...,-0.694286,0.975589,0.732899,-0.694286,-0.519289,-0.019202,1.000000,-0.844470,-0.519289,-0.019202
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,-0.985127,...,-0.396249,0.984095,0.943346,-0.396249,-0.553048,-0.069148,1.000000,-0.792913,-0.553048,-0.069148
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,-0.982278,...,-0.215718,0.917682,0.976284,-0.215718,-0.585923,0.000796,1.000000,-0.810833,-0.585923,0.000796


In [15]:
# Same as above for formula_b: a copy of X plus one extra column per group.
X_new_ = execute(formula_input=formula_b, data=X)
X_new_

Got formula => cos (2 *pi * (a -min (a ) +b -min (b ) ) ) / (max (a ) +max (b ) -min (a ) -min (b ) ) 


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,cos(2*pi*(RI-min(RI)+Na-min(Na)))/(max(RI)+max(Na)-min(RI)-min(Na)),...,cos(2*pi*(Si-min(Si)+K-min(K)))/(max(Si)+max(K)-min(Si)-min(K)),cos(2*pi*(Si-min(Si)+Ca-min(Ca)))/(max(Si)+max(Ca)-min(Si)-min(Ca)),cos(2*pi*(Si-min(Si)+Ba-min(Ba)))/(max(Si)+max(Ba)-min(Si)-min(Ba)),cos(2*pi*(Si-min(Si)+Fe-min(Fe)))/(max(Si)+max(Fe)-min(Si)-min(Fe)),cos(2*pi*(K-min(K)+Ca-min(Ca)))/(max(K)+max(Ca)-min(K)-min(Ca)),cos(2*pi*(K-min(K)+Ba-min(Ba)))/(max(K)+max(Ba)-min(K)-min(Ba)),cos(2*pi*(K-min(K)+Fe-min(Fe)))/(max(K)+max(Fe)-min(K)-min(Fe)),cos(2*pi*(Ca-min(Ca)+Ba-min(Ba)))/(max(Ca)+max(Ba)-min(Ca)-min(Ba)),cos(2*pi*(Ca-min(Ca)+Fe-min(Fe)))/(max(Ca)+max(Fe)-min(Ca)-min(Fe)),cos(2*pi*(Ba-min(Ba)+Fe-min(Fe)))/(max(Ba)+max(Fe)-min(Ba)-min(Fe))
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,0.131262,...,8.317420e-02,-0.015201,1.122614e-01,1.607671e-01,-0.042956,0.099335,0.138360,-0.030610,-3.777988e-02,0.273224
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,0.075100,...,-6.850271e-02,-0.026026,1.001493e-01,1.434217e-01,0.042956,-0.105995,-0.147636,-0.058161,-7.178500e-02,0.273224
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,0.050791,...,-7.661533e-02,-0.060042,4.866049e-02,6.968565e-02,-0.003700,-0.082320,-0.114660,-0.042256,-5.215486e-02,0.273224
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,-0.149325,...,-5.796335e-02,-0.051609,3.531623e-02,5.057561e-02,-0.037562,-0.096670,-0.134647,0.017878,2.206654e-02,0.273224
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,-0.143574,...,3.605244e-02,0.051609,-1.432380e-02,-2.051280e-02,0.021693,-0.101609,-0.141526,-0.045825,-5.655936e-02,0.273224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,-0.129031,...,6.172469e-02,-0.058133,7.284846e-02,5.057561e-02,0.028389,0.068101,0.130403,0.026465,-2.391164e-16,0.254037
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,0.050143,...,-8.300901e-17,0.011454,6.123735e-02,-1.604479e-16,0.057884,-0.090206,0.148810,-0.066842,8.715947e-02,-0.230691
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,-0.095888,...,-6.524244e-02,-0.044558,7.559942e-16,-1.261069e-01,0.058811,-0.068101,0.148810,-0.042256,8.855605e-02,-0.174160
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,-0.083955,...,2.616571e-02,0.035928,-7.823395e-02,5.057561e-02,0.056043,-0.096670,0.148810,-0.052406,8.438833e-02,-0.247221


In [16]:
# Same as above for formula_c: a copy of X plus one extra column per group.
X_new_2 = execute(formula_input=formula_c, data=X)
X_new_2

Got formula => a *b 


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,RI*Na,...,Si*K,Si*Ca,Si*Ba,Si*Fe,K*Ca,K*Ba,K*Fe,Ca*Ba,Ca*Fe,Ba*Fe
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,20.746576,...,4.3068,628.0750,0.0000,0.0,0.5250,0.0000,0.0,0.0000,0.0,0.0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,21.079603,...,34.9104,569.4759,0.0000,0.0,3.7584,0.0000,0.0,0.0000,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,20.513915,...,28.4661,567.8622,0.0000,0.0,3.0342,0.0000,0.0,0.0000,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,20.048289,...,41.3877,596.8542,0.0000,0.0,4.6854,0.0000,0.0,0.0000,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,20.136163,...,40.1940,589.7556,0.0000,0.0,4.4385,0.0000,0.0,0.0000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,21.439492,...,5.8088,666.5598,76.9666,0.0,0.7344,0.0848,0.0,9.7308,0.0,0.0
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,22.631402,...,0.0000,613.7040,116.1654,0.0,0.0000,0.0000,0.0,13.3560,0.0,0.0
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,21.836534,...,0.0000,619.6648,120.4088,0.0,0.0000,0.0000,0.0,13.8416,0.0,0.0
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,21.807414,...,0.0000,624.2128,115.5677,0.0,0.0000,0.0000,0.0,13.3136,0.0,0.0


In [18]:
# Put both feature sets side by side. X's own columns are present in each
# input, so keep only the first occurrence of every column name.
# Deduplicating by name (rather than by values through a double transpose)
# never discards two distinct features that happen to hold equal values and
# leaves the column dtypes untouched.
X_full = pd.concat([X_new, X_new_], axis=1)
X_full = X_full.loc[:, ~X_full.columns.duplicated()]
X_full

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,cos(RI+Na),...,cos(2*pi*(Si-min(Si)+K-min(K)))/(max(Si)+max(K)-min(Si)-min(K)),cos(2*pi*(Si-min(Si)+Ca-min(Ca)))/(max(Si)+max(Ca)-min(Si)-min(Ca)),cos(2*pi*(Si-min(Si)+Ba-min(Ba)))/(max(Si)+max(Ba)-min(Si)-min(Ba)),cos(2*pi*(Si-min(Si)+Fe-min(Fe)))/(max(Si)+max(Fe)-min(Si)-min(Fe)),cos(2*pi*(K-min(K)+Ca-min(Ca)))/(max(K)+max(Ca)-min(K)-min(Ca)),cos(2*pi*(K-min(K)+Ba-min(Ba)))/(max(K)+max(Ba)-min(K)-min(Ba)),cos(2*pi*(K-min(K)+Fe-min(Fe)))/(max(K)+max(Fe)-min(K)-min(Fe)),cos(2*pi*(Ca-min(Ca)+Ba-min(Ba)))/(max(Ca)+max(Ba)-min(Ca)-min(Ba)),cos(2*pi*(Ca-min(Ca)+Fe-min(Fe)))/(max(Ca)+max(Fe)-min(Ca)-min(Fe)),cos(2*pi*(Ba-min(Ba)+Fe-min(Fe)))/(max(Ba)+max(Fe)-min(Ba)-min(Fe))
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,-0.854113,...,8.317420e-02,-0.015201,1.122614e-01,1.607671e-01,-0.042956,0.099335,0.138360,-0.030610,-3.777988e-02,0.273224
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,-0.955232,...,-6.850271e-02,-0.026026,1.001493e-01,1.434217e-01,0.042956,-0.105995,-0.147636,-0.058161,-7.178500e-02,0.273224
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,-0.788898,...,-7.661533e-02,-0.060042,4.866049e-02,6.968565e-02,-0.003700,-0.082320,-0.114660,-0.042256,-5.215486e-02,0.273224
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,-0.556771,...,-5.796335e-02,-0.051609,3.531623e-02,5.057561e-02,-0.037562,-0.096670,-0.134647,0.017878,2.206654e-02,0.273224
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,-0.605388,...,3.605244e-02,0.051609,-1.432380e-02,-2.051280e-02,0.021693,-0.101609,-0.141526,-0.045825,-5.655936e-02,0.273224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,-0.998662,...,6.172469e-02,-0.058133,7.284846e-02,5.057561e-02,0.028389,0.068101,0.130403,0.026465,-2.391164e-16,0.254037
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,-0.745916,...,-8.300901e-17,0.011454,6.123735e-02,-1.604479e-16,0.057884,-0.090206,0.148810,-0.066842,8.715947e-02,-0.230691
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,-0.985127,...,-6.524244e-02,-0.044558,7.559942e-16,-1.261069e-01,0.058811,-0.068101,0.148810,-0.042256,8.855605e-02,-0.174160
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,-0.982278,...,2.616571e-02,0.035928,-7.823395e-02,5.057561e-02,0.056043,-0.096670,0.148810,-0.052406,8.438833e-02,-0.247221


In [19]:
# Append the formula_c features. X's own columns come in again with X_new_2,
# so keep only the first occurrence of every column name.
# Deduplicating by name (rather than by values through a double transpose)
# never discards two distinct features that happen to hold equal values and
# leaves the column dtypes untouched.
X_full = pd.concat([X_full, X_new_2], axis=1)
X_full = X_full.loc[:, ~X_full.columns.duplicated()]
X_full

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,cos(RI+Na),...,Si*K,Si*Ca,Si*Ba,Si*Fe,K*Ca,K*Ba,K*Fe,Ca*Ba,Ca*Fe,Ba*Fe
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,-0.854113,...,4.3068,628.0750,0.0000,0.0,0.5250,0.0000,0.0,0.0000,0.0,0.0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,-0.955232,...,34.9104,569.4759,0.0000,0.0,3.7584,0.0000,0.0,0.0000,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,-0.788898,...,28.4661,567.8622,0.0000,0.0,3.0342,0.0000,0.0,0.0000,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,-0.556771,...,41.3877,596.8542,0.0000,0.0,4.6854,0.0000,0.0,0.0000,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,-0.605388,...,40.1940,589.7556,0.0000,0.0,4.4385,0.0000,0.0,0.0000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,-0.998662,...,5.8088,666.5598,76.9666,0.0,0.7344,0.0848,0.0,9.7308,0.0,0.0
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,-0.745916,...,0.0000,613.7040,116.1654,0.0,0.0000,0.0000,0.0,13.3560,0.0,0.0
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,-0.985127,...,0.0000,619.6648,120.4088,0.0,0.0000,0.0000,0.0,13.8416,0.0,0.0
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,-0.982278,...,0.0000,624.2128,115.5677,0.0,0.0000,0.0000,0.0,13.3136,0.0,0.0


## Check column and row counts

In [20]:
# Sanity check on the merge: X_full must hold X's own columns plus every column
# added by each of the three formulas.
rows, columns = X.shape
# Number of columns each formula added on top of X.
new_1 = len(X_new.columns) - columns
new_2 = len(X_new_.columns) - columns
new_3 = len(X_new_2.columns) - columns
print((columns + new_1 + new_2 + new_3) == len(X_full.columns))

# Every generated frame must have as many rows as X.
print(X_new.shape[0] == rows and X_new_.shape[0] == rows and X_new_2.shape[0] == rows)

True
True


In [50]:
# Save the merged feature table; index=False keeps the row index out of the file.
X_full.to_csv('new_glass.csv', index=False)
# Disabled: would add the target column to the feature table.
# X_full['Type'] = y

In [49]:
# NOTE(review): the output recorded for this cell ends with a 'Type' column, so
# the disabled "X_full['Type'] = y" assignment appears to have been run during
# the session. Confirm that the target is not part of X_full before it is used
# as the feature matrix for the models.
X_full

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,cos(RI+Na),...,Si*Ca,Si*Ba,Si*Fe,K*Ca,K*Ba,K*Fe,Ca*Ba,Ca*Fe,Ba*Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,-0.854113,...,628.0750,0.0000,0.0,0.5250,0.0000,0.0,0.0000,0.0,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,-0.955232,...,569.4759,0.0000,0.0,3.7584,0.0000,0.0,0.0000,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,-0.788898,...,567.8622,0.0000,0.0,3.0342,0.0000,0.0,0.0000,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,-0.556771,...,596.8542,0.0000,0.0,4.6854,0.0000,0.0,0.0000,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,-0.605388,...,589.7556,0.0000,0.0,4.4385,0.0000,0.0,0.0000,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,-0.998662,...,666.5598,76.9666,0.0,0.7344,0.0848,0.0,9.7308,0.0,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,-0.745916,...,613.7040,116.1654,0.0,0.0000,0.0000,0.0,13.3560,0.0,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,-0.985127,...,619.6648,120.4088,0.0,0.0000,0.0000,0.0,13.8416,0.0,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,-0.982278,...,624.2128,115.5677,0.0,0.0000,0.0000,0.0,13.3136,0.0,0.0,7


# Stage 2

In [35]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can also hide diagnostics emitted by the estimators fitted below.
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
# NOTE(review): the visible cells call metrics.classification_report and never
# use these two names directly.
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
# Hold out 20% of the rows for testing; random_state makes the split repeatable.
# NOTE(review): the KNN and logistic-regression reports recorded below total 22
# test samples while the SVM and naive-Bayes reports total 43, so some recorded
# outputs seem to come from a run with a different test_size.
X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size = 0.2, random_state = 42)

## KNN

In [36]:
# k-nearest-neighbours classifier with default settings: fit on the training
# split, predict the held-out split.
reg_knn = KNeighborsClassifier()
reg_knn.fit(X_train, y_train)
y_pred = reg_knn.predict(X_test)

# Per-class precision, recall and F1 on the held-out split.
print(metrics.classification_report(y_test, y_pred))
print("####### CROSS VAL SCORE #######")
# Mean of the 10 cross-validation fold scores, computed on the whole of
# X_full / y rather than on the training split only.
print(cross_val_score(reg_knn, X_full, y, cv=10).mean())

              precision    recall  f1-score   support

           1       0.67      0.80      0.73         5
           2       0.86      0.75      0.80         8
           3       0.00      0.00      0.00         0
           5       1.00      1.00      1.00         2
           6       0.50      0.33      0.40         3
           7       0.75      0.75      0.75         4

    accuracy                           0.73        22
   macro avg       0.63      0.61      0.61        22
weighted avg       0.76      0.73      0.74        22

####### CROSS VAL SCORE #######
0.6363636363636365


## Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
# Logistic regression with default settings: fit on the training split,
# predict the held-out split.
reg_log = LogisticRegression()
reg_log.fit(X_train, y_train)
y_pred = reg_log.predict(X_test)

# Per-class precision, recall and F1 on the held-out split.
print(metrics.classification_report(y_test, y_pred))
print("####### CROSS VAL SCORE #######")
# Mean of the 10 cross-validation fold scores, computed on the whole of
# X_full / y rather than on the training split only.
print(cross_val_score(reg_log, X_full, y, cv=10).mean())

              precision    recall  f1-score   support

           1       0.80      0.80      0.80         5
           2       0.78      0.88      0.82         8
           5       1.00      1.00      1.00         2
           6       1.00      0.67      0.80         3
           7       0.75      0.75      0.75         4

    accuracy                           0.82        22
   macro avg       0.87      0.82      0.83        22
weighted avg       0.83      0.82      0.82        22

####### CROSS VAL SCORE #######
0.6021645021645022


## SVM

In [28]:
from sklearn.svm import SVC

In [29]:
# Support vector classifier with a linear kernel: fit on the training split,
# predict the held-out split.
reg_svc = SVC(kernel='linear')
reg_svc.fit(X_train, y_train)
y_pred = reg_svc.predict(X_test)
# Per-class precision, recall and F1 on the held-out split.
print(metrics.classification_report(y_test, y_pred))
print("####### CROSS VAL SCORE #######")
# Mean of the 10 cross-validation fold scores, computed on the whole of
# X_full / y rather than on the training split only.
print(cross_val_score(reg_svc, X_full, y, cv=10).mean())

              precision    recall  f1-score   support

           1       0.60      0.82      0.69        11
           2       0.92      0.79      0.85        14
           3       0.00      0.00      0.00         3
           5       0.80      1.00      0.89         4
           6       1.00      0.67      0.80         3
           7       1.00      1.00      1.00         8

    accuracy                           0.79        43
   macro avg       0.72      0.71      0.70        43
weighted avg       0.78      0.79      0.78        43

####### CROSS VAL SCORE #######
0.6649350649350649


## Naive Bayes

In [30]:
from sklearn.naive_bayes import GaussianNB

In [31]:
# Gaussian naive Bayes: fit on the training split, predict the held-out split.
# NOTE(review): the variable is called reg_rf although it holds a GaussianNB
# classifier.
reg_rf = GaussianNB()
reg_rf.fit(X_train, y_train)
y_pred = reg_rf.predict(X_test)
# Per-class precision, recall and F1 on the held-out split.
print(metrics.classification_report(y_test, y_pred))
print("####### CROSS VAL SCORE #######")
# Mean of the 10 cross-validation fold scores, computed on the whole of
# X_full / y rather than on the training split only.
print(cross_val_score(reg_rf, X_full, y, cv=10).mean())

              precision    recall  f1-score   support

           1       0.47      0.64      0.54        11
           2       0.80      0.29      0.42        14
           3       0.50      1.00      0.67         3
           5       0.80      1.00      0.89         4
           6       1.00      0.67      0.80         3
           7       0.80      1.00      0.89         8

    accuracy                           0.65        43
   macro avg       0.73      0.76      0.70        43
weighted avg       0.71      0.65      0.63        43

####### CROSS VAL SCORE #######
0.42034632034632036
