# Kevin Mochi
# kevinmochi@outlook.com
# Modeling

In [None]:
# Task 1 Data preparation

In [1]:
import ast
import numpy as np
import warnings

In [35]:
def load_data(infile):
    """Load a comma-separated data file into NumPy arrays.

    The first line of the file is treated as a header and skipped.  In every
    remaining line the first field is the class label and all other fields
    are parsed as floats.  Blank lines are ignored.

    Args:
        infile: Path of the CSV file to read.

    Returns:
        dict with three NumPy arrays, one entry per data row:
            'full'    -- the raw string fields of the row (label included),
            'attrib'  -- the non-label fields converted to float,
            'classes' -- the label (first field), kept verbatim; surrounding
                         whitespace is not stripped.

    Raises:
        ValueError: if a non-label field cannot be converted to float.
    """
    full = []
    attrib = []
    classes = []
    # The context manager closes the file even when a row fails to parse.
    with open(infile, 'r') as f:
        next(f, None)  # skip the header line (no-op for an empty file)
        for line in f:
            line = line.replace('\n', '')
            if not line.strip():
                # A blank line carries no record; keeping it would add a row
                # without attributes and make the resulting arrays ragged.
                continue
            fields = line.split(',')
            full.append(fields)
            attrib.append([float(x) for x in fields[1:]])
            classes.append(fields[0])
    return {
        'full': np.array(full),
        'attrib': np.array(attrib),
        'classes': np.array(classes),
    }

In [36]:
# Load the forest dataset; fd holds the 'full', 'attrib' and 'classes' arrays
# returned by load_data. The path is relative to the working directory.
fd = load_data('forest_data (1).csv')

In [37]:
# Display the loaded dataset dict for a quick visual check.
fd

{'full': array([['d ', '67', '51', ..., '-22.56', '-5.53', '-8.11'],
        ['s ', '67', '28', ..., '-22.2', '-3.41', '-6.57'],
        ['s ', '63', '26', ..., '-20.89', '-3.96', '-6.85'],
        ...,
        ['h ', '79', '30', ..., '-23.32', '-2.09', '-4.13'],
        ['h ', '69', '27', ..., '-10.04', '-0.74', '-2.88'],
        ['h ', '80', '29', ..., '-20.91', '-0.9', '-3.7']], dtype='<U7'),
 'attrib': array([[ 67.  ,  51.  ,  68.  , ..., -22.56,  -5.53,  -8.11],
        [ 67.  ,  28.  ,  51.  , ..., -22.2 ,  -3.41,  -6.57],
        [ 63.  ,  26.  ,  50.  , ..., -20.89,  -3.96,  -6.85],
        ...,
        [ 79.  ,  30.  ,  55.  , ..., -23.32,  -2.09,  -4.13],
        [ 69.  ,  27.  ,  53.  , ..., -10.04,  -0.74,  -2.88],
        [ 80.  ,  29.  ,  55.  , ..., -20.91,  -0.9 ,  -3.7 ]]),
 'classes': array(['d ', 's ', 's ', 'd ', 's ', 'd ', 'h ', 'o ', 's ', 'd ', 's ',
        'o ', 'd ', 's ', 'o ', 's ', 'o ', 'd ', 's ', 'o ', 'o ', 'd ',
        'd ', 's ', 's ', 's ', 'd ', '

In [38]:
# Task 2 Training set - Test set Split

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
def split_data(data, tsize, rnd, shfl=True):
    """Split a loaded dataset into one training set and one test set.

    Args:
        data: Mapping with 'attrib' (feature rows) and 'classes' (labels).
        tsize: Test set size, forwarded to train_test_split as test_size.
        rnd: Random seed, forwarded as random_state.
        shfl: Whether to shuffle before splitting, forwarded as shuffle.

    Returns:
        dict with the keys 'X_train', 'X_test', 'y_train' and 'y_test', in
        that order, holding the corresponding pieces of the split.
    """
    pieces = train_test_split(
        data['attrib'],
        data['classes'],
        test_size=tsize,
        random_state=rnd,
        shuffle=shfl,
    )
    # train_test_split returns the pieces in exactly this order.
    labels = ('X_train', 'X_test', 'y_train', 'y_test')
    return dict(zip(labels, pieces))

In [50]:
 # Hold out 25% of the rows as the test set, shuffling with seed 208.
 # NOTE(review): this statement carries a stray leading space; it would be an
 # IndentationError if the cell were run as a plain script.
 splitd = split_data(fd, .25, 208, True)

In [51]:
# List the keys of the split dict, in insertion order.
list(splitd)

['X_train', 'X_test', 'y_train', 'y_test']

In [52]:
# Number of rows that ended up in the training set.
len(splitd['X_train'])

392

In [53]:
# Number of rows that ended up in the test set.
len(splitd['X_test'])

131

In [47]:
# Task 3 K-fold Validation

In [54]:
from sklearn.model_selection import KFold

In [55]:
def Example_k_fold(data, k, rnd, shfl=True):
    """Partition a dataset into k train/test folds with scikit-learn's KFold.

    Args:
        data: Mapping with 'attrib' (feature rows) and 'classes' (labels).
            Both are indexed with the index arrays KFold yields, so they
            must support NumPy-style integer-array indexing.
        k: Number of folds, forwarded to KFold as n_splits.
        rnd: Random seed for the shuffle; only used when shfl is True.
        shfl: Whether to shuffle the rows before cutting them into folds.

    Returns:
        dict mapping 'SET0', 'SET1', ... (one entry per fold) to a dict with
        the keys 'X_train', 'y_train', 'X_test' and 'y_test' for that fold.
    """
    # Recent scikit-learn releases raise ValueError when a random_state is
    # supplied together with shuffle=False, so the seed is only forwarded
    # when shuffling is actually requested.
    kf = KFold(n_splits=k, random_state=rnd if shfl else None, shuffle=shfl)
    kfold_sets = {}
    X = data['attrib']
    y = data['classes']
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        kfold_sets['SET%i' % i] = {
            'X_train': X[train_index],
            'y_train': y[train_index],
            'X_test': X[test_index],
            'y_test': y[test_index],
        }
    return kfold_sets

In [59]:
# Build 5 shuffled folds over the full dataset, using seed 208.
k = 5
kfoldd = Example_k_fold(fd, k, 208, True)

In [60]:
# Display the per-fold train/test arrays for inspection.
kfoldd

{'SET0': {'X_train': array([[ 67.  ,  51.  ,  68.  , ..., -22.56,  -5.53,  -8.11],
         [ 67.  ,  28.  ,  51.  , ..., -22.2 ,  -3.41,  -6.57],
         [ 46.  ,  27.  ,  50.  , ..., -22.19,  -4.45,  -7.32],
         ...,
         [ 79.  ,  30.  ,  55.  , ..., -23.32,  -2.09,  -4.13],
         [ 69.  ,  27.  ,  53.  , ..., -10.04,  -0.74,  -2.88],
         [ 80.  ,  29.  ,  55.  , ..., -20.91,  -0.9 ,  -3.7 ]]),
  'y_train': array(['d ', 's ', 's ', 'h ', 's ', 'd ', 'o ', 'd ', 's ', 'o ', 's ',
         'o ', 'd ', 's ', 'o ', 'o ', 's ', 's ', 'd ', 's ', 'd ', 's ',
         'o ', 's ', 's ', 'd ', 'd ', 'd ', 'd ', 'h ', 'o ', 'd ', 's ',
         'o ', 'o ', 's ', 'd ', 'h ', 'o ', 'd ', 'h ', 'h ', 'd ', 's ',
         'o ', 's ', 's ', 's ', 's ', 's ', 's ', 's ', 's ', 'd ', 's ',
         'd ', 'o ', 's ', 'h ', 'h ', 's ', 's ', 's ', 's ', 'd ', 's ',
         's ', 'h ', 's ', 's ', 'o ', 'd ', 's ', 'o ', 'd ', 's ', 's ',
         's ', 'd ', 'd ', 'd ', 'd ', 'h ', 