# Kevin Mochi
# kevinmochi@outlook.com
# Modeling

In [None]:
# Task 1 - load & Train-test slit

In [1]:
import ast
import numpy as np
import warnings
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
def load_data(infile):
    # t = input limitation if desired
    f = open(infile, 'r')
    data = {}
    full = []
    attrib = []
    classes = []
    for i, line in enumerate(f.readlines()):
        if i > 0:
            line = line.replace('\n', '')
            line = line.split(',')
           #print('Data: ', i, line[1:])
           #print('Class: ', i, line[0])
            full.append(line)
            attrib.append([float(x) for x in line[1:]])
            classes.append(line[0])
    data['full'] = np.array(full)
    data['attrib'] = np.array(attrib)
    data['classes'] = np.array(classes)
    return data

In [3]:
# data load section
infile = 'forest_data.csv'
data = load_data(infile)
seed = 208

In [4]:
data

{'full': array([['d ', '67', '51', ..., '-22.56', '-5.53', '-8.11'],
        ['s ', '67', '28', ..., '-22.2', '-3.41', '-6.57'],
        ['s ', '63', '26', ..., '-20.89', '-3.96', '-6.85'],
        ...,
        ['h ', '79', '30', ..., '-23.32', '-2.09', '-4.13'],
        ['h ', '69', '27', ..., '-10.04', '-0.74', '-2.88'],
        ['h ', '80', '29', ..., '-20.91', '-0.9', '-3.7']], dtype='<U7'),
 'attrib': array([[ 67.  ,  51.  ,  68.  , ..., -22.56,  -5.53,  -8.11],
        [ 67.  ,  28.  ,  51.  , ..., -22.2 ,  -3.41,  -6.57],
        [ 63.  ,  26.  ,  50.  , ..., -20.89,  -3.96,  -6.85],
        ...,
        [ 79.  ,  30.  ,  55.  , ..., -23.32,  -2.09,  -4.13],
        [ 69.  ,  27.  ,  53.  , ..., -10.04,  -0.74,  -2.88],
        [ 80.  ,  29.  ,  55.  , ..., -20.91,  -0.9 ,  -3.7 ]]),
 'classes': array(['d ', 's ', 's ', 'd ', 's ', 'd ', 'h ', 'o ', 's ', 'd ', 's ',
        'o ', 'd ', 's ', 'o ', 's ', 'o ', 'd ', 's ', 'o ', 'o ', 'd ',
        'd ', 's ', 's ', 's ', 'd ', '

In [10]:
def Example_train_test_split(data, pct, seed):
    X_train, X_test, y_train, y_test = train_test_split(data['attrib'],
                                                        data['classes'],
                                                        test_size=pct,
                                                        random_state=seed)
    xydata = {}
    xydata['X_train'] = X_train
    xydata['X_test'] = X_test
    xydata['y_train'] = y_train
    xydata['y_test'] = y_test
    return xydata

In [292]:
# simple train/test split
pct = .3
xydata = Example_train_test_split(data, pct, seed)

In [293]:
xydata

{'X_train': array([[ 59.  ,  32.  ,  56.  , ..., -16.59,  -1.25,  -3.73],
        [ 59.  ,  32.  ,  54.  , ..., -19.84,  -2.92,  -5.05],
        [ 83.  ,  28.  ,  54.  , ..., -29.34,  -2.  ,  -5.03],
        ...,
        [ 34.  ,  32.  ,  51.  , ..., -18.26,  -2.42,  -4.67],
        [ 49.  ,  25.  ,  50.  , ..., -27.16,  -2.84,  -5.1 ],
        [ 71.  ,  65.  ,  85.  , ..., -17.51,  -1.09,  -3.32]]),
 'X_test': array([[ 35.  ,  32.  ,  53.  , ..., -12.39,  -1.95,  -3.89],
        [ 71.  ,  29.  ,  55.  , ..., -12.95,  -0.73,  -3.06],
        [ 61.  ,  29.  ,  54.  , ..., -23.29,  -1.65,  -4.45],
        ...,
        [ 51.  ,  31.  ,  55.  , ..., -27.83,   0.87,  -1.31],
        [ 63.  ,  53.  ,  74.  , ..., -17.92,  -2.13,  -4.52],
        [ 50.  ,  36.  ,  60.  , ..., -17.74,  -1.9 ,  -4.48]]),
 'y_train': array(['s ', 's ', 'h ', 's ', 's ', 'd ', 's ', 'd ', 'h ', 's ', 's ',
        'h ', 'd ', 'o ', 'd ', 'h ', 'o ', 's ', 'o ', 's ', 'd ', 's ',
        'o ', 's ', 's ', 'o ', 'h

In [294]:
# Task 2 - Decision tree baseline

In [350]:
def baseline_algorithm(xydata, seed):
    clf = DecisionTreeClassifier(random_state=seed)
    clf.fit(xydata['X_train'], xydata['y_train'])
    score = clf.score(xydata['X_test'], xydata['y_test'])
    return score

In [351]:
score = baseline_algorithm(xydata, seed)

In [352]:
score

0.8343949044585988

In [353]:
# Task 3 - Bagging on decision tree

In [354]:
clf = BaggingClassifier(DecisionTreeClassifier(random_state=seed))
clf.fit(xydata['X_train'], xydata['y_train'])
score = clf.score(xydata['X_test'], xydata['y_test'])

In [355]:
score

0.8662420382165605

In [356]:
def Example_bagging(xydata, seed):
    clf = BaggingClassifier(DecisionTreeClassifier(random_state=seed))
    clf.fit(xydata['X_train'], xydata['y_train'])
    score = clf.score(xydata['X_test'], xydata['y_test'])

    # determine feature importance
    feature_importances = np.mean([tree.feature_importances_ for tree in clf.estimators_], axis=0)

    result = {}
    result['score'] = score
    result['importance'] = feature_importances
    return result

In [357]:
result = Example_bagging(xydata, seed)
print('\n-----Bagging Classification-----')
print('SCORE: ', result['score'])
print('IMPORTANCE: ', result['importance'])


-----Bagging Classification-----
SCORE:  0.8789808917197452
IMPORTANCE:  [0.07674654 0.29259676 0.01293795 0.1517982  0.01533242 0.02601688
 0.00956211 0.02187251 0.00978998 0.04527352 0.0410007  0.00732316
 0.03226675 0.0093362  0.00654155 0.01907138 0.04543914 0.12298161
 0.00800181 0.00748453 0.00427002 0.00734864 0.00658821 0.00505153
 0.00546859 0.00578714 0.00411217]


In [358]:
# Task 4 - Cross-validated bagging

In [359]:
# Cross Validation on Bagging
def Example_crossval_bagging(xydata, seed, cv=5):
    clf = BaggingClassifier(DecisionTreeClassifier(random_state=seed))
    clf.fit(xydata['X_train'], xydata['y_train'])
    scores = cross_val_score(clf, xydata['X_test'], xydata['y_test'], cv=cv)
    return scores

In [360]:
scores = Example_crossval_bagging(xydata, seed)

In [361]:
scores

array([0.78125   , 0.8125    , 0.84375   , 0.90322581, 0.86666667])

In [362]:
np.mean(scores)

0.8414784946236559

In [363]:
# Task 5 - Adaboost

In [364]:
# Boosting using Adaboost
def Example_adaboost(xydata, seed, n_est):
    base_dt = DecisionTreeClassifier(random_state=seed)
    clf = AdaBoostClassifier(
        base_estimator=base_dt,
        n_estimators=n_est,
        algorithm="SAMME")
    clf.fit(xydata['X_train'], xydata['y_train'])
    y_pred = clf.predict(xydata['X_test'])
    score = accuracy_score(y_pred, xydata['y_test'])
    return score

In [367]:
score = Example_adaboost(xydata, seed, 100)

In [368]:
score

0.8407643312101911