# Kevin Mochi
# kevinmochi@outlook.com
# Modeling

In [4]:
import ast
import numpy as np
import warnings
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

def load_data(infile):
    # t = input limitation if desired
    f = open(infile, 'r')
    data = {}
    full = []
    attrib = []
    classes = []
    for i, line in enumerate(f.readlines()):
        if i > 0:
            line = line.replace('\n', '')
            line = line.split(',')
           #print('Data: ', i, line[1:])
           #print('Class: ', i, line[0])
            full.append(line)
            attrib.append([float(x) for x in line[1:]])
            classes.append(line[0])
    data['full'] = np.array(full)
    data['attrib'] = np.array(attrib)
    data['classes'] = np.array(classes)
    return data

# data load section
infile = 'forest_data.csv'
data = load_data(infile)
seed = 208

In [5]:
def Example_train_test_split(data, pct, seed):
    X_train, X_test, y_train, y_test = train_test_split(data['attrib'],
                                                        data['classes'],
                                                        test_size=pct,
                                                        random_state=seed)
    xydata = {}
    xydata['X_train'] = X_train
    xydata['X_test'] = X_test
    xydata['y_train'] = y_train
    xydata['y_test'] = y_test
    return xydata

In [None]:
pct = .25
xydata = Example_train_test_split(data, pct, seed)

In [15]:
xydata

{'X_train': array([[ 51.  ,  39.  ,  60.  , ..., -19.02,  -2.19,  -4.68],
        [ 82.  ,  29.  ,  54.  , ..., -24.09,  -1.2 ,  -3.95],
        [ 57.  ,  31.  ,  52.  , ..., -16.54,  -0.97,  -3.23],
        ...,
        [ 34.  ,  32.  ,  51.  , ..., -18.26,  -2.42,  -4.67],
        [ 49.  ,  25.  ,  50.  , ..., -27.16,  -2.84,  -5.1 ],
        [ 71.  ,  65.  ,  85.  , ..., -17.51,  -1.09,  -3.32]]),
 'X_test': array([[ 35.  ,  32.  ,  53.  , ..., -12.39,  -1.95,  -3.89],
        [ 71.  ,  29.  ,  55.  , ..., -12.95,  -0.73,  -3.06],
        [ 61.  ,  29.  ,  54.  , ..., -23.29,  -1.65,  -4.45],
        ...,
        [ 52.  ,  44.  ,  73.  , ..., -18.75,  -2.08,  -4.75],
        [ 54.  ,  52.  ,  71.  , ..., -18.89,  -1.74,  -4.44],
        [ 62.  ,  54.  ,  79.  , ..., -23.07,  -3.51,  -6.63]]),
 'y_train': array(['d ', 'h ', 's ', 's ', 's ', 's ', 'd ', 's ', 'd ', 's ', 'o ',
        's ', 's ', 'h ', 'd ', 's ', 'h ', 's ', 's ', 'd ', 's ', 's ',
        'o ', 's ', 'o ', 'd ', 's

In [1]:
# Task 2 - Random Forest

In [9]:
import ast
import numpy as np
import warnings
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [10]:
# Random Forest
def Example_random_forest(xydata, seed=208, depth=25):
    # Suppress package change warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        clf = RandomForestClassifier(max_depth=depth, random_state=seed)

        # Class Weight Implementation:
        # cw = {1: .1, 2: .1, 3: .1, 4: .3, 5: .1, 6: .5, 7: .4, 8: .1, 9: .1}
        # cw = 'balanced'  # default
        # clf = RandomForestClassifier(max_depth=depth, random_state=seed, class_weight=cw)

        clf.fit(xydata['X_train'], xydata['y_train'])
        score = clf.score(xydata['X_test'], xydata['y_test'])
        
        feature_importances = clf.feature_importances_
        result = {}
        result['score'] = score
        result['importance'] = feature_importances
        return result

In [11]:
result = Example_random_forest(xydata, seed, None)
print('\n-----Random Forest Classification-----')
print('SCORE: ', result['score'])
print('IMPORTANCE: ', result['importance'])


-----Random Forest Classification-----
SCORE:  0.8931297709923665
IMPORTANCE:  [0.02727185 0.03056933 0.06409157 0.05527341 0.12374512 0.04977714
 0.02736129 0.05779825 0.03257203 0.10150956 0.06944084 0.05318945
 0.06308308 0.06256394 0.01385378 0.0106069  0.06491038 0.02132081
 0.01176926 0.00837111 0.01166177 0.0051243  0.00993866 0.00568629
 0.00915534 0.00512009 0.00423445]


In [12]:
# Task 3 - XG Boost

In [16]:
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [38]:
def Example_xg_boost(xydata):
    model = XGBClassifier()
    model.fit(xydata['X_train'], xydata['y_train'])

In [39]:
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [19]:
model = XGBClassifier()
model.fit(xydata['X_train'], xydata['y_train'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [27]:
# make predictions for test data
warnings.simplefilter('ignore', DeprecationWarning)
y_pred = model.predict(xydata['X_test'])
predictions = [value for value in y_pred]

In [28]:
predictions[:10]

['d ', 'h ', 's ', 'd ', 'h ', 'h ', 'd ', 's ', 'o ', 'd ']

In [36]:
# evaluate
accuracy = accuracy_score(xydata['y_test'], predictions)
accuracy

0.8931297709923665

In [41]:
# xg boost
result = Example_xg_boost(xydata)
print('\n-----XG Boost Classification-----')
print('SCORE: ', result)


-----XG Boost Classification-----
SCORE:  None


In [42]:
def Example_xg_boost(xydata):
    model = XGBClassifier()
    model.fit(xydata['X_train'], xydata['y_train'])

    # make predictions for test data
    warnings.simplefilter('ignore', DeprecationWarning)
    y_pred = model.predict(xydata['X_test'])
    predictions = [value for value in y_pred]
    
    # evaluate
    accuracy = accuracy_score(xydata['y_test'], predictions)
    return accuracy

In [43]:
# xg boost
result = Example_xg_boost(xydata)
print('\n-----XG Boost Classification-----')
print('SCORE: ', result)


-----XG Boost Classification-----
SCORE:  0.8931297709923665
