In [1]:
import collections
import warnings

import pandas as pd
from imblearn.ensemble import BalancedBaggingClassifier
from keras.layers import Dense, Input
from keras.models import Sequential, Model
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_curve, roc_auc_score, classification_report)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

In [15]:
# Read the raw dataset from the CSV next to the notebook.
data = pd.read_csv("data.csv")
# Target is the "position" column; every other column is a feature.
y = data["position"].values
X = data.drop(columns="position").values

In [18]:
# Hold out 16% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.16,
    random_state=42,
)

In [19]:
# Tally how many samples fall into each target class to check for imbalance.
# `collections` comes from the notebook's top import cell.
counter = collections.Counter(y)
counter


Counter({np.int64(0): 968,
         np.int64(4): 949,
         np.int64(2): 517,
         np.int64(3): 414,
         np.int64(1): 364})

In [33]:
def getMetrics(model_name, pred, y_true=None):
    """Print a standard evaluation summary for one model's predictions.

    Parameters
    ----------
    model_name : str
        Label printed in the report header.
    pred : array-like
        Predicted class labels, aligned element-wise with the true labels.
    y_true : array-like, optional
        Ground-truth labels. When omitted, the notebook-level ``y_test`` is
        used, so existing two-argument calls behave exactly as before.

    Prints accuracy, macro-averaged precision / recall / F1, the confusion
    matrix and the per-class classification report. Returns ``None``.
    """
    if y_true is None:
        # Looked up at call time so the helper follows the current split.
        y_true = y_test

    print(f"--- {model_name} ---")
    print("Test accuracy:", accuracy_score(y_true, pred))
    # Macro averaging weights every class equally regardless of its support.
    print("Precision:", precision_score(y_true, pred, average='macro'))
    print("Recall:", recall_score(y_true, pred, average='macro'))
    print("F1 Score:", f1_score(y_true, pred, average='macro'))
    print("Confusion Matrix:\n", confusion_matrix(y_true, pred))
    print("Classification Report:\n", classification_report(y_true, pred))

In [34]:
# Baseline model: a single decision tree with a fixed seed.
# fit() returns the estimator itself, so construction and training chain.
decisionTree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

predictions = decisionTree.predict(X_test)
getMetrics("decision tree", predictions)

--- decision tree ---
Test accuracy: 0.669260700389105
Precision: 0.6555978615978615
Recall: 0.6121817543298747
F1 Score: 0.6242729591620069
Confusion Matrix:
 [[112   5   8   3  11]
 [ 14  24  10   0  13]
 [ 19   7  44  12  10]
 [  8   1   4  38  15]
 [ 15   0  12   3 126]]
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.81      0.73       139
           1       0.65      0.39      0.49        61
           2       0.56      0.48      0.52        92
           3       0.68      0.58      0.62        66
           4       0.72      0.81      0.76       156

    accuracy                           0.67       514
   macro avg       0.66      0.61      0.62       514
weighted avg       0.66      0.67      0.66       514



In [44]:
# Bagging ensemble of 25 decision trees, each trained on 1200 sampled rows.
bagged = BaggingClassifier(
    estimator=decisionTree,
    n_estimators=25,
    max_samples=1200,
    random_state=42,
).fit(X_train, y_train)

predictions = bagged.predict(X_test)
getMetrics("bagged decision tree", predictions)

--- bagged decision tree ---
Test accuracy: 0.6653696498054474
Precision: 0.6488206388206389
Recall: 0.6103172441137849
F1 Score: 0.6206538888036235
Confusion Matrix:
 [[111   5   8   3  12]
 [ 14  24  10   0  13]
 [ 17   7  43  15  10]
 [  8   1   3  39  15]
 [ 15   0  13   3 125]]
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.80      0.73       139
           1       0.65      0.39      0.49        61
           2       0.56      0.47      0.51        92
           3       0.65      0.59      0.62        66
           4       0.71      0.80      0.76       156

    accuracy                           0.67       514
   macro avg       0.65      0.61      0.62       514
weighted avg       0.66      0.67      0.66       514



In [46]:
# Same idea as plain bagging, but using imblearn's BalancedBaggingClassifier
# with 25 decision trees and max_samples=100.
balbagged = BalancedBaggingClassifier(
    estimator=decisionTree,
    n_estimators=25,
    max_samples=100,
    random_state=42,
).fit(X_train, y_train)

predictions = balbagged.predict(X_test)
getMetrics("balance bagged decision tree", predictions)

--- balance bagged decision tree ---
Test accuracy: 0.6439688715953308
Precision: 0.6162146379953776
Recall: 0.617783990214086
F1 Score: 0.611628165567496
Confusion Matrix:
 [[ 88  20   7   7  17]
 [ 10  34   7   0  10]
 [ 11  15  41  11  14]
 [  4   4   2  43  13]
 [  4   3  12  12 125]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.63      0.69       139
           1       0.45      0.56      0.50        61
           2       0.59      0.45      0.51        92
           3       0.59      0.65      0.62        66
           4       0.70      0.80      0.75       156

    accuracy                           0.64       514
   macro avg       0.62      0.62      0.61       514
weighted avg       0.65      0.64      0.64       514



In [36]:
# Random forest with every hyperparameter spelled out so it is easy to tune.
randomForest = RandomForestClassifier(
    # Number of trees in the forest.
    n_estimators=100,
    # Split-quality criterion; 'entropy' is also supported.
    criterion='gini',
    # No depth limit on individual trees.
    max_depth=None,
    # Minimum number of samples a node needs before it may be split.
    min_samples_split=2,
    # A float is read as a fraction of the samples: 0.001 means 0.1%.
    min_samples_leaf=0.001,
    # Minimum weighted fraction of the total sample weight required at a leaf.
    min_weight_fraction_leaf=0.0,
    # Features considered at each split (not per tree): sqrt of the feature count.
    max_features='sqrt',
    # No cap on the number of leaf nodes.
    max_leaf_nodes=None,
    # A split must reduce impurity by at least this much (1e-4).
    min_impurity_decrease=0.0001,
    # Train each tree on a bootstrap sample (drawn with replacement).
    bootstrap=True,
    # Also compute the out-of-bag score from the rows each tree did not see.
    oob_score=True,
    # n_jobs is left at its default; n_jobs=-1 would use all cores (watch memory use).
    # Seed for reproducibility.
    random_state=42,
    # Emit progress messages during fit/predict; set to 0 for silence.
    verbose=1,
    # Fit a fresh forest rather than adding to a previously fitted one.
    warm_start=False,
    # Reweight classes inversely to their frequency in the training labels.
    class_weight='balanced',
)
randomForest.fit(X_train, y_train)

predictions = randomForest.predict(X_test)
getMetrics("random forest", predictions)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s


--- random forest ---
Test accuracy: 0.6517509727626459
Precision: 0.6256746130362754
Recall: 0.6387203049765801
F1 Score: 0.6296813986794431
Confusion Matrix:
 [[ 96  12  12  10   9]
 [  7  36   7   0  11]
 [ 12  11  45  17   7]
 [  5   1   3  47  10]
 [ 13   2  18  12 111]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.69      0.71       139
           1       0.58      0.59      0.59        61
           2       0.53      0.49      0.51        92
           3       0.55      0.71      0.62        66
           4       0.75      0.71      0.73       156

    accuracy                           0.65       514
   macro avg       0.63      0.64      0.63       514
weighted avg       0.66      0.65      0.65       514



[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [37]:
# Gradient boosting: 100 boosting stages of depth-1 trees, learning rate 1.0.
gradient = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=42,
).fit(X_train, y_train)

predictions = gradient.predict(X_test)
getMetrics("gradient boosting", predictions)

--- gradient boosting ---
Test accuracy: 0.6867704280155642
Precision: 0.7031314805371409
Recall: 0.6188715500542161
F1 Score: 0.6400673765417146
Confusion Matrix:
 [[110   0   7   2  20]
 [ 17  27   3   1  13]
 [ 17   7  42   8  18]
 [  8   2   3  33  20]
 [  8   1   5   1 141]]
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.79      0.74       139
           1       0.73      0.44      0.55        61
           2       0.70      0.46      0.55        92
           3       0.73      0.50      0.59        66
           4       0.67      0.90      0.77       156

    accuracy                           0.69       514
   macro avg       0.70      0.62      0.64       514
weighted avg       0.69      0.69      0.67       514



In [38]:

# L2-penalised logistic regression with the iteration cap set to 10000.
model = LogisticRegression(penalty='l2', max_iter=10000).fit(X_train, y_train)
predictions = model.predict(X_test)
getMetrics("logistic regression", predictions)

# The same estimator wrapped in a bagging ensemble of 50 members.
bagged = BaggingClassifier(
    estimator=model,
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)
predictions = bagged.predict(X_test)
getMetrics("bagged logistic regression", predictions)

--- logistic regression ---
Test accuracy: 0.6789883268482491
Precision: 0.7017159892268148
Recall: 0.6213798281861579
F1 Score: 0.6422622452343815
Confusion Matrix:
 [[110   1   3   4  21]
 [ 15  26   2   1  17]
 [ 17   6  43   8  18]
 [  5   0   3  38  20]
 [ 10   1  10   3 132]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.79      0.74       139
           1       0.76      0.43      0.55        61
           2       0.70      0.47      0.56        92
           3       0.70      0.58      0.63        66
           4       0.63      0.85      0.73       156

    accuracy                           0.68       514
   macro avg       0.70      0.62      0.64       514
weighted avg       0.69      0.68      0.67       514

--- bagged logistic regression ---
Test accuracy: 0.6789883268482491
Precision: 0.7014726312975574
Recall: 0.6215366258249699
F1 Score: 0.6423033213368614
Confusion Matrix:
 [[111   1   3   4  20]
 [ 15  

In [None]:
# Build a small feed-forward classifier as a Keras Sequential model.
# NOTE(review): X_train_scaled is not defined in the cells visible here —
# confirm that a feature-scaling cell runs before this one.
model = Sequential()
# First hidden layer: 5 sigmoid units. The input shape must be declared on the
# first layer and equal the number of feature columns.
model.add(Dense(5, input_shape=(X_train_scaled.shape[1],), activation='sigmoid'))
# Second hidden layer: 5 sigmoid units.
model.add(Dense(5, activation='sigmoid'))
# Output layer: 5 units, one per target class (labels 0-4). Softmax turns the
# outputs into a probability distribution over the mutually exclusive classes,
# which independent sigmoids would not.
model.add(Dense(5, activation='softmax'))