In [37]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Flatten
from tensorflow.python.client import device_lib
import warnings
warnings.filterwarnings('ignore')
local_device_protos = device_lib.list_local_devices()
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 17349674148537413193
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 9883877376
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 6053309463988546351
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6"
 xla_global_id: 416903419]

In [38]:
tr_features = pd.read_csv('data/train_features.csv')
tr_labels = pd.read_csv('data/train_labels.csv')

te_features = pd.read_csv('data/test_features.csv')
te_labels = pd.read_csv('data/test_labels.csv')

# DECISION TREE

In [39]:
dt0 = DecisionTreeClassifier(criterion='entropy', max_depth=50, max_features=None, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=1, splitter='best')
dt0scores = cross_val_score(dt0, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(dt0scores)
dt0.fit(tr_features, tr_labels.values.ravel())

for mdl in [dt0]:
    y_pred = mdl.predict(te_features)
    accuracy = round(accuracy_score(te_labels, y_pred), 8)
    precision = round(precision_score(te_labels, y_pred, average='weighted'), 8)
    recall = round(recall_score(te_labels, y_pred, average='weighted'), 8)
    f1 = round(f1_score(te_labels, y_pred, average='weighted'), 8)
    print(f'MAX DEPTH: {mdl.max_depth} / MAX LEAF NODES: {mdl.max_leaf_nodes} / A: {accuracy} / P: {precision} / R: {recall} / F1: {f1}')

[0.36528457 0.37364023 0.372379   0.36418099 0.35708655]
MAX DEPTH: 50 / MAX LEAF NODES: 50 / A: 0.38289822 / P: 0.36930244 / R: 0.38289822 / F1: 0.35612188


# KNN

In [40]:
knn0 = KNeighborsClassifier(algorithm='auto', leaf_size=10, n_neighbors=1, p=1, weights='uniform')
knn0scores = cross_val_score(knn0, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(knn0scores)
knn0.fit(tr_features, tr_labels.values.ravel())

for mdl in [knn0]:
    y_pred = mdl.predict(te_features)
    accuracy = round(accuracy_score(te_labels, y_pred), 8)
    precision = round(precision_score(te_labels, y_pred, average='weighted'), 8)
    recall = round(recall_score(te_labels, y_pred, average='weighted'), 8)
    f1 = round(f1_score(te_labels, y_pred, average='weighted'), 8)
    print(f'LEAF SIZE: {mdl.leaf_size} / NEAREST NEIGHBOURS: {mdl.n_neighbors} / A: {accuracy} / P: {precision} / R: {recall} / F1: {f1}')

[0.29118714 0.27778654 0.28519628 0.28393505 0.28330443]
LEAF SIZE: 10 / NEAREST NEIGHBOURS: 1 / A: 0.285408 / P: 0.28590231 / R: 0.285408 / F1: 0.28561743


# RANDOM FOREST

In [41]:
rf0 = RandomForestClassifier(bootstrap=False, max_depth=100, max_features='sqrt', max_leaf_nodes=500, max_samples=None, min_samples_leaf=3, n_estimators=50, oob_score=False, warm_start=False)
rf0scores = cross_val_score(rf0, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(rf0scores)
rf0.fit(tr_features, tr_labels.values.ravel())

for mdl in [rf0]:
    y_pred = mdl.predict(te_features)
    accuracy = round(accuracy_score(te_labels, y_pred), 8)
    precision = round(precision_score(te_labels, y_pred, average='weighted'), 8)
    recall = round(recall_score(te_labels, y_pred, average='weighted'), 8)
    f1 = round(f1_score(te_labels, y_pred, average='weighted'), 8)
    print(f'A: {accuracy} / P: {precision} / R: {recall} / F1: {f1}')

[0.38231121 0.40059909 0.38420306 0.39350465 0.39019391]
A: 0.39916761 / P: 0.38972838 / R: 0.39916761 / F1: 0.38524394


# GRADIENT BOOST

In [42]:
gb0 = HistGradientBoostingClassifier(l2_regularization=0, learning_rate=0.1, max_depth=100, max_iter=100, max_leaf_nodes=16, min_samples_leaf=50)
gb0scores = cross_val_score(gb0, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(gb0scores)
gb0.fit(tr_features, tr_labels.values.ravel())

for mdl in [gb0]:
    y_pred = mdl.predict(te_features)
    accuracy = round(accuracy_score(te_labels, y_pred), 8)
    precision = round(precision_score(te_labels, y_pred, average='weighted'), 8)
    recall = round(recall_score(te_labels, y_pred, average='weighted'), 8)
    f1 = round(f1_score(te_labels, y_pred, average='weighted'), 8)
    print(f'MAX DEPTH: {mdl.max_depth} / MAX LEAF NODES: {mdl.max_leaf_nodes} / A: {accuracy} / P: {precision} / R: {recall} / F1: {f1}')

[0.38830207 0.39350465 0.3857796  0.3971307  0.39381996]
MAX DEPTH: 100 / MAX LEAF NODES: 16 / A: 0.39462732 / P: 0.38383943 / R: 0.39462732 / F1: 0.3822473


# BONUS: NEURAL NETWORK

In [43]:
scaler = MinMaxScaler()
tr_features = pd.DataFrame(scaler.fit_transform(tr_features))
te_features = pd.DataFrame(scaler.fit_transform(te_features))

model = Sequential([
    Dense(58),
    Dropout(0.5),
    Dense(64, activation='softmax'),
    Dropout(0.5),
    Dense(64, activation='softmax'),
    Dropout(0.5),
    Dense(64, activation='softmax'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])

In [44]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(tr_features, tr_labels, epochs=50, batch_size=1024)

Epoch 1/50
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epo

<keras.callbacks.History at 0x1dab6a161a0>

In [45]:
loss, accuracy = model.evaluate(te_features, te_labels)
print("Loss:", loss)
print("Accuracy:", accuracy)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Constant'
Loss: 1.332669734954834
Accuracy: 0.35502585768699646
