In [1]:
import pandas as pd
import tensorflow as tf

In [2]:
df = pd.read_csv("main_database_diffs_for_models.csv")

In [3]:
df.head(2)

Unnamed: 0,season,target,PosDiff,GSPGDiff,GCPGDiff,CSPGDiff,PtsPGDiff,SPGDiff,STPGDiff,S%Diff,PTSPG3Diff,PTSPG5Diff,PTSPG7Diff,GD3Diff,GD5Diff,GD7Diff,PosLSDiff,SpentDiff,IncomeDiff,NetDiff
0,9,2,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.65,-0.841,-1.491
1,9,2,-15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,38.23,-3.33,-41.56


In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

def get_train_test_split(data):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(data, data["target"]):
        data_train = data.loc[train_index]
        data_test = data.loc[test_index]
    return data_train, data_test

train,test = get_train_test_split(df)

X_train = train.drop('target',axis=1)
y_train = train['target']
X_test = test.drop('target',axis=1)
y_test = test['target']

In [5]:
X_train.head(2)

Unnamed: 0,season,PosDiff,GSPGDiff,GCPGDiff,CSPGDiff,PtsPGDiff,SPGDiff,STPGDiff,S%Diff,PTSPG3Diff,PTSPG5Diff,PTSPG7Diff,GD3Diff,GD5Diff,GD7Diff,PosLSDiff,SpentDiff,IncomeDiff,NetDiff
6206,25,-6.0,0.91,-0.33,0.0,0.83,2.25,1.0,0.06,0.667,0.4,0.572,1.667,0.2,0.572,-14.0,59.68,-13.62,-73.3
40,9,2.0,-0.42,0.75,-0.33,-0.5,-4.25,-1.917,0.048,0.0,0.0,0.0,-0.667,0.0,0.0,11.0,-30.74,12.73,43.47


In [6]:
X_train_norm = tf.keras.utils.normalize(X_train.values,axis=1)
X_test_norm = tf.keras.utils.normalize(X_test.values,axis=1)
X_train_norm[0]

array([ 2.50139279e-01, -6.00334271e-02,  9.10506977e-03, -3.30183849e-03,
        0.00000000e+00,  8.30462408e-03,  2.25125352e-02,  1.00055712e-02,
        6.00334271e-04,  6.67371598e-03,  4.00222847e-03,  5.72318671e-03,
        1.66792872e-02,  2.00111424e-03,  5.72318671e-03, -1.40077997e-01,
        5.97132488e-01, -1.36275879e-01, -7.33408367e-01])

In [7]:
model = tf.keras.models.Sequential()
#model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(12, activation = tf.nn.relu))
model.add(tf.keras.layers.Dense(12, activation = tf.nn.relu))
#model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(3, activation = tf.nn.softmax))

In [8]:
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

class_weight = {0: 1.688,
                1: 1.747,
                2: 1}

In [9]:
model.fit(X_train_norm, y_train.values, epochs=20, validation_data=(X_test_norm,y_test), class_weight=class_weight)

Train on 5168 samples, validate on 1292 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f5984a766d8>

In [10]:
train_pred = model.predict_classes(X_train_norm)
test_pred = model.predict_classes(X_test_norm)

import numpy as np

unique, counts = np.unique(train_pred, return_counts=True)
print("preds: " + str(dict(zip(unique, counts))))

unique, counts = np.unique(y_train.values, return_counts=True)
print("real: " + str(dict(zip(unique, counts))))

from sklearn.metrics import confusion_matrix as conf_matrix

print(conf_matrix(train_pred,y_train.values,labels=[0,1,2]))

preds: {0: 2306, 1: 725, 2: 2137}
real: {0: 1444, 1: 1322, 2: 2402}
[[ 975  601  730]
 [ 190  232  303]
 [ 279  489 1369]]


In [11]:
print(conf_matrix(test_pred,y_test.values))

[[232 148 175]
 [ 41  59  80]
 [ 88 123 346]]


In [12]:
test_pred_probs = model.predict(X_test_norm)

In [13]:
test_pred_probs[0]

array([0.32085216, 0.2987421 , 0.38040578], dtype=float32)