In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import wandb
import numpy as np
import pandas as pd

from fastprogress import master_bar, progress_bar
from sklearn.model_selection import train_test_split

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33m04janik[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
df = pd.read_csv('train.csv', low_memory=False)

In [4]:
columns = list(df.columns)

f_target, f_home_team_name, f_away_team_name, f_match_date = [[feature] for feature in columns[1:5]]
f_league_name, f_league_id, f_is_cup, f_home_team_coach_id, f_away_team_coach_id = [[feature] for feature in columns[5:10]]

f_home_team_history_match_date = columns[10:20]
f_home_team_history_is_play_home = columns[20:30]
f_home_team_history_is_cup = columns[30:40]
f_home_team_history_goal = columns[40:50]
f_home_team_history_opponent_goal = columns[50:60]
f_home_team_history_rating = columns[60:70]
f_home_team_history_opponent_rating = columns[70:80]
f_home_team_history_coach = columns[80:90]
f_home_team_history_league_id = columns[90:100]
f_away_team_history_match_date = columns[100:110]
f_away_team_history_is_play_home = columns[110:120]
f_away_team_history_is_cup = columns[120:130]
f_away_team_history_goal = columns[130:140]
f_away_team_history_opponent_goal = columns[140:150]
f_away_team_history_rating = columns[150:160]
f_away_team_history_opponent_rating = columns[160:170]
f_away_team_history_coach = columns[170:180]
f_away_team_history_league_id = columns[180:190]

In [5]:
features_shitty = f_match_date + f_home_team_coach_id + f_away_team_coach_id + f_home_team_history_match_date + f_home_team_history_coach + f_away_team_history_match_date + f_away_team_history_coach
features_boolean = f_is_cup + f_home_team_history_is_play_home + f_home_team_history_is_cup + f_away_team_history_is_play_home + f_away_team_history_is_cup
features_numerical = f_home_team_history_goal + f_home_team_history_opponent_goal + f_home_team_history_rating + f_home_team_history_opponent_rating + f_away_team_history_goal + f_away_team_history_opponent_goal + f_away_team_history_rating + f_away_team_history_opponent_rating
features_categorical = f_home_team_name + f_away_team_name + f_league_name + f_league_id + f_home_team_history_league_id + f_away_team_history_league_id

In [6]:
# drop NaN and duplicates
df = df.dropna()
df = df.drop_duplicates()

# data processing 
data_set = df[['id']]
data_set = data_set.join(pd.get_dummies(df['target']))
data_set = data_set.join(df[features_boolean].astype(int))
data_set = data_set.join(df[features_numerical].astype(float))
data_set = data_set.join(pd.get_dummies(df[features_categorical[0:3]]))

In [7]:
data_set.shape

(64117, 11037)

In [8]:
targets = data_set.columns[1:4]
features = data_set.columns[4:]

# transform dataframe to tensor
x = torch.tensor(data_set[features].values).float()
y = torch.tensor(data_set[targets].values).float()

In [9]:
class DataSet():
    
    def __init__(self,x,y):
        self.x_train = x
        self.y_train = y

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self,idx):
        return self.x_train[idx], self.y_train[idx]

In [10]:
# split data in train and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22)

In [11]:
train_loader = torch.utils.data.DataLoader(DataSet(x_train,y_train), batch_size=128, shuffle=True)
test_loader =  torch.utils.data.DataLoader(DataSet(x_test,y_test), batch_size=128, shuffle=True)

In [12]:
class MLP(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(11033, 5000)
        self.fc2 = nn.Linear(5000, 1000)
        self.fc3 = nn.Linear(1000, 3)
        nn.init.kaiming_uniform_(self.fc1.weight, nonlinearity='relu')
        nn.init.kaiming_uniform_(self.fc2.weight, nonlinearity='relu')
        nn.init.xavier_uniform_(self.fc3.weight)
        nn.init.constant_(self.fc1.bias, 0.0)
        nn.init.constant_(self.fc2.bias, 0.0)
        nn.init.constant_(self.fc3.bias, 0.0)

    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = F.softmax(self.fc3(x))
        return x

In [13]:
model = MLP()
model.float()

learning_rate = 0.003
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [14]:
def test_model(model):
    
    train_mode = model.training
    
    if train_mode:
        model.eval()
    
    confusion = np.zeros((3,3), dtype=np.int32)
    
    for inputs, labels in DataSet(x_test, y_test):
        outputs = model(inputs).detach().numpy()
        confusion[np.argmax(labels), np.argmax(outputs)] += 1

    total = np.sum(confusion)
    accuracy = np.sum(np.diag(confusion)) / total
    per_class_accuracy = np.diag(confusion) / np.sum(confusion, axis=1)
    
    if train_mode:
        model.train()

    return accuracy, per_class_accuracy, confusion

In [15]:
def train_model(model, epochs):
    
    model.train()
    
    # construct name
    model_name = model.__class__.__name__
    optimizer_name = optimizer.__class__.__name__
    run_name = f'{model_name}-{optimizer_name}-lr{learning_rate}'
    
    # init weight and biases
    with wandb.init(project='DLL-Project', name=run_name) as run:
        
        # log some info
        run.config.learning_rate = learning_rate
        run.config.optimizer = optimizer.__class__.__name__
        run.watch(model)
        
        # progress bar
        mb = master_bar(range(epochs))
        
        for epoch in mb:
            
            for inputs, labels in progress_bar(iter(train_loader), parent=mb):

                # set all parameter gradients to zero
                optimizer.zero_grad()

                # compute the forward pass
                outputs = model.forward(inputs)

                # compute loss and backpropagate gradients
                loss = criterion(outputs, labels)
                loss.backward()

                # update parameters
                optimizer.step()

                # log the loss
                run.log({'loss': loss})
                
            # evaluate the model
            accuracy, per_class_accuracy, confusion = test_model(model)
            mb.main_bar.comment = f'val acc:{accuracy}'

            # log the data
            run.log({'accuracy': accuracy, 'epoch': epoch})

In [16]:
train_model(model, 1)

  x = F.softmax(self.fc3(x))


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁
loss,█▆▅▅▃▄▅▂▅▅▇▅▄▁▄▃▃▅▅▄▄▂▃█▅▆▄▅▃▂▅▅▄▂▄▄▃▇▄▂

0,1
accuracy,0.4777
epoch,0.0
loss,1.07441


In [None]:
test_model(model)

  x = F.softmax(self.fc3(x))
