In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import xgboost as xgb
from copy import deepcopy
from tqdm import tqdm
import time
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [2]:
train_home_team_statistics_df = pd.read_csv('./Train_Data/train_home_team_statistics_df.csv', index_col=0)
train_away_team_statistics_df = pd.read_csv('./Train_Data/train_away_team_statistics_df.csv', index_col=0)


train_scores = pd.read_csv('./Y_train_1rknArQ.csv', index_col=0)

train_home = train_home_team_statistics_df.iloc[:,2:]
train_away = train_away_team_statistics_df.iloc[:,2:]

train_home.columns = 'HOME_' + train_home.columns
train_away.columns = 'AWAY_' + train_away.columns

train_data =  pd.concat([train_home,train_away],join='inner',axis=1)
train_scores = train_scores.loc[train_data.index]

train_data = train_data.replace({np.inf:np.nan,-np.inf:np.nan})

In [3]:
train_scores.head()

Unnamed: 0_level_0,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,1
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0


We try to model if AWAY wins

In [4]:
train_new_y = train_scores['AWAY_WINS']

In [5]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, train_new_y, train_size=0.8, random_state=42)
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X_train, y_train, train_size=0.8, random_state=42)

Preprocessing (replace with real preprocessing steps later)

In [6]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

X_train_tensor = torch.tensor(X_train.fillna(0).values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid.fillna(0).values, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Define input size, hidden size, and number of classes
input_size = X_train.shape[1]
hidden_size = 32
num_classes = 2

model = NeuralNet(input_size, hidden_size, num_classes)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

# Training the model
num_epochs = 20
early_stopping_patience = 5
best_valid_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        l2_lambda = 0.05
        l2_reg = sum(param.pow(2.0).sum() for param in model.parameters())
        loss += l2_lambda * l2_reg
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Validation
    model.eval()
    valid_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    valid_loss /= len(valid_loader)
    accuracy = 100 * correct / total
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {valid_loss:.4f}, Accuracy: {accuracy:.2f}%')
    
    # Check for improvement
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered.")
            break
    
    scheduler.step()  # Adjust the learning rate


Epoch [1/20], Loss: 0.5734, Accuracy: 71.15%
Epoch [2/20], Loss: 0.5552, Accuracy: 72.02%
Epoch [3/20], Loss: 0.5559, Accuracy: 72.52%
Epoch [4/20], Loss: 0.5600, Accuracy: 71.71%
Epoch [5/20], Loss: 0.5922, Accuracy: 69.63%
Epoch [6/20], Loss: 0.5645, Accuracy: 72.07%
Epoch [7/20], Loss: 0.5539, Accuracy: 72.42%
Epoch [8/20], Loss: 0.5791, Accuracy: 71.25%
Epoch [9/20], Loss: 0.5571, Accuracy: 72.12%
Epoch [10/20], Loss: 0.5616, Accuracy: 72.47%
Epoch [11/20], Loss: 0.5607, Accuracy: 72.22%
Epoch [12/20], Loss: 0.5570, Accuracy: 71.86%
Early stopping triggered.


Accuracy

In [7]:
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

X_test_tensor = torch.tensor(X_test.fillna(0).values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
prediction_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

correct_predictions = 0
total_predictions = 0

model.eval()
with torch.no_grad():
    for inputs, labels in prediction_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

prediction_accuracy = 100 * correct_predictions / total_predictions
print(f'Accuracy of the model on the prediction set: {prediction_accuracy:.2f}%')

Accuracy of the model on the prediction set: 70.87%


Submission

In [8]:
test_home = pd.read_csv('./Test_Data/test_home_team_statistics_df.csv', index_col=0)
test_away = pd.read_csv('./Test_Data/test_away_team_statistics_df.csv', index_col=0)

test_home.columns = 'HOME_' + test_home.columns
test_away.columns = 'AWAY_' + test_away.columns
test_data = pd.concat([test_home, test_away], join='inner', axis=1)

test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

X_test_tensor = torch.tensor(test_data.fillna(0).values, dtype=torch.float32)

model.load_state_dict(torch.load('best_model.pth'))

model.eval()

test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

predictions = []

# Make predictions on the test set
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(inputs[0])
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.cpu().numpy())

predictions_df = pd.DataFrame(predictions, index=test_data.index, columns=['AWAY_WINS'])

submission = pd.DataFrame(index=test_data.index)
submission['HOME_WINS'] = (predictions_df['AWAY_WINS'] == 0).astype(int)
submission['DRAW'] = 0  # Assuming no draws in the predictions
submission['AWAY_WINS'] = predictions_df['AWAY_WINS']

submission.to_csv('./Test_Data/neural_network_submission.csv', index=True)