In [1]:
import sys  
sys.path.insert(0, '..')

In [2]:
import pandas as pd
import torch

Loading prepared dataset with one-hot encoded columns of the order

In [5]:
one_hot_df = pd.read_parquet('data/one-hot-artificial.parquet')
one_hot_df["vote_for_itself"] = one_hot_df["vote_for_itself"].astype(int)

In [6]:
data_columns = 28
print("Dataset length: " + str(len(one_hot_df.index)))

Dataset length: 47007


Dataset preview. Getting just one single round

In [7]:
single_round_df = one_hot_df[(one_hot_df['year'] == 2013) & 
                             (one_hot_df['round'] == 'final') &
                             (one_hot_df['from_country_id'] == 'nl')]
single_round_df.head(30)

Unnamed: 0,year,round,from_country_id,to_country_id,from_country,to_country,total_points,tele_points,jury_points,song_id,...,v18,v19,v20,v21,v22,v23,v24,v25,v26,v27
35338,2013,final,nl,dk,Netherlands,Denmark,10,0.0,0.0,dk2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35339,2013,final,nl,az,Netherlands,Azerbaijan,2,0.0,0.0,az2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35340,2013,final,nl,ua,Netherlands,Ukraine,5,0.0,0.0,ua2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35341,2013,final,nl,no,Netherlands,Norway,6,0.0,0.0,no2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35342,2013,final,nl,ru,Netherlands,Russia,4,0.0,0.0,ru2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35343,2013,final,nl,gr,Netherlands,Greece,1,0.0,0.0,gr2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35344,2013,final,nl,it,Netherlands,Italy,0,0.0,0.0,it2013,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35345,2013,final,nl,mt,Netherlands,Malta,8,0.0,0.0,mt2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35346,2013,final,nl,nl,Netherlands,Netherlands,0,0.0,0.0,nl2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35347,2013,final,nl,hu,Netherlands,Hungary,7,0.0,0.0,hu2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Preparing dataset. We prepare torch vectors for each round of competition and single judge

In [8]:
class OneHotSet(torch.utils.data.IterableDataset):
    def __init__(self, dataframe, train_n):
        self.df = dataframe
        self.train_n = train_n
        
        self.competitions = []
        years = self.df.year.unique()
        for year in years:
            year_df = self.df[self.df['year'] == year]
            rounds = year_df['round'].unique()
            for rnd in rounds:
                round_df = self.df[(self.df['year'] == year) & (self.df['round'] == rnd)]
                judges = round_df['from_country_id'].unique()
                for judge in judges:
                    df = self.df[(self.df['year'] == year) & 
                                 (self.df['round'] == rnd) &
                                 (self.df['from_country_id'] == judge)]
                    
                    number = len(df.index)
                    inputColumns = ['v' + str(i) for i in range(data_columns)]
                    
                    inputs = torch.tensor(df[inputColumns].values, dtype=torch.float32)
                    targets = torch.tensor(df['place'].values, dtype=torch.long) + 1
                    vfi = torch.tensor(df['vote_for_itself'].values, dtype=torch.long)
                    
                    self.competitions.append(
                        (inputs, targets, vfi)
                    )
        
        print("Ds size: " + str(len(self.competitions)))
    
    def __iter__(self):
        def dataset_iterator():
            while True:
                for (inputs, targets, vfi) in self.competitions[:self.train_n]:
                    yield inputs, targets, vfi
        
        return dataset_iterator()
    
    def test_iter(self):
        def test_iterator():
            while True:
                for (inputs, targets, vfi) in self.competitions[self.train_n:]:
                    yield inputs, targets, vfi
        
        return test_iterator()
            

In [11]:
def get_sort_order(scores):
    s = torch.argsort(scores, descending=True)
    r = torch.zeros(scores.shape, dtype=torch.long)
    for i in range(scores.shape[-1]):
        r[0, s[0, i]] = i
    return r + 1

Loading the dataset into memory

In [9]:
from torch.utils.data import DataLoader
dataset = OneHotSet(one_hot_df, 1500)

Ds size: 2197


In [10]:
loader_iterator = iter(DataLoader(dataset))
test_iterator = iter(dataset.test_iter())

In [13]:
from torch.utils.tensorboard import SummaryWriter
from linear_model import LinearModel
from exploded_logit import ExplodedLogitLoss

loss_type = 'nll' # bce or nll

writer = SummaryWriter('runs/one-hot/' + loss_type)

dataset_size = 150_000
test_dataset_size = 2196-1500

linear_model = LinearModel(data_columns, 1)  # number of columns to score
optimizer = torch.optim.Adam(params=linear_model.parameters())
loss = ExplodedLogitLoss(loss_type=loss_type)

for step in range(dataset_size):
    data, order, vfi = next(loader_iterator)
    optimizer.zero_grad()

    score = linear_model(data).squeeze(-1)

    loss_value = loss(score, order)
    loss_value.backward()
    optimizer.step()
    
    writer.add_scalar('training loss',loss_value.item(), step)
    
    if step % 5000 == 0:
        print("Loss value: {0}".format(loss_value.item()))

with torch.no_grad():
    print_i = 0
    t_equal = 0
    for _ in range(test_dataset_size):
        data, expected_order, vfi = next(test_iterator)
        data = data.unsqueeze(0)
        expected_order = expected_order.unsqueeze(0)

        score = linear_model(data).squeeze(-1)
        actual_order = get_sort_order(score)

        if (not torch.equal(actual_order, expected_order)) and print_i < 10:
            print("Order not equal:\n{0}\n{1}\n\n".format(actual_order, expected_order))
            print_i += 1
        else:
            t_equal += 1


print("Finished. Equal " + str(t_equal))

Loss value: 15.260576670492824
Loss value: 20.774055578889765
Loss value: 17.667371174546993
Loss value: 3.968364132596349
Loss value: 6.93767133410752
Loss value: 7.047009445506877
Loss value: 1.596030179752896
Loss value: 3.0851244042278143
Loss value: 3.497842270429946
Loss value: 0.7338079988916228
Loss value: 1.508436155125032
Loss value: 1.8926510444735438
Loss value: 0.36128834850441166
Loss value: 0.7749704718933507
Loss value: 1.0393886122875087
Loss value: 0.18495275252489368
Loss value: 0.4075963615707011
Loss value: 0.5643931001569448
Loss value: 0.09682347639610427
Loss value: 0.21651253406802712
Loss value: 0.30437894125838355
Loss value: 0.05125488071582879
Loss value: 0.11556139313210398
Loss value: 0.16451588696930391
Loss value: 0.027307633933590283
Loss value: 0.06198568008529801
Loss value: 0.08916770412488367
Loss value: 0.014626963597987725
Loss value: 0.03335083886857354
Loss value: 0.04832695557314936
Finished. Equal 696


In [14]:
%load_ext tensorboard

In [16]:
%tensorboard --logdir="runs/one-hot"

Reusing TensorBoard on port 6006 (pid 22461), started 0:00:34 ago. (Use '!kill 22461' to kill it.)

In [17]:
linear_model.linear.weight

Parameter containing:
tensor([[142.8851, 135.8377, 128.7915, 121.7454, 114.6995, 107.6536, 100.6080,
          93.5624,  86.5169,  79.4715,  72.5291,  65.6617,  58.8690,  52.1369,
          45.4068,  38.6767,  32.0007,  25.6511,  19.4519,  13.6204,   7.9125,
           2.2812,  -3.1494,  -8.2682, -12.9904, -15.9294, -18.8516, -21.7499]],
       requires_grad=True)

In [18]:
linear_model.linear.bias

Parameter containing:
tensor([-0.1325], requires_grad=True)