In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
import pandas as pd
from tqdm import tqdm, trange
from matplotlib import pyplot as plt 
import networkx as nx
import numpy as np

%matplotlib inline

In [5]:
import os.path as osp

import torch
from torch_geometric.data import Dataset, download_url


class MyOwnDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.raw_dir
        self.processed_dir

    @property
    def raw_file_names(self):
        return ["CR_graph_rel.csv"]

    @property
    def processed_file_names(self):
        return ['data_0.pt']

    def download(self):
        # Download to `self.raw_dir`.
        pass
        
    
    def process(self):
        d = {
            'мордвины': torch.tensor(0, dtype=torch.long),
            'белорусы': torch.tensor(1, dtype=torch.long),
            'украинцы': torch.tensor(2, dtype=torch.long),
            'южные-русские': torch.tensor(3, dtype=torch.long),
            'северные-русские': torch.tensor(4, dtype=torch.long)
        }
        oho = {
            'мордвины': torch.tensor([1, 0, 0, 0, 0], dtype=torch.float),
            'белорусы': torch.tensor([0, 1, 0, 0, 0], dtype=torch.float),
            'украинцы': torch.tensor([0, 0, 1, 0, 0], dtype=torch.float),
            'южные-русские': torch.tensor([0, 0, 0, 1, 0], dtype=torch.float),
            'северные-русские': torch.tensor([0, 0, 0, 0, 1], dtype=torch.float)
        }
        
        idx = 0
        for raw_path in self.raw_paths:
            edge_index = []
            edge_attr = []
            
            y_labels = {}
            oho_labels = {}
            
            dataset_csv = pd.read_csv(raw_path)
            
            for index, row in tqdm(dataset_csv.iterrows()):
                node1 = row["node_id1"]
                node2 = row["node_id2"]
                label1 = row["label_id1"]
                label2 = row["label_id2"]
                ibd_sum = row["ibd_sum"]

                id1 = int(node1[5:])
                id2 = int(node2[5:])

                edge_index.append(torch.tensor([id1, id2], dtype=torch.long))
                edge_attr.append(torch.tensor([ibd_sum], dtype=torch.float))

                if id1 not in oho_labels:
                    y_labels[id1] = d[label1]
                    oho_labels[id1] = oho[label1]
                if id2 not in oho_labels:
                    y_labels[id2] = d[label2]
                    oho_labels[id2] = oho[label2]
            
            y_labels = dict(sorted(y_labels.items()))
            y = torch.stack(list(y_labels.values()))
            
            oho_labels = dict(sorted(oho_labels.items()))
            x = torch.stack(list(oho_labels.values()))
            edge_attr = torch.stack(edge_attr).contiguous()
            edge_index = torch.stack(edge_index).t().contiguous()

            data = Data(x=x ,edge_index=edge_index, edge_attr=edge_attr, y=y)
            torch.save(data, osp.join(self.processed_dir, f'data_{idx}.pt'))
            idx += 1

    def len(self):
        return len(self.processed_file_names)

    def get(self, idx):
        data = torch.load(osp.join(self.processed_dir, f'data_{idx}.pt'))
        return data

In [6]:
dataset = MyOwnDataset(root="data/")

In [7]:
data = dataset.get(0)

In [8]:
data

Data(x=[3767, 5], edge_index=[2, 67503], edge_attr=[67503, 1], y=[3767])

In [19]:
torch.tensor([154.97142857,   5.47142857,   4.97142857,  53.1       ,
         38.24285714])

tensor([154.9714,   5.4714,   4.9714,  53.1000,  38.2429])

In [None]:
torch.concat

In [31]:
d = {0: torch.tensor([0.60357202, 0.02130974, 0.01936238, 0.20681022, 0.14894564]),
 1: torch.tensor([0.00200091, 0.49544179, 0.0547664 , 0.35631331, 0.09147759]),
 2: torch.tensor([0.002216  , 0.06675369, 0.56663271, 0.29599465, 0.06840295]),
 3: torch.tensor([0.00323319, 0.05932558, 0.04043269, 0.8123698 , 0.08463874]),
 4: torch.tensor([0.01200443, 0.07851983, 0.04817019, 0.43633885, 0.4249667 ])}

In [34]:
res = []
for element in data.x:
    res.append(torch.concat((element, d[torch.argmax(element).item()]), dim=-1))

res = torch.stack(res).contiguous()

big_data = Data(x=res, edge_index=data.edge_index, edge_attr=data.edge_attr, y=data.y)

In [36]:
from torch_geometric.transforms import RandomNodeSplit

In [37]:
transform = RandomNodeSplit(split='train_rest', num_splits=1,
                                num_val=0.0, num_test=0.3)

In [38]:
big_data = transform(big_data)

In [39]:
len(big_data)

7

In [40]:
class FocalLoss(torch.nn.modules.loss._WeightedLoss):
    def __init__(self, weight=None, gamma=2,reduction='mean'):
        super(FocalLoss, self).__init__(weight,reduction=reduction)
        self.gamma = gamma
        self.weight = weight # weight parameter will act as the alpha parameter to balance class weights

    def forward(self, input, target):

        ce_loss = F.cross_entropy(input, target, reduction=self.reduction, weight=self.weight)
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma * ce_loss).mean()
        return focal_loss

In [51]:
import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        torch.manual_seed(1234)
        self.conv1 = GCNConv(10, 10, add_self_loops=False, normalize=True)
        #self.conv2 = GCNConv(5, 5, add_self_loops=False, normalize=True)
        self.fc1 = Linear(10, 10)
        self.fc2 = Linear(10, 10)
        self.fc3 = Linear(10, 10)
        self.fc4 = Linear(10, 5)

    def forward(self, x, edge_index, edge_weight):
        h = self.conv1(x, edge_index, edge_weight)
        h = h.relu()
        #h = self.conv2(h, edge_index, edge_weight)
        #h = h.relu()
        h = self.fc1(h)
        h = h.relu()
        h = self.fc2(h)
        h = h.relu()
        h = self.fc3(h)
        h = h.relu()
        h = self.fc4(h)
        return h

In [52]:
model = GCN()
criterion2 = torch.nn.CrossEntropyLoss()
criterion = FocalLoss(weight= 1. / torch.tensor([70, 463, 426, 2177, 631], dtype=torch.float))

In [53]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

def train(epoch):
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(big_data.x, big_data.edge_index, big_data.edge_attr)  # Perform a single forward pass.
    loss = criterion2(out[big_data.train_mask], big_data.y[big_data.train_mask])  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    experiment.log_metrics({"loss": loss}, epoch=epoch)
    return loss, criterion2(out[big_data.train_mask], big_data.y[big_data.train_mask])

In [61]:
t = trange(10000, leave=True)

with experiment.train():
    for epoch in t:
        loss, loss2 = train(epoch)
        t.set_description(str(round(loss.item(), 6)) + "___" + str(round(loss2.item(), 6)))

0.544908___0.544908: 100%|███████████████████████████████████████████████████████████████████| 10000/10000 [04:15<00:00, 39.19it/s]


In [62]:
def test():
    model.eval()
    out = model(big_data.x, big_data.edge_index, big_data.edge_attr)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[big_data.test_mask] == big_data.y[big_data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(big_data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc

In [63]:
test()

0.7814159292035399

In [57]:
out = model(data.x, data.edge_index, data.edge_attr)
pred = out.argmax(dim=1)
test_correct = pred[data.test_mask] == data.y[data.test_mask]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (3767x5 and 10x10)

In [533]:
d = {
            'мордвины': 0,
            'белорусы': 1,
            'украинцы': 2,
            'южные-русские': 3,
            'северные-русские': 4,
        }
counts = {0: 70, 1: 463, 2: 426, 3: 2177, 4: 631}

In [534]:
import sklearn.metrics
sklearn.metrics.confusion_matrix(data.y[data.test_mask], pred[data.test_mask], labels=None, normalize=None)

array([[ 23,   0,   0,   0,   2],
       [  9,  82,  16,  21,   9],
       [ 22,   7,  90,   4,   7],
       [ 23,  55,  51, 486,  48],
       [ 12,  12,   6,  19, 126]])

Focal loss

```
array([[ 23,   0,   0,   0,   2],
       [  9,  87,  14,  18,   9],
       [ 22,   9,  89,   2,   8],
       [ 23,  59,  54, 479,  48],
       [ 12,  12,   8,  17, 126]])
 ```
 
 CE loss
 
 ```
 array([[ 18,   0,   0,   5,   2],
       [  0,  67,  10,  56,   4],
       [  0,   4,  69,  50,   7],
       [  1,  17,  16, 601,  28],
       [  1,   7,   4,  46, 117]])
```

In [266]:
import numpy as np

In [269]:
W1 = np.array([[0, 0, 0, 0, 0], 
               [0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0],
               [1, 1, 1, 1, 1]])

X = np.array([[1, 2, 3, 4],
              [1, 2, 3, 4],
              [1, 2, 3, 4],
              [1, 2, 3, 4],
              [1, 2, 3, 4]])

In [272]:
np.dot(W1, X)

array([[ 0,  0,  0,  0],
       [ 0,  0,  0,  0],
       [ 0,  0,  0,  0],
       [ 0,  0,  0,  0],
       [ 5, 10, 15, 20]])

In [None]:
W1 = np.array([[1, 0, 0, 0, 0], 
               [0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0],
               [0, 0, 0, 0, 0],
               [1, 1, 1, 1, 1]])

X = np.array([[1, 2, 3, 4],
              [1, 2, 3, 4],
              [1, 2, 3, 4],
              [1, 2, 3, 4],
              [1, 2, 3, 4]])

In [274]:

m = GCN()
print(m)

out, h = m(data.x, data.edge_index, data.edge_attr)

GCN(
  (conv1): GCNConv(5, 16)
  (conv2): GCNConv(16, 5)
)


ValueError: too many values to unpack (expected 2)

In [306]:
inp = torch.tensor([[0.1, 0.8, 0.1]], dtype=torch.float)

In [305]:
inp.shape

torch.Size([1, 3])

In [300]:
# Example of target with class indices
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()
# Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = loss(input, target)
output.backward()

NameError: name 'nn' is not defined

In [316]:
criterion = torch.nn.CrossEntropyLoss()
input = torch.tensor([[3.2, 1.3,0.2, 0.8], 
                     [3.2, 1.3,0.2, 0.8],
                     [3.2, 1.3,0.2, 0.8]],dtype=torch.float)
target = torch.tensor([0, 0, 0], dtype=torch.long)
criterion(input, target)

tensor(0.2547)

In [313]:
input.softmax(dim=1)

tensor([[0.7751, 0.1159, 0.0386, 0.0703],
        [0.7751, 0.1159, 0.0386, 0.0703]])

In [78]:
data_csv = pd.read_csv("data/raw/CR_graph_rel.csv")

In [79]:
data_csv

Unnamed: 0,node_id1,node_id2,label_id1,label_id2,ibd_sum,ibd_n
0,node_0,node_5,мордвины,мордвины,29.81720,4
1,node_0,node_10,мордвины,мордвины,11.63220,1
2,node_0,node_11,мордвины,мордвины,23.90440,2
3,node_0,node_18,мордвины,мордвины,11.25290,1
4,node_0,node_20,мордвины,мордвины,8.88252,1
...,...,...,...,...,...,...
67498,node_3741,node_3752,белорусы,белорусы,9.51327,1
67499,node_3745,node_3755,белорусы,белорусы,9.23221,1
67500,node_3749,node_3764,белорусы,белорусы,10.63310,1
67501,node_3754,node_3755,украинцы,белорусы,8.04722,1


In [184]:
edge_index

tensor([[0],
        [5]])

In [187]:
d = {
            'мордвины': torch.tensor(0, dtype=torch.long),
            'белорусы': torch.tensor(1, dtype=torch.long),
            'украинцы': torch.tensor(2, dtype=torch.long),
            'южные-русские': torch.tensor(3, dtype=torch.long),
            'северные-русские': torch.tensor(4, dtype=torch.long)
        }



edge_index = []
edge_attr = []

y_labels = {}
oho_labels = {}

for index, row in tqdm(data_csv.iterrows()):
    node1 = row["node_id1"]
    node2 = row["node_id2"]
    label1 = row["label_id1"]
    label2 = row["label_id2"]
    ibd_sum = row["ibd_sum"]

    id1 = int(node1[5:])
    id2 = int(node2[5:])

    edge_index.append(torch.tensor([id1, id2], dtype=torch.long))
    edge_attr.append(torch.tensor([ibd_sum], dtype=torch.float))

    if id1 not in oho_labels:
        y_labels[id1] = d[label1]
    if id2 not in oho_labels:
        y_labels[id2] = d[label2]

y_labels = dict(sorted(y_labels.items()))
y = torch.stack(list(y_labels.values()))

edge_attr = torch.stack(edge_attr).contiguous()
edge_index = torch.stack(edge_index).t().contiguous()

67503it [00:13, 5057.07it/s]


In [192]:
from collections import defaultdict
counts = defaultdict(int)

for l in y_labels:
    counts[y_labels[l].item()] += 1 

In [193]:
counts

defaultdict(int, {0: 70, 1: 463, 2: 426, 3: 2177, 4: 631})

In [176]:
ibd_sum = row["ibd_sum"]

KeyError: 'ibd_sum'

In [163]:
from collections import defaultdict

d = defaultdict(int)
ibd_s = defaultdict(list)
connections = np.zeros([3767, 6], dtype=int)

nations = {
            'мордвины': 0,
            'белорусы': 1,
            'украинцы': 2,
            'южные-русские': 3,
            'северные-русские': 4,
        }

for index, row in tqdm(data_csv.iterrows()):
    node1 = row["node_id1"]
    node2 = row["node_id2"]
    label1 = row["label_id1"]
    label2 = row["label_id2"]
    ibd_sum = row["ibd_sum"]    
    
    ibd_ = []
    
    key = label1 + "_" + label2
    reverse_key = label2 + "_" + label1
    
    if reverse_key in d:
        d[reverse_key] += 1
        ibd_s[reverse_key].append(ibd_sum)
    else:
        d[key] += 1
        ibd_s[key].append(ibd_sum)
    
    id1 = int(node1[5:])
    id2 = int(node2[5:])
    
    connections[id1][nations[label2]+1] += ibd_sum
    connections[id2][nations[label1]+1] += ibd_sum
    connections[id1][0] = nations[label1]
    connections[id2][0] = nations[label2]

67503it [00:04, 14309.85it/s]


In [107]:
for key in ibd_s:
    print(key, round(np.mean(ibd_s[key]), 2), round(np.std(ibd_s[key]), 2))

мордвины_мордвины 20.73 20.28
мордвины_южные-русские 9.88 2.59
мордвины_северные-русские 10.91 4.22
мордвины_белорусы 9.56 1.95
мордвины_украинцы 9.54 1.63
белорусы_северные-русские 9.91 2.47
белорусы_южные-русские 9.99 2.57
белорусы_белорусы 27.42 235.24
белорусы_украинцы 10.1 8.05
украинцы_южные-русские 9.93 2.62
украинцы_северные-русские 9.89 2.28
украинцы_украинцы 36.6 274.93
южные-русские_южные-русские 14.5 104.91
южные-русские_северные-русские 9.96 2.54
северные-русские_северные-русские 12.52 72.48


In [121]:
d

defaultdict(int,
            {'мордвины_мордвины': 268,
             'мордвины_южные-русские': 394,
             'мордвины_северные-русские': 256,
             'мордвины_белорусы': 42,
             'мордвины_украинцы': 38,
             'белорусы_северные-русские': 1851,
             'белорусы_южные-русские': 7144,
             'белорусы_белорусы': 1759,
             'белорусы_украинцы': 1085,
             'украинцы_южные-русские': 4902,
             'украинцы_северные-русские': 1137,
             'украинцы_украинцы': 1231,
             'южные-русские_южные-русские': 33241,
             'южные-русские_северные-русские': 10225,
             'северные-русские_северные-русские': 3930})

In [164]:
person_nation_connections = defaultdict(list)
for i in range(len(connections)):
    person_nation_connections[connections[i][0]].append(connections[i][1:])

person_nation_connections_mean = {}
for key in person_nation_connections: 
    person_nation_connections_mean[key] = np.mean(np.array(person_nation_connections[key]), axis=0)

In [165]:
person_nation_connections_mean

{0: array([154.97142857,   5.47142857,   4.97142857,  53.1       ,
         38.24285714]),
 1: array([  0.82721382, 204.825054  ,  22.64146868, 147.30669546,
         37.81857451]),
 2: array([  0.81690141,  24.60798122, 208.88262911, 109.11502347,
         25.21596244]),
 3: array([  1.7073955 ,  31.32889297,  21.35186036, 428.99954065,
         44.69637115]),
 4: array([  4.24247227,  27.7496038 ,  17.02377179, 154.20602219,
        150.18700475])}

In [129]:
x[0][0] += 1

In [133]:
l = [1, 2, 3]

In [134]:
l[0] = 10

In [135]:
l

[10, 2, 3]

In [128]:
np.zeros([3767, 5])

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])