In [1]:
import pickle
import numpy as np

from ctokenizer import CTokenizer
from cgru import CGRU

# Sizes shared by the cells below and written into the header of the exported weights file.
num_classes = 104     # number of columns/rows in the per-class weight tables
num_tokens = 2**15    # length of the bag-of-tokens part of a feature vector (32768)
gru_emb_dim = 96      # number of GRU-state features appended after the token features

# Restore the fitted classifier from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open("artifacts/svc_merger.pkl", "rb") as file:
   svc = pickle.load(file)

In [10]:
def normalize_sparse_mat(x):
    """Scale each row of a CSR sparse container in place so its stored values sum to 1.

    Parameters
    ----------
    x : CSR sparse array or matrix
        Must expose ``data`` and ``indptr`` and support ``sum(axis=1)``.
        It is modified in place.

    Returns
    -------
    The same object ``x``, with ``x.data`` replaced by the normalized values.

    Notes
    -----
    Rows without stored entries are untouched. Rows whose stored entries sum
    to zero are left unchanged instead of being turned into NaN/inf.
    """
    # sum(axis=1) yields a 1-D ndarray for sparse arrays but an (n, 1)
    # np.matrix for sparse matrices; flatten both to one value per row.
    row_sum = np.asarray(x.sum(axis=1)).ravel()

    # Expand to one divisor per stored entry: row i owns indptr[i+1] - indptr[i] entries.
    divisors = np.repeat(row_sum, np.diff(x.indptr))

    # Dividing by 1 keeps zero-sum rows as they are and avoids 0/0.
    divisors = np.where(divisors == 0, 1, divisors)

    x.data = x.data / divisors
    return x


In [2]:
from scipy.sparse import load_npz, save_npz, csr_array, hstack
import numpy as np

def split_mat(mat, split):
    """Split a CSR matrix column-wise into ``[:, :split]`` and ``[:, split:]``.

    Parameters
    ----------
    mat : CSR sparse array or matrix
        Must expose ``data``, ``indices``, ``indptr`` and ``shape``. Not modified.
    split : int
        Column index of the cut; columns ``< split`` go left, the rest go right.

    Returns
    -------
    (left, right) : tuple of csr_array
        ``left`` has shape ``(n_rows, split)``; ``right`` has shape
        ``(n_rows, n_cols - split)`` with column indices shifted down by ``split``.

    Raises
    ------
    ValueError
        If ``split`` is outside ``[0, n_cols]``.
    """
    n_rows, n_cols = mat.shape
    if not 0 <= split <= n_cols:
        raise ValueError(f"split must be within [0, {n_cols}], got {split}")

    # One boolean per stored entry: True when the entry belongs to the left part.
    # Boolean selection keeps the row-major order of the entries.
    goes_left = mat.indices < split
    goes_right = ~goes_left

    # left_before[k] = number of left entries among the first k stored entries.
    # Looking it up at each row boundary gives the left indptr directly, which
    # replaces a Python loop over every row.
    left_before = np.concatenate(([0], np.cumsum(goes_left)))
    indptr_left = left_before[mat.indptr]
    indptr_right = mat.indptr - indptr_left

    left = csr_array(
        (mat.data[goes_left], mat.indices[goes_left], indptr_left),
        shape=(n_rows, split),
    )
    right = csr_array(
        (mat.data[goes_right], mat.indices[goes_right] - split, indptr_right),
        shape=(n_rows, n_cols - split),
    )

    return left, right

def normalize_sparse_mat(x):
    """Scale each row of a CSR sparse container in place so its stored values sum to 1.

    Parameters
    ----------
    x : CSR sparse array or matrix
        Must expose ``data`` and ``indptr`` and support ``sum(axis=1)``.
        It is modified in place.

    Returns
    -------
    The same object ``x``, with ``x.data`` replaced by the normalized values.

    Notes
    -----
    Rows without stored entries are untouched. Rows whose stored entries sum
    to zero are left unchanged instead of being turned into NaN/inf.
    """
    # sum(axis=1) yields a 1-D ndarray for sparse arrays but an (n, 1)
    # np.matrix for sparse matrices; flatten both to one value per row.
    row_sum = np.asarray(x.sum(axis=1)).ravel()

    # Expand to one divisor per stored entry: row i owns indptr[i+1] - indptr[i] entries.
    divisors = np.repeat(row_sum, np.diff(x.indptr))

    # Dividing by 1 keeps zero-sum rows as they are and avoids 0/0.
    divisors = np.where(divisors == 0, 1, divisors)

    x.data = x.data / divisors
    return x

# Read the stored validation features and convert to CSR, the layout that
# split_mat and normalize_sparse_mat read (data / indices / indptr).
mat = load_npz("artifacts/svc_val.npz").tocsr()

# Cut the columns at 2**15: the first 2**15 columns go to `left`, the remaining ones to `right`.
left, right = split_mat(mat, 2**15)
del mat  # release the unsplit matrix before the merged copy is built

# Only the left part is row-normalized; the right part is stacked back unchanged.
left = normalize_sparse_mat(left)
mat = hstack((left, right))

save_npz("artifacts/svc_val_norm.npz", mat)


In [5]:
mat.mean()

8.529913670037985e-05

: 

<339578x32864 sparse matrix of type '<class 'numpy.float64'>'
	with 82855476 stored elements in Compressed Sparse Row format>

array([    3,     7,    12, ..., 32861, 32862, 32863], dtype=int32)

In [None]:
# TODO(review): unfinished cell. Its entire content was the incomplete statement
# "for data", which is a SyntaxError when run. Finish the loop or delete the cell.

In [1]:
from gh_dataset import GHDataset
import pickle

# Build the two dataset objects; the validation data is taken from the "test" split.
# NOTE(review): the output recorded for this cell is a FileNotFoundError for
# 'artifacts/svc_labels.pkl' — presumably raised while constructing GHDataset; confirm.
train_data = GHDataset(split="train_svc")
val_data = GHDataset(split="test")

# Each label exposes an integer `.value`; subtracting 1 shifts it down by one.
# Presumably the enum values are 1-based and the classifier uses 0-based ids
# (the prediction cell adds 1 back before constructing `Languages`) — confirm.
train_labels = [label.value - 1 for label in train_data.labels]
val_labels = [label.value - 1 for label in val_data.labels]

FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/svc_labels.pkl'

In [None]:

# Persist both label lists as a single pickled tuple: (train_labels, val_labels).
with open("artifacts/svc_labels.pkl", "wb") as file:
    pickle.dump((train_labels, val_labels), file)

In [4]:
# Saving a dict makes numpy wrap it in a 0-d object array and pickle it, so it must be
# read back with np.load("svc_train_val_labels.npy", allow_pickle=True).item().
# The file is written to the current working directory, not to artifacts/.
np.save("svc_train_val_labels.npy", {"train": train_labels, "val": val_labels}, allow_pickle=True)

In [88]:
# Project-local helpers, constructed with their default arguments.
tokenizer = CTokenizer()
gru = CGRU()

# Small sample snippet used to exercise the whole pipeline by hand.
text = """for i in range(100):
    print("kek")"""

# Calling the tokenizer on a string yields the token ids for it
# (a list of ints, as the printed output of `tokens` shows).
tokens = tokenizer(text)

In [89]:
text

'for i in range(100):\n    print("kek")'

In [90]:
print(tokens)

[1065, 301, 321, 2456, 12, 1162, 917, 203, 262, 1196, 524, 464, 79, 576]


In [92]:
# Distinct token ids (sorted) and how many times each one occurs in `tokens`.
ids, counts = np.unique(tokens, return_counts=True)

In [93]:
# GRU part of the feature vector, computed from the token ids of the snippet.
last_state = gru.get_last_state(tokens)

# Bag-of-tokens part: one slot per token id, holding the relative frequency of
# that token in the snippet. Sized by `num_tokens` so it stays in sync with the
# weight tables instead of repeating the literal 2**15.
x = np.zeros(num_tokens)
x[ids] = counts
total = x.sum()
if total > 0:
    # Skip the division for an empty token list; 0/0 would fill x with NaN.
    x /= total

# Final layout: [token frequencies | GRU state], the column order the classifier's
# coefficients are sliced with (`[:, :num_tokens]` and `[:, num_tokens:]`).
x = np.hstack((x, last_state))

In [94]:
# `x` is reshaped to a single row (1, n_features) before being passed to the classifier.
# The +1 undoes the `label.value - 1` shift applied when the label lists were built,
# so the result can be turned back into a `Languages` member.
# NOTE(review): `Languages` is imported by a cell that appears further down in the
# notebook; running the cells top to bottom on a fresh kernel raises NameError here.
Languages(svc.predict(x.reshape(1, -1)) + 1)

<Languages.PYTHON: 74>

In [59]:
(gru_features_weights @ last_state).shape

(104,)

In [99]:
# Manual per-class score: pick the weight row of every token occurrence (rows of
# `token_weights` are indexed by token id) and add them up, giving one value per class.
score = token_weights[tokens].sum(axis=0)
# The two steps below are disabled, so `score` is the unnormalized token sum only:
# it is not divided by the number of tokens and does not include the GRU-state term.
# score /= len(tokens)
# score += gru_features_weights @ last_state

In [101]:
token_weights[1065]

array([ 5.48500953e-01, -1.07822231e+01, -1.58333587e-01, -5.83662514e-01,
        1.90037416e-03, -5.46997611e-01, -1.07822231e+01, -2.68381379e-01,
       -3.40238533e-01, -2.01642212e+00, -2.03740512e+00,  8.89330039e-01,
        1.79067743e+00,  1.47078896e+00,  2.08320156e-01, -2.16940025e+00,
        6.80515389e-01, -2.69705338e-01,  2.52905594e-01,  2.19625976e+00,
       -1.07822231e+01,  1.25748199e+00, -6.61646577e-01, -1.04252459e+00,
        9.13284600e-01, -2.19535983e-01, -1.12414521e+00,  1.86940772e-01,
        3.76554498e-02, -2.20717361e-01,  1.34327674e+00, -7.21737572e-01,
       -2.15448030e+00, -1.07822231e+01, -3.20463681e-01, -2.12290299e+00,
       -1.07822231e+01, -1.07822231e+01, -6.93930102e-01, -3.82528612e-01,
       -4.58171088e-01, -3.89225255e-01, -1.29412178e+00, -2.07791780e+00,
        1.74651458e+00, -1.07822231e+01, -8.45997495e-01, -5.55498250e-01,
        2.38589548e-01, -2.65539825e+00,  5.81392997e-02, -7.48759484e-01,
       -1.07822231e+01,  

In [100]:
score

array([ -10.26332766, -150.9511239 ,  -11.55078396,   -9.10363546,
         -9.32637723,  -22.77868286, -150.9511239 ,   -4.94420063,
         -9.20383207,  -28.02293762,  -17.5098908 ,  -13.28929685,
         -9.8904597 ,  -20.04028608,   -9.18860458,  -21.73361308,
        -13.77408641,  -12.68605762,  -11.768051  ,  -19.91190161,
       -150.9511239 ,  -16.46717219,  -10.64813372,  -18.61917764,
        -20.03647731,  -12.75203262,  -12.39543413,  -17.87998878,
         -4.73554725,  -10.18622961,  -20.90926641,  -10.3586582 ,
        -15.59625103, -150.9511239 ,  -12.94920233,  -16.18990732,
       -150.9511239 , -150.9511239 ,  -10.44885423,  -17.81005333,
         -7.00323279,   -9.6785614 ,  -12.69701738,  -18.99608558,
        -11.26999844, -150.9511239 ,  -16.3356461 ,  -24.94287704,
        -10.97731486,  -23.88760445,  -17.21532162,  -16.39572142,
       -150.9511239 ,  -12.33756946,  -15.31289007,   -6.03186063,
       -150.9511239 ,  -16.90670116,  -14.64769406,  -12.33648

In [98]:
score.argmax()

73

In [42]:
from languages_list import Languages

Languages(74)

<Languages.PYTHON: 74>

In [45]:
svc.classes_

array([ 0,  2,  3,  4,  5,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 38, 39,
       40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 53, 54, 55, 57, 58, 59,
       60, 61, 62, 63, 64, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78,
       79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
       96, 97, 98, 99])

In [2]:
# Expand the classifier coefficients into dense tables indexed by class id.
# `svc.coef_` has one row per class the model was fitted on (`svc.classes_`) and
# `num_tokens + gru_emb_dim` columns: token features first, GRU features after.
assert svc.coef_.shape == (len(svc.classes_), num_tokens + gru_emb_dim), svc.coef_.shape

# Class ids missing from `svc.classes_` keep the filler values: the smallest
# coefficient of the model for every token, and zeros for the GRU features.
# Sized with `num_classes` so the tables match the header of the exported file.
token_weights = np.full((num_tokens, num_classes), svc.coef_.min())
gru_features_weights = np.zeros((num_classes, gru_emb_dim))

# token_weights is (num_tokens, num_classes), so the token slice is transposed;
# gru_features_weights is (num_classes, gru_emb_dim) and takes the slice as is.
# NOTE(review): `svc.intercept_` is not carried over anywhere — confirm that is intended.
token_weights[:, svc.classes_] = svc.coef_[:, :num_tokens].T
gru_features_weights[svc.classes_] = svc.coef_[:, num_tokens:]

In [6]:
svc.coef_[:, 0]

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -4.23516474e-21,  0.00000000e+00,
        0.00000000e+00,  3.13402190e-19,  1.69406589e-21,  2.07523072e-20,
       -3.69161185e-03, -7.46258204e-04,  0.00000000e+00,  0.00000000e+00,
       -3.38813179e-21,  0.00000000e+00,  4.23516474e-20,  0.00000000e+00,
        0.00000000e+00, -9.69789842e-03, -8.47032947e-22, -4.23516474e-22,
       -2.54109884e-21,  0.00000000e+00,  2.03287907e-20,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -2.20228566e-20,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -1.69406589e-20, -5.59041745e-20,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        3.17637355e-21,  0.00000000e+00, -1.00702112e-06,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -3.55753838e-20,  0.00000000e+00,
       -1.82389953e-02,  

In [3]:
token_weights[0]

array([ 0.00000000e+00, -1.07822231e+01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -1.07822231e+01,  0.00000000e+00,
       -4.23516474e-21,  0.00000000e+00,  0.00000000e+00,  3.13402190e-19,
        1.69406589e-21,  2.07523072e-20, -3.69161185e-03, -7.46258204e-04,
        0.00000000e+00,  0.00000000e+00, -3.38813179e-21,  0.00000000e+00,
       -1.07822231e+01,  4.23516474e-20,  0.00000000e+00,  0.00000000e+00,
       -9.69789842e-03, -8.47032947e-22, -4.23516474e-22, -2.54109884e-21,
        0.00000000e+00,  2.03287907e-20,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -1.07822231e+01,  0.00000000e+00, -2.20228566e-20,
       -1.07822231e+01, -1.07822231e+01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -1.69406589e-20, -5.59041745e-20,
        0.00000000e+00, -1.07822231e+01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  3.17637355e-21,
       -1.07822231e+01,  

In [103]:
# Binary layout of svc_weights.bin (everything little-endian):
#   3 x unsigned 4-byte int : num_classes, num_tokens, gru_emb_dim
#   float32 x (num_tokens * num_classes)   : token_weights, row-major
#   float32 x (num_classes * gru_emb_dim)  : gru_features_weights, row-major
# Check the tables against the header values before the file is opened, so a
# mismatch cannot leave a truncated or inconsistent file behind.
assert token_weights.shape == (num_tokens, num_classes), token_weights.shape
assert gru_features_weights.shape == (num_classes, gru_emb_dim), gru_features_weights.shape

with open("solution/resources/svc_weights.bin", "wb") as file:
    file.write(num_classes.to_bytes(length=4, byteorder="little"))
    file.write(num_tokens.to_bytes(length=4, byteorder="little"))
    file.write(gru_emb_dim.to_bytes(length=4, byteorder="little"))

    # "<f4" pins the payload to little-endian float32 so it matches the header on
    # any host; plain np.float32 would use the byte order of the machine running this.
    file.write(token_weights.astype("<f4").tobytes())
    file.write(gru_features_weights.astype("<f4").tobytes())