# What is it about

### Transformer. This notebook trains a set of transformers, one per permutation length (`n_size`). Below are my test results on the biggest element.


| n_size | path_length | n_ideal | train_time (min) | search_time (sec) |
|--------|-------------|-------|------------------|-------------------|
| 2      | 1           | 1     | 83               | 40                |
| 3      | 1           | 3     | 83               | 60                |
| 4      | 6           | 4     | 83               | 110               |
| 5      | 10          | 10    | 83               | 150               |
| 6      | 15          | 15    | 83               | 150               |
| 7      | 21          | 21    | 83               | 320               |
| 8      | 45          | 28    | 83               | 400               |
| 9      | 107         | 36    | 416              | 516               |
| 10     | false       | 45    | 416              | >10000            |
| 11     | false       | 55    | 416              | >10000            |
| 12     | false       | 66    | 416              | >10000            |
| 13     | 332         | 78    | 900              | 1281              |
| 14     | false       | 91    | 900              | >10000            |
| 15     | 361         | 105   | 900              | 5698              |
| 20     | false       | 190   | 2000             | >10000            |

### About my model: I use 4 versions of the model (the last column shows which n_sizes each version was used for):
| Version | proj_dim | n_layers | num_epochs | data_per_epoch | ns      |
|---------|----------|----------|------------|----------------|---------|
| 1       | 128      | 16       | 50         | 100_000        | 2-8     |
| 2       | 512      | 32       | 100        | 100_000        | 9-12    |
| 3       | 1024     | 16       | 100        | 200_000        | 13-18   |
| 4       | 1024     | 32       | 100        | 200_000        | 19 & 20 |

### Constants: beam_width: 2**11, max_length: 1500, batch_size: 32, n_heads: 4 

In [None]:
!git clone https://github.com/iKolt/cayleypy.git

In [None]:
from cayleypy.cayleypy import *

from tqdm import notebook as tqdm_notebook

import pandas as pd
import math
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Experiment configuration shared by the cells below: which generator family
# and size to build, how many scrambled states to prepare, and the keyword
# arguments that `solve` forwards to the beam search.
CFG = { 'dimension'               : 15, # N, passed to the selected generator-family builder
        'generators_family'       : 'S_N', # key into the `families` registry
        ##################################################################
        
    
        #'group_and_generators_id' : 'cube_3/3/3_12gensQTM',#'cube_4/4/4',
        #'flag_add_inverses'       :  True                 , 
        'range_of_cubies'         :  20                  , # number of scrambled states built for `states_to_solve`

       
        ### advanced beam search params
        # NOTE(review): the notes at the top of the notebook list beam_width 2**11,
        # while this value is 2**10 - confirm which one the reported runs used.
        'beam_width'                      : 2**10 , 
        'n_steps_limit'                   : 1500  , # also passed to random_walks() in `trainer`
        'n_step_size'                     : 1     , 
        'n_beam_candidate_states'         : 'Auto',
        'radius_destination_neigbourhood' : 0     ,
        'radius_beam_neigbourhood'        : 0     , # not used actually
        'bi_bfs_chunk_size_states'        : 2**16 , # not used actually
        'mode_bibfs_checks'               : (10,5), # not used actually
        'temperature'                     : 0     ,
        'temperature_decay'               : 1     , # NOTE(review): `solve` does not forward this key
        'alpha_past_states_attenuation'   : 0     ,
        'n_steps_to_ban_backtracking'     : 0     ,
        'flag_empty_backtracking_list'    : False ,
        'n_attempts_limit'                : 1     ,
        'do_check_stagnation'             : False , 
        'stagnation_steps'                : 12    ,
        'stagnation_thres'                : 0.005 ,
        'n_random_start_steps'            : 0     ,
        'diversity_func'                  : hamming_dist, # imported via the cayleypy star import
        'diversity_weight'                : 0     ,
        'sub_ray_split'                   : False ,
        #'random_seed'                     : -23057572
      }

In [None]:
%%time
# Identifiers of the puzzle / generator sets that get_dict_generators() is
# meant to be called with (two hard-coded QTM cubes first, then the ids that
# are looked up in the puzzle_info CSV).
list_groups_generators = [
    'cube_2/2/2_6gensQTM',
    'cube_3/3/3_12gensQTM',
    'cube_2/2/2',
    'cube_3/3/3',
    'cube_4/4/4',
    'cube_5/5/5',
    'cube_6/6/6',
    'cube_7/7/7',
    'cube_8/8/8',
    'cube_9/9/9',
    'cube_10/10/10',
    'cube_19/19/19',
    'cube_33/33/33',
    'wreath_100/100',
    'wreath_33/33',
    'wreath_21/21',
    'wreath_12/12',
    'wreath_7/7',
    'wreath_6/6',
    'globe_1/8',
    'globe_1/16',
    'globe_2/6',
]

def get_dict_generators( group_gens_id, flag_add_inverses = True  ):
    """Return the allowed moves of a puzzle as ``{move_name: permutation}``.

    Each permutation is a list of ints of the puzzle's state size.

    Parameters
    ----------
    group_gens_id : str
        ``'cube_2/2/2_6gensQTM'`` and ``'cube_3/3/3_12gensQTM'`` are
        hard-coded below. Any other id is looked up in the ``puzzle_type``
        column of ``/growth-in-finite-groups/puzzle_info.csv`` and its
        ``allowed_moves`` cell is parsed as JSON (after replacing single
        quotes by double quotes). An id that is absent from the CSV fails on
        the ``.iat[0]`` lookup.
    flag_add_inverses : bool
        If false, the moves are returned exactly as defined / loaded.
        If true, a copy is returned in which every move whose inverse is not
        already one of the moves gets an extra entry ``'-' + name`` holding
        its inverse permutation.

    Returns
    -------
    dict
        Move name -> permutation list.
    """
    if group_gens_id == 'cube_2/2/2_6gensQTM':
        dict_allowed_moves  = {
        'f0':[ 0,  1, 19, 17,  6,  4,  7,  5,  2,  9,  3, 11, 12, 13, 14, 15, 16, 20, 18, 21, 10,  8, 22, 23],
        '-f0':[ 0,  1,  8, 10,  5,  7,  4,  6, 21,  9, 20, 11, 12, 13, 14, 15, 16,  3, 18,  2, 17, 19, 22, 23],
        'r1':[ 0,  5,  2,  7,  4, 21,  6, 23, 10,  8, 11,  9,  3, 13,  1, 15, 16, 17, 18, 19, 20, 14, 22, 12],
        '-r1':[ 0, 14,  2, 12,  4,  1,  6,  3,  9, 11,  8, 10, 23, 13, 21, 15, 16, 17, 18, 19, 20,  5, 22,  7],
        'd0':[ 0,  1,  2,  3,  4,  5, 18, 19,  8,  9,  6,  7, 12, 13, 10, 11, 16, 17, 14, 15, 22, 20, 23, 21],
        '-d0':[ 0,  1,  2,  3,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13, 18, 19, 16, 17,  6,  7, 21, 23, 20, 22]}
    elif group_gens_id ==  'cube_3/3/3_12gensQTM':
        dict_allowed_moves  = {
        'U': [6, 3, 0, 7, 4, 1, 8, 5, 2, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 47, 21, 22, 50, 24, 25, 53, 27, 28, 38, 30, 31, 41, 33, 34, 44, 36, 37, 20, 39, 40, 23, 42, 43, 26, 45, 46, 29, 48, 49, 32, 51, 52, 35], 
        'D': [0, 1, 2, 3, 4, 5, 6, 7, 8, 15, 12, 9, 16, 13, 10, 17, 14, 11, 36, 19, 20, 39, 22, 23, 42, 25, 26, 45, 28, 29, 48, 31, 32, 51, 34, 35, 27, 37, 38, 30, 40, 41, 33, 43, 44, 18, 46, 47, 21, 49, 50, 24, 52, 53],
        'L': [44, 43, 42, 3, 4, 5, 6, 7, 8, 45, 46, 47, 12, 13, 14, 15, 16, 17, 24, 21, 18, 25, 22, 19, 26, 23, 20, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 11, 10, 9, 0, 1, 2, 48, 49, 50, 51, 52, 53],
        'R': [0, 1, 2, 3, 4, 5, 51, 52, 53, 9, 10, 11, 12, 13, 14, 38, 37, 36, 18, 19, 20, 21, 22, 23, 24, 25, 26, 33, 30, 27, 34, 31, 28, 35, 32, 29, 8, 7, 6, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 15, 16, 17],
        'B': [0, 1, 35, 3, 4, 34, 6, 7, 33, 20, 10, 11, 19, 13, 14, 18, 16, 17, 2, 5, 8, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 9, 12, 15, 42, 39, 36, 43, 40, 37, 44, 41, 38, 45, 46, 47, 48, 49, 50, 51, 52, 53],
        'F' : [24, 1, 2, 25, 4, 5, 26, 7, 8, 9, 10, 27, 12, 13, 28, 15, 16, 29, 18, 19, 20, 21, 22, 23, 17, 14, 11, 6, 3, 0, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 51, 48, 45, 52, 49, 46, 53, 50, 47] ,
        "U'" : [2, 5, 8, 1, 4, 7, 0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 38, 21, 22, 41, 24, 25, 44, 27, 28, 47, 30, 31, 50, 33, 34, 53, 36, 37, 29, 39, 40, 32, 42, 43, 35, 45, 46, 20, 48, 49, 23, 51, 52, 26] ,
        "D'" : [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 17, 10, 13, 16, 9, 12, 15, 45, 19, 20, 48, 22, 23, 51, 25, 26, 36, 28, 29, 39, 31, 32, 42, 34, 35, 18, 37, 38, 21, 40, 41, 24, 43, 44, 27, 46, 47, 30, 49, 50, 33, 52, 53] ,
        "L'" : [45, 46, 47, 3, 4, 5, 6, 7, 8, 44, 43, 42, 12, 13, 14, 15, 16, 17, 20, 23, 26, 19, 22, 25, 18, 21, 24, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 2, 1, 0, 9, 10, 11, 48, 49, 50, 51, 52, 53] ,
        "R'" : [0, 1, 2, 3, 4, 5, 38, 37, 36, 9, 10, 11, 12, 13, 14, 51, 52, 53, 18, 19, 20, 21, 22, 23, 24, 25, 26, 29, 32, 35, 28, 31, 34, 27, 30, 33, 17, 16, 15, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 6, 7, 8] ,
        "B'" : [0, 1, 18, 3, 4, 19, 6, 7, 20, 33, 10, 11, 34, 13, 14, 35, 16, 17, 15, 12, 9, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 8, 5, 2, 38, 41, 44, 37, 40, 43, 36, 39, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53] ,
        "F'" : [29, 1, 2, 28, 4, 5, 27, 7, 8, 9, 10, 26, 12, 13, 25, 15, 16, 24, 18, 19, 20, 21, 22, 23, 0, 3, 6, 11, 14, 17, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 47, 50, 53, 46, 49, 52, 45, 48, 51],
        }
    else:
        import json

        fn = '/growth-in-finite-groups/puzzle_info.csv'
        di = pd.read_csv(fn)
        m = di['puzzle_type'] == group_gens_id
        allowed_moves = di['allowed_moves'][m].iat[0]
        # The CSV cell uses single-quoted keys; JSON requires double quotes.
        allowed_moves = str( allowed_moves ).replace("'", '"')
        dict_allowed_moves = json.loads(allowed_moves)

    if not flag_add_inverses:
        return dict_allowed_moves

    def inverse_permutation(perm):
        # inverse[perm[i]] = i for every position i.
        inverse = [0] * len(perm)
        for i, p in enumerate(perm):
            inverse[p] = i
        return inverse

    # Compute every inverse exactly once. A move already "has" its inverse in
    # the set iff the move itself equals the inverse of some listed move, so a
    # set lookup replaces the former pairwise scan that re-inverted perm2 for
    # every (perm1, perm2) pair.
    inverse_by_name = {
        name: inverse_permutation(perm) for name, perm in dict_allowed_moves.items()
    }
    available_inverses = {tuple(inv) for inv in inverse_by_name.values()}

    dict_allowed_moves_with_inverses = dict_allowed_moves.copy()
    for name, perm in dict_allowed_moves.items():
        if tuple(perm) not in available_inverses:
            dict_allowed_moves_with_inverses['-' + name] = inverse_by_name[name]

    return dict_allowed_moves_with_inverses


# group_gens_id = 'cube_2/2/2_6gensQTM' 
# group_gens_id = 'cube_2/2/2'#_6gensQTM' 
# group_gens_id = CFG['group_and_generators_id']#'cube_3/3/3_12gensQTM'
# dict_generators = get_dict_generators( group_gens_id, flag_add_inverses = True  )
# list_generators = list(dict_generators.values())
# print('Geneators number:', len( dict_generators.keys()) , 'state_size:', len( list_generators[0] ))
# print('Geneators names:', dict_generators.keys() )
# print( str(dict_generators)[:2000] )

In [None]:
def make_S_N(N):
    """Return three generators on N points, each as a list of ints.

    In order: the list that swaps entries 0 and 1 of the identity, the list
    ``[1, 2, ..., N-1, 0]`` and the list ``[N-1, 0, 1, ..., N-2]``.
    """
    swap_first_two = [1, 0] + list(range(2, N))
    cyclic_plus_one = list(range(1, N)) + [0]
    cyclic_minus_one = [N - 1] + list(range(N - 1))
    return [swap_first_two, cyclic_plus_one, cyclic_minus_one]

def make_permutohedron(N):
    """Return the N-1 adjacent transpositions (i, i+1) as lists of ints.

    No wrap-around swap of positions 0 and N-1 is generated.
    """
    adjacent_swaps = []
    for left in range(N - 1):
        perm = list(range(N))
        perm[left], perm[left + 1] = perm[left + 1], perm[left]
        adjacent_swaps.append(perm)
    return adjacent_swaps

def make_all_pair_perms(N):
    """Return every transposition of two positions among N, as tuples.

    Each unordered pair {i, j} yields exactly one generator. Earlier the
    loop visited both (i, j) and (j, i), so every transposition appeared
    twice in the result; the order of first occurrences is unchanged.

    Returns a list of N*(N-1)/2 tuples of length N (empty for N < 2).
    """
    res = []

    for i in range(N):
        # j > i only: swapping (i, j) and (j, i) is the same permutation.
        for j in range(i + 1, N):
            candidate = list(range(N))
            candidate[i], candidate[j] = j, i
            res.append(tuple(candidate))

    return res

def make_pancake(N):
    """Return the N-1 prefix reversals of the identity on N points.

    The k-th generator reverses the first k entries (k = 2 .. N) and leaves
    the rest in place.
    """
    identity = list(range(N))
    return [identity[:k][::-1] + identity[k:] for k in range(2, N + 1)]

In [None]:
# Registry mapping a generator-family code (see CFG['generators_family'])
# to the function that builds that family's generators for a given N.
families = dict(
    S_N=make_S_N,
    PHD=make_permutohedron,
    PRS=make_all_pair_perms,
    PNC=make_pancake,
)

In [None]:
# Build the generators of the configured family for N = CFG['dimension'] ...
list_generators = families[CFG['generators_family']](CFG['dimension'])

# ... and wrap them in a cayleypy CayleyGraph. This global is read by the
# cells and functions below.
cayley_group_ex = CayleyGraph( list_generators, to_power=1 )#, random_seed = 4073572258)

In [None]:
# NOTE(review): cayleypy call whose semantics are not visible in this notebook;
# presumably it precomputes move-count statistics on the graph object - confirm.
cayley_group_ex.manhatten_moves_matrix_count(steps = 1000, to_power=1.6)

In [None]:
# Heuristic candidate: hamming_dist between a state and the graph's destination
# state. `cayley_group_ex` is looked up at call time, so the lambda follows any
# later rebinding of that global.
metric = lambda x: hamming_dist(x, cayley_group_ex.state_destination)

In [None]:
# Build CFG['range_of_cubies'] scrambled states; the argument alternates between
# 100 and 101 (presumably the scramble length - TODO confirm in cayleypy).
# The list is stored in reverse generation order.
states_to_solve = list(reversed([cayley_group_ex.scramble_state(100+q%2) for q in range(CFG['range_of_cubies'])]))

In [None]:
# Notebook inspection cell: display the first row of the graph's private `_vls`
# array (its meaning is defined inside cayleypy and is not visible here).
cayley_group_ex._vls[0,:]

In [None]:
def solve(sts, model):
    """Run cayleypy's beam search from state `sts`, scored by `model`.

    Uses the global `cayley_group_ex` graph and forwards the beam-search
    settings stored in the global `CFG` (see the key list below), with
    ``verbose=1``.

    Returns the three values produced by ``beam_search_permutations_torch``:
    ``(flag_found_destination, i_step, dict_additional_data)``.
    """
    forwarded_keys = (
        "beam_width",
        "n_steps_limit",
        "n_step_size",
        "n_beam_candidate_states",
        "radius_destination_neigbourhood",
        "radius_beam_neigbourhood",
        "bi_bfs_chunk_size_states",
        "mode_bibfs_checks",
        "temperature",
        "alpha_past_states_attenuation",
        "n_steps_to_ban_backtracking",
        "flag_empty_backtracking_list",
        "n_attempts_limit",
        "do_check_stagnation",
        "stagnation_steps",
        "stagnation_thres",
        "n_random_start_steps",
        "diversity_func",
        "diversity_weight",
        "sub_ray_split",
    )
    search_options = {key: CFG[key] for key in forwarded_keys}
    search_options["verbose"] = 1

    outcome = cayley_group_ex.beam_search_permutations_torch(
        state_start=sts,
        models_or_heuristics=model,
        **search_options,
    )

    # Unpack explicitly so a result of unexpected length still fails here.
    flag_found_destination, i_step, dict_additional_data = outcome
    return flag_found_destination, i_step, dict_additional_data

In [None]:
def gen_n(n):
    """Return the permutation ``[1, 0, n-1, n-2, ..., 2]`` as a NumPy int array.

    The first two entries of the identity are swapped and the remaining
    entries (positions 2 .. n-1) are reversed. For n < 2 there is nothing to
    swap and the identity ``np.arange(n)`` is returned (this used to raise
    IndexError).

    The per-iteration debug ``print`` of the swapped index pair was removed.
    """
    p = np.arange(n)
    if n < 2:
        return p

    # Fancy indexing copies the right-hand side, so the swap is safe in place.
    p[[0, 1]] = p[[1, 0]]
    # Reverse the tail; copy first because source and target overlap.
    p[2:] = p[:1:-1].copy()

    return p

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class PatchEncoder(nn.Module):
    """Add a learned positional embedding to the input.

    Holds one `projection_dim`-sized embedding vector per position
    (`num_patches` positions). `forward` adds the full
    ``(1, num_patches, projection_dim)`` embedding table to `patch`, relying
    on broadcasting for the batch dimension.
    """

    def __init__(self, num_patches, projection_dim):
        super(PatchEncoder, self).__init__()

        # Stored so that get_config() can report it; it was never assigned
        # before, which made get_config() raise AttributeError.
        self.num_patches = num_patches

        self.position_embedding = nn.Embedding(num_patches, projection_dim)
        # Buffer (not a parameter) so it follows the module across devices.
        self.register_buffer('positions', torch.arange(num_patches).unsqueeze(0))

    def forward(self, patch):
        """Return `patch` plus the positional embeddings of all positions."""
        encoded = patch + self.position_embedding(self.positions)
        return encoded

    def get_config(self):
        """Return the constructor setting reported by this layer."""
        config = {"num_patches": self.num_patches}
        return config

In [None]:
class MLP(nn.Module):
    """Feed-forward stack: for each entry of `hidden_units` a
    Linear -> GELU -> Dropout(dropout_rate) stage, chained in order.

    The output width is the last entry of `hidden_units`.
    """

    def __init__(self, input_dim, hidden_units, dropout_rate):
        super(MLP, self).__init__()
        layers = []
        in_dim = input_dim

        for units in hidden_units:
            layers.append(nn.Linear(in_dim, units))
            layers.append(nn.GELU())
            layers.append(nn.Dropout(dropout_rate))
            in_dim = units

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        return self.mlp(x)


class TransformerBlock(nn.Module):
    """Pre-norm transformer encoder block with two residual connections.

    ``x -> x + SelfAttention(LayerNorm(x)) -> (+ MLP(LayerNorm(.)))``.

    Input and output have shape ``(batch, tokens, projection_dim)``; the last
    entry of `transformer_units` must equal `projection_dim` for the second
    residual sum to be valid.
    """

    def __init__(self, projection_dim, num_heads, transformer_units, dropout_rate=0.01):
        super(TransformerBlock, self).__init__()
        self.norm1 = nn.LayerNorm(projection_dim, eps=1e-6)
        # batch_first=True: the model feeds (batch, tokens, dim). With the
        # default (batch_first=False) PyTorch reads dim 0 as the sequence
        # axis, so attention was computed ACROSS the samples of a batch and
        # each prediction depended on the other states in the same batch.
        # Parameter names/shapes are unchanged, so checkpoints still load.
        self.attention = nn.MultiheadAttention(
            projection_dim, num_heads, dropout=dropout_rate, batch_first=True
        )
        self.norm2 = nn.LayerNorm(projection_dim, eps=1e-6)
        self.mlp = MLP(projection_dim, transformer_units, dropout_rate)

    def forward(self, x):
        x1 = self.norm1(x)
        attn_output, _ = self.attention(x1, x1, x1)
        x2 = x + attn_output  # first residual connection
        x3 = self.norm2(x2)
        x3 = self.mlp(x3)
        return x2 + x3  # second residual connection

In [None]:
class AdvancedTransformerModel(nn.Module):
    """Transformer regressor that maps a state vector to one scalar.

    Pipeline (see `forward`): a Linear layer embeds the whole `color_size`-long
    state into `projection_dim`; PatchEncoder adds one positional embedding
    per position, broadcasting the embedded state to `color_size` tokens;
    `num_layers` TransformerBlocks follow; tokens are mean-pooled and a
    three-layer linear head produces the value.

    Parameters
    ----------
    color_size : int
        Length of the input state vector (also the number of tokens).
    projection_dim : int
        Model width.
    num_heads : int
        Attention heads per block.
    num_layers : int
        Number of TransformerBlocks.
    dropout_rate : float
        Dropout used in the output head only. The transformer blocks are
        built with TransformerBlock's own default rate.
        NOTE(review): confirm that not forwarding `dropout_rate` to the
        blocks is intended.
    """

    def __init__(self, color_size, projection_dim, num_heads, num_layers, dropout_rate=0.1):
        super(AdvancedTransformerModel, self).__init__()

        self.projection_dim = projection_dim
        self.embedding = nn.Linear(color_size, projection_dim)
        self.positional_encoder = PatchEncoder(color_size, projection_dim)
        self.transformer_units = [projection_dim * 2, projection_dim]
        self.transformer_blocks = nn.ModuleList(
            [
                TransformerBlock(projection_dim, num_heads, self.transformer_units)
                for _ in range(num_layers)
            ]
        )
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)

        self.dense_v1 = nn.Linear(projection_dim, projection_dim)
        self.dense_v2 = nn.Linear(projection_dim, projection_dim)
        self.dense_v3 = nn.Linear(projection_dim, 1)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return a 1-D tensor with one predicted value per input state.

        `x` is expected to be ``(batch, color_size)`` (assumed from the
        Linear embedding and the reshapes below - TODO confirm with callers).
        """
        # Cast to the embedding's dtype. Previously only uint8 input was
        # converted, so int64/int32/float64 states failed inside nn.Linear.
        target_dtype = self.embedding.weight.dtype
        if x.dtype != target_dtype:
            x = x.to(target_dtype)

        x = self.embedding(x).unsqueeze(1)   # (batch, 1, dim)
        x = self.positional_encoder(x)       # broadcast to (batch, color_size, dim)

        for block in self.transformer_blocks:
            x = block(x)

        x = x.permute(0, 2, 1)               # (batch, dim, tokens) for 1-D pooling

        x = self.global_avg_pool(x).squeeze(-1)  # mean over tokens -> (batch, dim)

        v = self.dense_v1(x)
        v = self.dropout(v)
        v = self.dense_v2(v)
        v = self.dropout(v)
        v = self.dense_v3(v)

        return v.flatten()

In [None]:
def trainer(model, dataset_size, n_epochs, once=32, verbose=True, n=0):
    """Train `model` with MSE loss on freshly generated random-walk data.

    Every epoch draws new data from the global `cayley_group_ex` via
    ``random_walks(CFG['n_steps_limit'], dataset_size // 200)`` and runs
    ``dataset_size // once`` mini-batches of size `once` over the first rows
    of it, moving each batch to the global `device`.

    Parameters
    ----------
    model : nn.Module
        Model to optimise in place (Adam, lr=1e-6).
    dataset_size : int
        Number of samples consumed per epoch.
    n_epochs : int
        Number of epochs.
    once : int
        Mini-batch size.
    verbose : bool
        If true, print per-epoch metrics and, after training, save
        ``checkpoint_{n}.pth`` and empty the CUDA cache.
    n : int
        Tag used in the checkpoint file name.

    Returns
    -------
    (model, losses)
        `losses` holds the mean per-batch MSE of every epoch (it is now
        filled regardless of `verbose`).

    Changes relative to the previous version: gradients are reset before
    each step (they used to accumulate over all batches); the epoch loss is
    averaged over batches instead of being divided by `dataset_size`; the
    checkpoint stores the last loss value rather than the criterion module;
    the trailing ``del`` statements, which could raise NameError when no
    epoch or batch ran, are gone.
    """
    model.train()

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-6)

    losses = []
    n_batches = dataset_size // once

    for epoch in range(n_epochs):
        running_loss = 0.0
        mse_sum = 0.0
        n_seen = 0

        X, y = cayley_group_ex.random_walks(CFG['n_steps_limit'], dataset_size//200)
        # NOTE(review): assumes random_walks returns at least `dataset_size`
        # rows; shorter data would yield empty batches - confirm in cayleypy.

        for batch in range(n_batches):
            start, stop = batch * once, (batch + 1) * once
            X_ = X[start:stop].float().to(device)
            y_ = y[start:stop].float().to(device)

            # Without this, .grad keeps summing over every previous batch.
            optimizer.zero_grad()
            preds = model(X_)
            loss = criterion(preds, y_)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            mse_sum += ((preds.detach() - y_) ** 2).sum().item()
            n_seen += y_.numel()

        # criterion already averages within a batch, so average over batches.
        train_loss = running_loss / max(n_batches, 1)
        train_mse = mse_sum / max(n_seen, 1)
        train_rmse = train_mse ** 0.5
        losses.append(train_loss)

        if verbose:
            print(f'Epoch {epoch}/{n_epochs}: loss: {train_loss};')
            print(f'MSE: {train_mse} RMSE: {train_rmse}')
            print()

        # Release this epoch's data before the next batch of walks is built.
        del X, y

    if verbose:
        checkpoint = {
            'epoch': n_epochs - 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # A plain float keeps the file loadable without unpickling a
            # module object (the criterion itself used to be stored here).
            'loss': losses[-1] if losses else None,
        }

        torch.save(checkpoint, f'checkpoint_{n}.pth')

        torch.cuda.empty_cache()

    return model, losses

In [None]:
# Driver: for every permutation size in `ns`, build the Cayley graph, train a
# model sized for that n, run the beam search on gen_n(n), and write the
# outcome to result_{n}.txt plus the loss curve to result_{n}.png.

# Sizes processed by this run.
ns = [13,14,15,16,17,18,19]

# Size groups; each group selects one row of `params` below.
ns1 = [2,3,4,5,6,7,8]
ns2 = [9,10,11,12]
ns3 = [13,14,15,16,17,18]
ns4 = [19,20]

# Groups of ns: 2-8, 9-12, 13-18, 19-20
# Each row: [projection_dim, num_layers, n_epochs, dataset_size]
params = [[128, 16, 50, 100_000], [512, 32, 100, 100_000], [1024, 16, 100, 200_000], [1024, 32, 100, 200_000]]

for n in ns:
    list_generators = families[CFG['generators_family']](n)

    # Rebinds the global graph that trainer() and solve() read.
    cayley_group_ex = CayleyGraph( list_generators, to_power=1 )
    # NOTE(review): `metric` is rebound here but not used anywhere in this loop.
    metric = lambda x: hamming_dist(x, cayley_group_ex.state_destination)
    
    # Base mini-batch size; doubled for groups ns1 and ns3.
    once = 32

    # NOTE(review): an n outside ns1..ns4 would leave `param` unset (or stale
    # from the previous iteration); every value currently in `ns` is covered.
    if n in ns1:
        param = params[0]
        once *= 2
    elif n in ns2:
        param = params[1]
    elif n in ns3:
        param = params[2] 
        once *= 2
    elif n in ns4:
        param = params[3]

    model = AdvancedTransformerModel(color_size = n, projection_dim = param[0], num_heads = 4, num_layers = param[1], dropout_rate=0.00).to(device)
    model, losses = trainer(model, param[3], param[2], once=once, verbose=True, n=n)

    # Search from the fixed target permutation produced by gen_n(n).
    flag_found_destination, i_step, dict_additional_data = solve(gen_n(n), model)

    # One value per line: found flag, step count, extra data dict.
    with open(f'result_{n}.txt', 'w') as f:
        f.write(str(flag_found_destination))
        f.write('\n')
        f.write(str(i_step))
        f.write('\n')
        f.write(str(dict_additional_data))
    
    # Save the per-epoch training loss curve.
    fig, ax = plt.subplots( nrows=1, ncols=1 )  
    ax.plot(losses)
    fig.savefig(f'result_{n}.png') 
    plt.close(fig) 

    # Drop references before the next (possibly larger) model is allocated.
    del flag_found_destination
    del i_step
    del dict_additional_data
    del model
    del losses
    
    import gc
    gc.collect()
    
    torch.cuda.empty_cache()