In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import matplotlib.pyplot as plt
import torchtext

import tiktoken
import pandas as pd
import numpy as np

from torch.nn import functional as F
import torch.nn as nn
from tqdm.notebook import tqdm
import pickle
import ast

In [2]:
import os

#dataset https://nlp.stanford.edu/projects/nmt/

#tiktoken api https://github.com/openai/tiktoken
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
tokenizer = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|PAD|>": 0,
        "<|START|>": 100278,
        "<|END|>": 100279,
        "<|DEL|>": 100280,
        "!": 100281
    }
)
print(tokenizer.n_vocab) #this is the number of tokens in our tokenizer
print(tokenizer._special_tokens) #prints out our special tokens 

specials = {"<|PAD|>","<|START|>","<|END|>", "<|DEL|>", "!"}

100282
{'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|endofprompt|>': 100276, '<|PAD|>': 0, '<|START|>': 100278, '<|END|>': 100279, '<|DEL|>': 100280, '!': 100281}


In [43]:
#GLOBALS

block_size = 32 #This is the value of T
batch_size = 16 #This it the value of B
n_embed = 64
dropout = 0.2
n_heads = 4
n_layers = 6

learning_rate = 3e-4

eval_interval = 500
eval_iters = 200
max_iters = 2000

vocab_size = tokenizer.n_vocab
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device is: {device}")

device is: cuda


In [44]:
def create_dataset():
    num_examples = 100000

    en_max = 0
    en_length = []
    with open(os.getcwd()+'\\data\\train_en.txt', 'r', encoding='utf8') as f:
        sentences_en = []
        for i in tqdm(range(num_examples)):
            line = f.readline()
            line = line.replace("\n", "")
            len_pad = 0
            sentence = "<|PAD|> " + (line) + " <|PAD|>"
            tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
            en_length.append(len(tok_sentence))
            if len(tok_sentence) > en_max:
                en_max = len(tok_sentence)
                print(en_max)

            if len(tok_sentence) <= block_size:
                len_pad = block_size - len(tok_sentence)
                tok_sentence = tok_sentence + len_pad*[0]
                assert len(tok_sentence) == block_size, print(len(tok_sentence))
                sentences_en.append(tok_sentence)
            else:
                sentences_en.append(block_size*[100280])

    en_length = torch.tensor(en_length).float()     
    print(en_max)    
    print(f"Length of sentences: {len(sentences_en)}")

    de_max = 0
    de_length = []
    with open(os.getcwd()+'\\data\\train_de.txt', 'r', encoding='utf8') as f:
        sentences_de = []
        for i in tqdm(range(num_examples)):
            line = f.readline()
            line = line.replace("\n", "")
            len_pad = 0
            sentence = "<|START|> " + (line) + " <|END|>"
            tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
            de_length.append(len(tok_sentence))
            if len(tok_sentence) > de_max:
                de_max = len(tok_sentence)
                print(de_max)

            if len(tok_sentence) <= block_size:
                len_pad = block_size - len(tok_sentence)
                tok_sentence = tok_sentence + len_pad*[0]
                assert len(tok_sentence) == block_size, print(len(tok_sentence))
                sentences_de.append(tok_sentence)
            else:
                sentences_de.append(block_size*[100280])

    de_length = torch.tensor(de_length).float()               
    print(de_max) 
    print(f"Length of sentences: {len(sentences_de)}")

    print("Removing sentences whos length is greater than our block_size")

    #combine the arrays together
    sentences = np.array([sentences_en, sentences_de])
    #check for indices in both sentences that have rows containing the DEL token
    idx = np.where(sentences == 100280)

    #delete every row that contains the DEL token
    sentences = np.delete(sentences, idx[1], axis = 1)

    #splitting to german and english

    sentences_en = torch.tensor(sentences[0], dtype=torch.long)
    sentences_de = torch.tensor(sentences[1], dtype=torch.long)

    print(f"Length of new english sentences: {len(sentences_en)}")
    print(f"Length of new german sentences: {len(sentences_de)}")

    print(f"Average length of english tokenized sentence: {torch.mean(en_length)}, and with std: {torch.std(en_length)}")
    print(f"Average length of german tokenized sentence: {torch.mean(de_length)}, and with std: {torch.std(de_length)}")

    with open(os.getcwd()+'\\data\\english_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_en, f)

    with open(os.getcwd()+'\\data\\german_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_de, f)


In [31]:
create = True
if create:
    create_dataset()

  0%|          | 0/100000 [00:00<?, ?it/s]

38
50
60
67
68
82
95
122
125
132
141
154
177
191
203
295
295
Length of sentences: 100000


  0%|          | 0/100000 [00:00<?, ?it/s]

60
73
83
91
105
120
130
135
161
163
187
239
267
284
284
Length of sentences: 100000
Removing sentences whos length is greater than our block_size
Length of new english sentences: 34865
Length of new german sentences: 34865
Average length of english tokenized sentence: 32.97871017456055, and with std: 17.489957809448242
Average length of german tokenized sentence: 42.1722297668457, and with std: 22.630157470703125


In [32]:
#TRAIN AND VAL DATASETS

with open(os.getcwd()+'\\data\\english_sentences.pkl', 'rb') as f:
    english_sentences = pickle.load(f)

with open(os.getcwd()+'\\data\\german_sentences.pkl', 'rb') as f:
    german_sentences = pickle.load(f)

In [35]:
#BATCH LOADER
n = int(0.9*len(english_sentences))

train_data_en = english_sentences[:n]
val_data_en = english_sentences[n:]

train_data_de = german_sentences[:n]
val_data_de = german_sentences[n:]

def get_batch(split):
    xdata = train_data_en if "train" else train_data_en
    ydata = train_data_de if "train" else val_data_de
    idx = torch.randint(len(xdata), (batch_size,))
    print(idx)
    x = torch.stack([xdata[i] for i in idx])
    t = torch.stack([ydata[i] for i in idx])

    #shifting our targets by 1 to the right
    y = t[:, 1:]
    #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
    y = F.pad(input = y, pad = (0,1,0,0), mode = 'constant', value = 0)

    x, y = x.to(device), y.to(device)

    return x, y, t

#xb, yb = get_batch('train')

In [36]:
#Pytorch's positional encoding https://pytorch.org/tutorials/beginner/transformer_tutorial.html
import math

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
            x: (T, B, C)
            We have to change our shape dimensions in to (T, B, C) and then change it back to (B, T, C) when done

        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [37]:
class Head(nn.Module):
    def __init__(self, head_size, decoder = False):
        super().__init__()
        self.decoder = decoder
        self.head_size = head_size
        self.Wk = nn.Linear(n_embed, head_size)
        self.Wq = nn.Linear(n_embed, head_size)
        self.Wv = nn.Linear(n_embed, head_size)

        if self.decoder:
            self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        #assume input is of size (B, T, C)
        K = self.Wk(x) #(B, T, head_size)
        Q = self.Wq(x) #(B, T, head_size)
        V = self.Wv(x) #(B, T, head_size)

        attention_scores = Q @ K.transpose(-2, -1) * 1/(self.head_size)**(1/2) #(B, T, T)

        if self.decoder:
            attention_scores = attention_scores.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)

        attention_scores = F.softmax(attention_scores, dim = -1) #(B, T, T)
        scores = self.dropout(attention_scores) #(B, T, T)
        out = scores @ V #(B, T, T) @ (B, T, head_size) = (B, T, head_size)

        return out

class crossHead(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.Wk = nn.Linear(n_embed, head_size)
        self.Wq = nn.Linear(n_embed, head_size)
        self.Wv = nn.Linear(n_embed, head_size)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_out):
        #assume x is of shape (B, T, C)
        #assume enc_out is of shape (B, T, C)

        K = self.Wk(enc_out) #(B, T, head_size)
        Q = self.Wq(x) #(B, T, head_size)
        V = self.Wv(enc_out) #(B, T, head_size)

        attention_scores = Q @ K.transpose(-2, -1) * 1/(self.head_size)**(1/2) #(B, T, T)
        attention_scores = attention_scores.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)
        attention_scores = F.softmax(attention_scores, dim = -1) #(B, T, T)
        scores = self.dropout(attention_scores) #(B, T, T)
        out = scores @ V #(B, T, head_size)

        return out


class MultiHeadSelfAttention(nn.Module):
    def __init__(self, head_size, decoder):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, decoder) for _ in range(n_heads)])
        #output of heads is of size (B, T, n_heads*head_size)
        self.proj = nn.Linear(head_size * n_heads, n_embed)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.proj(out)
        out = self.dropout(out)

        return out

class MultiHeadCrossAttention(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.heads = nn.ModuleList([crossHead(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_heads*head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out):
        x = torch.cat([h(x, enc_out) for h in self.heads], dim = -1)
        x = self.proj(x)
        x = self.dropout(x)

        return x

class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(n_embed, 4*n_embed)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(4*n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)

        return x

class EncoderBlock(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads

        self.sa = MultiHeadSelfAttention(head_size, decoder = False)
        self.ffw = FeedForward()
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        #assume input x is of size (B, T, C)
        x = self.layernorm1(x) #(B, T, C)
        x = x + self.sa(x) #(B, T, C)
        x = self.layernorm2(x) #(B, T, C)
        x = x = self.ffw(x) #(B, T, C)

        return x

class DecoderCrossBlock(nn.Module):
    #one implementation of the multi head cross attention block in the decoder
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads
        self.heads = MultiHeadCrossAttention(head_size)
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)
        self.ffw = FeedForward()

    def forward(self, x):
        #assume x is input of shape (B, T, C), x is the output of the decoder self attention layer
        #assume x is a list of length 2: first element is the output of the previous hidden layer, and the 2nd element is the output of the encoder
        x = x[0]
        enc_out = x[1]
        
        x = self.layernorm1(x) #(B, T, C)
        x = x + self.heads(x, enc_out) #(B, T, C)
        x = self.layernorm2(x) #(B, T, C)
        x = x + self.ffw(x) #(B, T, C)

        return [x, enc_out]

class DecoderSelfBlock(nn.Module):
    #one implementation of the multi head self attention block in the decoder
    def __init__(self):
        super().__init__()
        head_size = n_embed//n_heads
        self.sa = MultiHeadSelfAttention(head_size, decoder = True)
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)
        self.ffw = FeedForward()

    def forward(self, x):
        #assume x is of shape (B, T, C)
        x = self.layernorm1(x)
        x = x + self.sa(x)
        x = self.layernorm2(x)
        x = x + self.ffw(x)

        return x

class Transformer(nn.Module):

    def __init__(self):
        super().__init__()
        self.tok_embedding_matrix_x = nn.Embedding(vocab_size, n_embed)
        self.pos_embedding_x = PositionalEncoding(n_embed, dropout = dropout)

        self.tok_embedding_matrix_y = nn.Embedding(vocab_size, n_embed)
        self.pos_embedding_y = PositionalEncoding(n_embed, dropout = dropout)

        self.EncoderBlocks = nn.Sequential(*[EncoderBlock() for _ in range(n_layers)])
        self.DecoderSelfBlocks = nn.Sequential(*[DecoderSelfBlock() for _ in range(n_layers)])
        self.DecoderCrossBlocks = nn.Sequential(*[DecoderCrossBlock() for _ in range(n_layers)])

        self.final_layernorm = nn.LayerNorm(n_embed)
        self.final_linear = nn.Linear(n_embed, vocab_size)
        
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, x, y, targets = None):
        B, T = x.shape
        C = n_embed
        
        tok_embed_x = self.tok_embedding_matrix_x(x)
        pos_embed_x = self.pos_embedding_x(token_embed_x.view(T,B,C)).view(B, T, C)

        tok_embed_y = self.tok_embedding_matrix_y(y)
        pos_embed_y = self.pos_embedding_y(token_embed_y.view(T,B,C)).view(B, T, C)

        x = tok_embed_x + pos_embed_x
        y = tok_embed_y + pos_embed_y

        #encoder
        enc_out = self.EncoderBlocks(x)

        #decoder self
        y = self.DecoderSelfBlocks(y)
        
        #decoder cross
        y = self.DecoderCrossBlocks([y, enc_out])

        #grab the transformed decoder input from the cross attention layer
        y = y[0]
        #remaining layers
        y = self.final_layernorm(y)
        y = self.final_linear(y)

        if targets is not None:
            print(targets.shape)
            logits = y.view(B*T, -1)
            targets = targets.view(B*T)

            loss = F.cross_entropy(logits, targets)

        else:
            loss = None

        return logits, loss



In [45]:
model = Transformer()
m = model.to(device)

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

20.254266 M parameters


In [46]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [49]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, t, t = get_batch(split)
            logits, loss = model(x, y, t)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb, tb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb, tb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


In [None]:
filepath = os.getcwd()+"\\machine_model\\model.pt"
torch.save(model.state_dict(), filepath)
print("model saved at:", filepath)

rough work

In [12]:
xb, yb, t = get_batch('train')

tensor([22624, 43153, 64466,  2846,  7700, 43289, 29971, 24214, 58505, 65689,
        38889, 27415, 71818, 32024, 39233, 32323])


In [13]:
t.shape

torch.Size([16, 64])

In [14]:
yb.device

device(type='cpu')

In [15]:
tok_embedding_matrix = nn.Embedding(vocab_size, n_embed)
pos_embedding = PositionalEncoding(n_embed, dropout = dropout)

In [16]:
token_embed_x = tok_embedding_matrix(xb) #(B, T, C)
B, T, C = token_embed_x.shape
pos_embed_x = pos_embedding(token_embed_x.view(T,B,C)).view(B, T, C) #(B, T, C)

input = token_embed_x + pos_embed_x #(B, T, C)

token_embed_y = tok_embedding_matrix(yb) #(B, T, C)
B, T, C = token_embed_y.shape
pos_embed_y = pos_embedding(token_embed_y.view(T,B,C)).view(B, T, C) #(B, T, C)

target = token_embed_y + pos_embed_y #(B, T, C)

In [17]:
input.shape

torch.Size([16, 64, 512])

In [18]:
e = Head(head_size = 64, decoder = False)
e = MultiHeadSelfAttention(64, decoder = False)
e = EncoderBlock()

In [19]:
e(input).shape

out_e = e(input)

In [20]:
d = Head(head_size = 64, decoder = True)
d = MultiHeadSelfAttention(head_size = 64, decoder = True)
d = DecoderSelfBlock()

In [21]:
d(target).shape

out_d = d(target)

In [22]:
c = crossHead(head_size = 64)
c = MultiHeadCrossAttention(head_size = 64)
c = DecoderCrossBlock()
d = DecoderCrossBlock()

In [23]:
a = torch.cat((out_e, out_d), dim = 0)

In [24]:
a.shape

torch.Size([32, 64, 512])

In [26]:
m = Transformer()

In [27]:
m(xb, yb, t)

torch.Size([16, 64])


(tensor([[ 0.3266,  0.9439, -0.6072,  ..., -0.8942, -0.2462,  0.5541],
         [ 0.3558, -0.0816, -1.0313,  ...,  0.2293, -0.4682,  1.1330],
         [ 1.0091, -0.0226, -0.3073,  ..., -1.1772,  0.4133,  1.5344],
         ...,
         [ 0.7896,  0.3576, -0.5905,  ...,  0.0613, -0.5474,  0.6708],
         [ 0.1137,  0.1101, -0.9164,  ...,  0.1007, -0.6001,  0.5187],
         [ 0.6944,  0.8457, -0.8022,  ..., -0.2061, -0.3018,  0.6769]],
        grad_fn=<ViewBackward0>),
 tensor(11.1942, grad_fn=<NllLossBackward0>))

In [None]:
encoder = tiktoken.get_encoding("cl100k_base")


In [None]:
tokenizer.encode("<|PAD|>", allowed_special = specials)

[0]

In [None]:
encoding.decode([83, 1609, 5963, 374, 2294, 0])

'tiktoken is great!'

In [None]:
# block_size = 10
# num_examples = 5

# en_max = 0 
# with open(os.getcwd()+'\\data\\train_en.txt', 'r', encoding='utf8') as f:
#     idx_en = []
#     sentences_en = []
#     for i in tqdm(range(num_examples)):
#         line = f.readline()
#         line = line.replace("\n", "")
#         len_pad = 0
#         sentence = "<|PAD|> " + (line) + " <|PAD|>"
#         print(sentence)
#         print(tokenizer.encode(sentence, allowed_special = specials))
#         tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
#         print(len(tok_sentence))
#         if len(tok_sentence) > en_max:
#             en_max = len(tok_sentence)
#             print(en_max)

#         if len(tok_sentence) <= block_size:
#             len_pad = block_size - len(tok_sentence)
#             tok_sentence = tok_sentence + len_pad*[100277]
#             assert len(tok_sentence) == block_size, print(len(tok_sentence))
#             #idx_en.append(i)
#             sentences_en.append(tok_sentence)
#         else:
#             sentences_en.append(block_size*[100280])

# print(en_max)    
# print(f"Length of sentences: {len(sentences_en)}")


# de_max = 0 
# with open(os.getcwd()+'\\data\\train_de.txt', 'r', encoding='utf8') as f:
#     idx_de = []
#     sentences_de = []
#     for i in tqdm(range(num_examples)):
#         line = f.readline()
#         line = line.replace("\n", "")
#         len_pad = 0
#         sentence = "<|START|> " + (line) + " <|END|>"
#         tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
#         if len(tok_sentence) > de_max:
#             de_max = len(tok_sentence)
#             print(de_max)

#         if len(tok_sentence) <= block_size:
#             len_pad = block_size - len(tok_sentence)
#             tok_sentence = tok_sentence + len_pad*[100277]
#             assert len(tok_sentence) == block_size, print(len(tok_sentence))
#             #idx_en.append(i)
#             sentences_de.append(tok_sentence)
#         else:
#             sentences_de.append(block_size*[100280])
            
# print(de_max)  
# print(f"Length of sentences: {len(sentences_de)}")

  0%|          | 0/5 [00:00<?, ?it/s]

<|PAD|> iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould edges ( corners ) of the steel ingot mould . <|PAD|>
[0, 11245, 24532, 374, 264, 5644, 369, 1005, 25982, 902, 374, 17551, 439, 264, 1488, 1169, 555, 2231, 1919, 22145, 477, 14654, 304, 279, 51370, 13116, 320, 24359, 883, 315, 279, 9699, 6892, 354, 51370, 662, 220, 0]
38
38
<|PAD|> iron cement protects the ingot against the hot , abrasive steel casting process . <|PAD|>
[0, 11245, 24532, 36236, 279, 6892, 354, 2403, 279, 4106, 1174, 94804, 9699, 25146, 1920, 662, 220, 0]
18
<|PAD|> a fire restant repair cement for fire places , ovens , open fireplaces etc . <|PAD|>
[0, 264, 4027, 2800, 519, 13023, 24532, 369, 4027, 7634, 1174, 297, 21778, 1174, 1825, 4027, 27170, 5099, 662, 220, 0]
21
<|PAD|> Construction and repair of highways and ... <|PAD|>
[0, 24987, 323, 13023, 315, 60395, 323, 2564, 220, 0]
10
<|PAD|> An announcement must be commercial character . <|PAD|>
[0, 1556, 17480, 2

  0%|          | 0/5 [00:00<?, ?it/s]

60
60
Length of sentences: 5


In [None]:
# sentences = np.array([sentences_en, sentences_de])

# idx = np.where(sentences == 100280)
# print(idx[1])

[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 4 4 4 4
 4 4 4 4 4 4]


In [None]:
# sentences = np.delete(sentences, idx[1], axis = 1)

In [None]:
# sentences

array([], shape=(2, 0, 10), dtype=int32)