In [1]:
import pandas as pd
import torch
import torch.nn as nn
import re
from collections import Counter
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence, pad_packed_sequence
import os
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

#path = r'/kaggle/input/eng-french/eng_french.csv'
path = r'C:\Users\harish-4072\Downloads\eng_french.csv'
df = pd.read_csv(path, names=['English','French'], header=0)

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'[^a-z\s]', '', text)  
    tokens = text.split()  
    return tokens

def tokenize_text(tokens,token_to_id):
    tokens = [token_to_id.get(token,0) for token in tokens]
    return [1] + tokens + [2]

def tokenize_text_without_special_tkns(tokens,token_to_id):
    return [token_to_id.get(token,0) for token in (tokens)]
    
english_sentences = df['English'].dropna().apply(preprocess_text)
french_sentences = df['French'].dropna().apply(preprocess_text)
english_vocab = Counter([token for sentence in english_sentences for token in sentence])
french_vocab = Counter([token for sentence in french_sentences for token in sentence])
english_token_to_id = {token: idx + 1 for idx, token in enumerate(english_vocab)}  
french_token_to_id = {token: idx + 3 for idx, token in enumerate(french_vocab)}

english_token_to_id['<PAD>'] = 0
french_token_to_id['<PAD>'] = 0
french_token_to_id['<SOS>'] = 1
french_token_to_id['<EOS>'] = 2
french_id_to_token= {value:key for key,value in french_token_to_id.items()}
english_vocab_size = len(english_token_to_id)
french_vocab_size = len(french_token_to_id)
english_sequences = english_sentences.apply(lambda x: tokenize_text_without_special_tkns(x, english_token_to_id))
french_sequences = french_sentences.apply(lambda x: tokenize_text(x, french_token_to_id))

In [4]:
class SentencesDataset(Dataset):
    def __init__(self,english_sequences,french_sequences):
        self.english_sequences = english_sequences
        self.french_sequences = french_sequences
        assert len(self.english_sequences) == len(self.french_sequences)

    def __len__(self):
        return len(self.english_sequences)

    def __getitem__(self,idx):
        X= self.english_sequences[idx]
        y= self.french_sequences[idx]
        return torch.tensor(X,dtype=torch.long).to(device),torch.tensor(y,dtype=torch.long).to(device)

In [5]:
def collate_fn(batch):
    X,y = zip(*batch)
    X_lengths = [len(item) for item in X]
    y_lengths = [len(item) for item in y]
    X_padded = pad_sequence(X, batch_first=True, padding_value=0)
    y_padded = pad_sequence(y, batch_first=True, padding_value=0)
    return X_padded, y_padded, X_lengths, y_lengths

In [6]:
english_temp, french_temp = english_sequences[:1000].reset_index(drop=True), french_sequences[:1000].reset_index(drop=True)

In [7]:
dataset = SentencesDataset(english_temp,french_temp)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True,collate_fn = collate_fn)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False,collate_fn = collate_fn)

In [8]:
EMBEDDING_DIM = 30
HIDDEN_DIM = 128
NUM_LAYERS = 1
DROPOUT = 0.3
SRC_VOCAB_SIZE = english_vocab_size  
PAD_IDX = 0 
TRG_VOCAB_SIZE = french_vocab_size  


In [9]:

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers, dropout,padding_idx):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim*2,hidden_dim)

    def forward(self, src):
        #src - batch*l
        #embedded - batch*l*embedding
        #outputs - batch*l*hidden_dim*2
        #final_hidden - 1*batch*hidden_dim (same as decoder GRU hidden size)
        embedded = self.dropout(self.embedding(src)) 
        outputs, hidden = self.rnn(embedded) 
        final_hidden = torch.tanh(
            self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        )
        return outputs, final_hidden.unsqueeze(0) 

In [10]:
class BahdanauAttention(nn.Module):
    
    def __init__(self, encoder_hdim, decoder_hdim):
        super(BahdanauAttention, self).__init__()
        self.linear = nn.Linear((encoder_hdim*2)+decoder_hdim, decoder_hdim)
        self.fc = nn.Linear(decoder_hdim,1,bias=False)

    def forward(self, encoder_outputs, decoder_hidden):
        #et​=vTtanh(W[hs​;st−1​])
        
        #encoder_outputs - batch*l*hidden_dim*2
        #decoder_hidden - 1*batch*hidden_dim 
        #decoder_hidden_expanded - batch*l*hidden_dim
        #a - batch*l* hidden_dim*2+hidden_dim
        #energy - batch*l*hidden_dim
        #output - batch*l*1
        src_len = encoder_outputs.shape[1]
        decoder_hidden = decoder_hidden.permute(1,0,2)
        decoder_hidden_expanded = decoder_hidden.repeat(1, src_len, 1)  
        weighted = torch.cat([decoder_hidden_expanded,encoder_outputs], dim=-1)
        attention_weights = torch.tanh(self.linear(weighted))
        return torch.softmax(self.fc(attention_weights),dim=1)

In [11]:
class BahdanauAttention2(nn.Module):
    def __init__(self, encoder_hdim, decoder_hdim, attention_weight):
        super(BahdanauAttention2, self).__init__()
        self.Wa = nn.Linear((encoder_hdim*2), attention_weight)
        self.Ua = nn.Linear(decoder_hdim, attention_weight)
        self.fc = nn.Linear(attention_weight,1,bias=False)

    def forward(self, encoder_outputs, decoder_hidden):
        #et​=vTtanh(Wa​hs​+Ua​st−1​)
        #encoder_outputs - batch*l*hidden_dim*2
        #decoder_hidden - 1*batch*hidden_dim 
        #a - batch*l* hidden_dim*2+hidden_dim
        #energy - batch*l*hidden_dim
        #output - batch*l*1
        src_len = encoder_outputs.shape[1]
        decoder_hidden = decoder_hidden.permute(1,0,2)
        weighted = torch.tanh(self.Wa(encoder_outputs) + self.Ua(decoder_hidden))
        attention_weights = self.fc(weighted)
        return torch.softmax(self.fc(attention_weights),dim=1)

In [12]:
class Decoder(nn.Module):
    def __init__(self,output_dim, emb_dim, encoder_hdim, decoder_hdim,dropout = 0.5):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.attention = attention
        self.rnn = nn.GRU(emb_dim+(encoder_hdim*2), decoder_hdim, batch_first=True, bidirectional=False)
        self.fc = nn.Linear((decoder_hdim+(encoder_hdim*2)+emb_dim),output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, src, encoder_outputs, hidden):
        #src = batch
        #encoder_outputs - batch*l*hidden_dim*2
        #hidden - 1*batch*hidden_dim 
        #embedded = batch*1*emd_dim
        #attention_weights - batch*l*1
        #weighted - batch*1*hidden_dim*2
        #energy = batch*1*emd_dim.cat(batch*1*hidden_dim*2) = batch*1*(emd_dim+hidden_dim*2)
        #output, hidden = batch * 1 * hidden_dim
        #torch.cat((output,embedded,weighted.permute(0,2,1)) = batch*1*hidden_dim+(hidden_dim*2)+emb_dim
        #predictions = batch*1*target_vocab_size
        #hidden = 1*batch*hidden_dim
        embedded = self.dropout(self.embedding(src.unsqueeze(1)))
        attn_weights = self.attention(encoder_outputs, hidden)
        weighted = torch.bmm(attn_weights.permute(0,2,1),encoder_outputs)
        rnn_input  = torch.cat((embedded,weighted),dim=2)
        output,hidden = self.rnn(rnn_input ,hidden)
        predictions = self.fc(torch.cat((output,embedded,weighted),dim=2))
        return predictions.squeeze(1), hidden

In [13]:
class SeqToSeq(nn.Module):
    def __init__(self, encoder, decoder):
        super(SeqToSeq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio):
        #src = batch*l
        #trg = batch*MAX_LENGTH
        #src_len = batch*l
        #encoder_outputs - batch*l*hidden_dim*2
        #hidden - 1*batch*hidden_dim 
        #outputs - batch*MAX_LENGTH*TRG_VOCAB_SIZE
        #input - batch
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        encoder_outputs, hidden = self.encoder(src)
        outputs = torch.zeros(batch_size, trg_len, TRG_VOCAB_SIZE)
        input = trg[:, 0]
        for t in range(1,trg_len):
            #predictions - batch*1*target_vocab_size
            #hidden - 1*batch*hidden_dim
            #outputs - batch*MAX_LENGTH*TRG_VOCAB_SIZE
            #top1, input - batch
            predictions, hidden = self.decoder(input, encoder_outputs, hidden)
            outputs[:, t, :] = predictions  
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = predictions.argmax(1) 
            input = trg[:, t] if teacher_force else top1
        return outputs

In [14]:
encoder = Encoder(
    input_dim=SRC_VOCAB_SIZE,
    emb_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    padding_idx = 0
)



In [15]:
attention = BahdanauAttention(
    encoder_hdim= HIDDEN_DIM, 
    decoder_hdim=HIDDEN_DIM
)


In [16]:
# for X, y,_,_ in train_loader:
#     outputs, hidden = encoder(X)
# print(outputs.shape, hidden.shape)
# print(attention(outputs,hidden).shape)
# input = torch.randint(0, 100, (1,32))
# input.squeeze(0).shape
# a,b = decoder(input.squeeze(0), outputs,hidden)
# a.shape,b.shape

In [17]:
decoder = Decoder(
    output_dim=TRG_VOCAB_SIZE,
    emb_dim=EMBEDDING_DIM,
    encoder_hdim=HIDDEN_DIM,
    decoder_hdim=HIDDEN_DIM,
    dropout = 0.5
)

In [18]:
model = SeqToSeq(encoder, decoder).to(device)
if os.path.exists("seq2seq_model_weights_attention.pth"):
    model.load_state_dict(torch.load("seq2seq_model_weights_attention.pth"))

  model.load_state_dict(torch.load("seq2seq_model_weights_attention.pth"))


In [19]:


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 13,284,308 trainable parameters


In [20]:
EPOCHS = 7
LEARNING_RATE = 0.01

In [21]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [22]:
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for src, trg, src_lengths,_ in train_loader:
        optimizer.zero_grad()
        output = model(src, trg,  0.5).to(device)
        output = output[:, 1:].reshape(-1, output.shape[-1])  
        trg = trg[:, 1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        epoch_loss += loss.item()
        torch.save(model.state_dict(), "seq2seq_model_weights_attention.pth")
    print(f"Epoch: {epoch + 1}/{EPOCHS}, Loss: {epoch_loss / len(train_loader):.4f}")

Epoch: 1/7, Loss: 9.5727
Epoch: 2/7, Loss: 8.4130
Epoch: 3/7, Loss: 6.7398
Epoch: 4/7, Loss: 5.3072
Epoch: 5/7, Loss: 4.7187
Epoch: 6/7, Loss: 4.5389
Epoch: 7/7, Loss: 4.4294


In [23]:
model.eval()
epoch_loss = 0

with torch.no_grad():
    for src, trg, src_lengths, _ in val_loader:
            
        output = model(src, trg, teacher_forcing_ratio=0.5).to(device)
        output = output[:, 1:].reshape(-1, output.shape[-1])  # Ignore <sos> token
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        epoch_loss += loss.item()

    print(epoch_loss / len(val_loader))

5.460605621337891


In [24]:
def infer(model, src,french_token_to_id, max_len=50):
    
    model.eval()
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src)

        # Start with <sos> token
        trg_vocab_size = TRG_VOCAB_SIZE
        input = torch.tensor([1], device=device)
        predictions = []

        for _ in range(max_len):
            output, hidden = model.decoder(input, encoder_outputs, hidden)
            top1 = output.argmax(1)  # Get the token with highest probability
            predictions.append(top1.item())
            print(top1)
            if top1.item() == french_token_to_id['<EOS>']:
                break

            input = top1.to(device)  # Use the predicted token as input for the next step
    return [french_id_to_token[idx] for idx in predictions]


In [25]:
sentence = "I like you"
sentence = preprocess_text(sentence)
sentence = tokenize_text(sentence, english_token_to_id)
output = infer(model, torch.tensor([sentence]).to(device),french_token_to_id)

tensor([22])
tensor([2])


In [26]:
output

['je', '<EOS>']

In [27]:
import re

def read_properties(file_path):
    """Reads a properties file and returns a list of (key, value) tuples."""
    props = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line and not line.startswith("#"):
                key, value = re.split(r"\s*[:=]\s*", line, 1)
                props.append((key.strip(), value.strip()))
    return props


In [28]:
props = read_properties(r'C:\Users\harish-4072\Pictures\ApplicationResources.properties')

In [29]:
updates = props[:10]

In [30]:
updates

[('dc.db.agent.patchinstall.fail', 'Failed Patch Deployments'),
 ('dc.patch.homepage.Patch_SeveritySummary', 'Patch Severity Summary'),
 ('dc.patch.xml.No_Results_matching_this_criteria', 'No data found.'),
 ('vmp.common.professional_no_permission',
  'You do not have enough permission to perform this action. Upgrade to Enterprise Edition to use these features.'),
 ('dc.common.INFO', 'Info'),
 ('dc.common.VULN_SETTINGS', 'Vulnerability Settings'),
 ('dc.pm.msg.sync_failed_title', 'Vulnerability DB Sync Failed!'),
 ('dc.privacy.vmp_return', 'Start using Vulnerability Manager'),
 ('dc.common.vmp.website', 'http://www.vulnerabilitymanagerplus.com'),
 ('dc.admin.title.dc_server_migration', 'Central Server Migration')]

In [31]:
def update_properties(props, updates):
    updated_props = []
    seen_keys = set()
    for key, value in reversed(props):  
        if key in updates:
            if key not in seen_keys:
                updated_props.append((key, updates[key]))
                seen_keys.add(key)
        else:
            updated_props.append((key, value))
    return list(reversed(updated_props))

In [32]:
def write_properties(file_path, props):
    with open(file_path, "w", encoding="utf-8") as file:
        for key, value in props:
            file.write(f"{key}={value}\n")

In [33]:
write_properties(path, update_properties(props, updates))

In [34]:
update_properties(props, updates)

[('dc.db.agent.patchinstall.fail', 'Failed Patch Deployments'),
 ('dc.patch.homepage.Patch_SeveritySummary', 'Patch Severity Summary'),
 ('dc.patch.xml.No_Results_matching_this_criteria', 'No data found.'),
 ('vmp.common.professional_no_permission',
  'You do not have enough permission to perform this action. Upgrade to Enterprise Edition to use these features.'),
 ('dc.common.INFO', 'Info'),
 ('dc.common.VULN_SETTINGS', 'Vulnerability Settings'),
 ('dc.pm.msg.sync_failed_title', 'Vulnerability DB Sync Failed!'),
 ('dc.privacy.vmp_return', 'Start using Vulnerability Manager'),
 ('dc.common.vmp.website', 'http://www.vulnerabilitymanagerplus.com'),
 ('dc.admin.title.dc_server_migration', 'Central Server Migration'),
 ('vulnerbility.addon.title', 'Vulnerability'),
 ('dc.genProp.vmp.supportMail',
  'vulnerabilitymanagerplus-support@manageengine.com'),
 ('dc.genProp.vmp.renewalMail', 'vmp-renewals@manageengine.com'),
 ('dc.genProp.vmp.request_demo',
  'http://www.manageengine.com/vulnerabil

In [35]:
test = "test"
path = r'C:\Users\harish-4072\Pictures\ApplicationResources.properties'