In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("bhavikbb/password-strength-classifier-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/bhavikbb/password-strength-classifier-dataset/versions/1


In [None]:
import kagglehub
import os
import pandas as pd

# Download the dataset
path = kagglehub.dataset_download("bhavikbb/password-strength-classifier-dataset")

# List files in the dataset directory
files = os.listdir(path)

# Load the CSV, skipping rows that cause parsing errors
for file in files:
    if file.endswith('.csv'):
        df = pd.read_csv(f"{path}/{file}", on_bad_lines='skip')  # New method for skipping bad lines
        print(f"Loaded file: {file}")
        print(df.head())


Loaded file: data.csv
      password  strength
0     kzde5577         1
1     kino3434         1
2    visi7k1yr         1
3     megzy123         1
4  lamborghin1         1


In [None]:
df = df.dropna()
len(df)

669639

In [None]:
# df = df["password"]
df.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [None]:
df = df[:1000]

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. Preprocessing the Data (Character-level tokenization)
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df['password'])
sequences = tokenizer.texts_to_sequences(df['password'])

# Padding sequences to ensure uniform length
max_len = max([len(seq) for seq in sequences])  # Find max sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Convert the padded sequences into a tensor
passwords_tensor = torch.tensor(padded_sequences, dtype=torch.long)

# Define vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for padding

# 2. Building the GRU Model
class PasswordEmbeddingGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(PasswordEmbeddingGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, embedding_dim)

    def forward(self, x):
        embedded = self.embedding(x)  # Convert input to embeddings
        gru_out, _ = self.gru(embedded)  # Pass through GRU
        final_embedding = self.fc(gru_out[:, -1, :])  # Use last hidden state
        return final_embedding  # Return the final password embedding

# Hyperparameters
embedding_dim = 128
hidden_dim = 256

# Initialize model
model = PasswordEmbeddingGRU(vocab_size, embedding_dim, hidden_dim)

# 3. Training Setup
# Define loss function and optimizer
criterion = nn.MSELoss()  # Dummy loss (you can replace this based on your task)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train-test split
X_train, X_test = train_test_split(passwords_tensor, test_size=0.2, random_state=42)

# Training loop
num_epochs = 10
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = model(X_train)

    # Dummy target: you can set an actual target based on the task you have
    target = torch.zeros_like(outputs)  # Dummy target just for example

    loss = criterion(outputs, target)

    # Backward pass
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

# 4. Generate embeddings for new passwords after training
def generate_password_embedding(model, password):
    # Tokenize the new password
    password_seq = tokenizer.texts_to_sequences([password])
    padded_password = pad_sequences(password_seq, maxlen=max_len, padding='post')
    password_tensor = torch.tensor(padded_password, dtype=torch.long)

    # Get embedding
    model.eval()
    with torch.no_grad():
        embedding = model(password_tensor)

    return embedding.squeeze().numpy()

# Example of generating an embedding for a password
password_example = 'new_password123'
password_embedding = generate_password_embedding(model, password_example)
print("Generated Password Embedding:", password_embedding)


Epoch 1/10, Loss: 0.0019800602458417416
Epoch 2/10, Loss: 0.0010983009124174714
Epoch 3/10, Loss: 0.0005572650698013604
Epoch 4/10, Loss: 0.0002509174810256809
Epoch 5/10, Loss: 0.00012055172555847093
Epoch 6/10, Loss: 0.0001047380719683133
Epoch 7/10, Loss: 0.0001234858063980937
Epoch 8/10, Loss: 0.00012582368799485266
Epoch 9/10, Loss: 0.0001091381418518722
Epoch 10/10, Loss: 8.87010246515274e-05
Generated Password Embedding: [-1.57242715e-02  3.62249464e-03  8.69044103e-03  1.05034094e-03
  5.23357466e-03  1.04535446e-02  1.08139366e-02 -4.42904653e-04
 -1.38211586e-02 -4.69093397e-03 -7.66784512e-03  1.62564144e-02
  1.07680373e-02 -6.91546593e-03 -2.37826146e-02 -5.39306179e-03
 -6.37572818e-03 -7.94062763e-03  2.23353691e-03 -4.59892303e-03
  5.08611090e-04  9.13178548e-03 -5.10834670e-03  7.75700808e-03
  2.42042425e-03  1.06046535e-02 -1.03702024e-03  6.36914186e-03
 -2.02379152e-02 -3.78810568e-03  7.09792599e-03 -1.08549166e-02
 -4.60846722e-03 -1.56431645e-03 -1.46568976e-02

In [None]:
max_len

31

In [None]:
# After training your model, save it using torch.save()
model_save_path = 'password_embedding_gru.pth'

# Save the trained model's state dictionary (parameters)
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")


Model saved to password_embedding_gru.pth


In [None]:
import pickle

# Save the tokenizer
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

print("Tokenizer saved successfully.")


Tokenizer saved successfully.


In [None]:
# To reuse the model, first define the model architecture again
class PasswordEmbeddingGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(PasswordEmbeddingGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, embedding_dim)

    def forward(self, x):
        embedded = self.embedding(x)  # Convert input to embeddings
        gru_out, _ = self.gru(embedded)  # Pass through GRU
        final_embedding = self.fc(gru_out[:, -1, :])  # Use last hidden state
        return final_embedding  # Return the final password embedding

# Hyperparameters (must match the original model)
embedding_dim = 128
hidden_dim = 256
vocab_size = len(tokenizer.word_index) + 1  # Make sure tokenizer is initialized with the same data

# Initialize the model architecture
loaded_model = PasswordEmbeddingGRU(vocab_size, embedding_dim, hidden_dim)

# Load the saved model weights
loaded_model.load_state_dict(torch.load('password_embedding_gru.pth'))

# Set the model to evaluation mode (important for inference)
loaded_model.eval()

print("Model loaded and ready for use.")


Model loaded and ready for use.


  loaded_model.load_state_dict(torch.load('password_embedding_gru.pth'))


In [None]:
# Example: Generate embedding for a new password using the loaded model
def generate_password_embedding(model, password, tokenizer, max_len):
    # Tokenize the new password
    password_seq = tokenizer.texts_to_sequences([password])
    padded_password = pad_sequences(password_seq, maxlen=max_len, padding='post')
    password_tensor = torch.tensor(padded_password, dtype=torch.long)

    # Get embedding
    with torch.no_grad():
        embedding = model(password_tensor)

    return embedding.squeeze().numpy()

# Example usage:
password_example = 'new_password123'
password_embedding = generate_password_embedding(loaded_model, password_example, tokenizer, max_len)
# print("Generated Password Embedding:", password_embedding)


In [None]:
# Define the GRU model architecture
class PasswordEmbeddingGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(PasswordEmbeddingGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, embedding_dim)

    def forward(self, x):
        embedded = self.embedding(x)  # Convert input to embeddings
        gru_out, _ = self.gru(embedded)  # Pass through GRU
        final_embedding = self.fc(gru_out[:, -1, :])  # Use last hidden state
        return final_embedding  # Return the final password embedding

# Hyperparameters (make sure these match your setup)
embedding_dim = 128
hidden_dim = 256
vocab_size = 5000  # Example vocabulary size

# Initialize the model
model = PasswordEmbeddingGRU(vocab_size, embedding_dim, hidden_dim)

# Print the model architecture
print(model)


PasswordEmbeddingGRU(
  (embedding): Embedding(5000, 128, padding_idx=0)
  (gru): GRU(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=128, bias=True)
)
