In [None]:
import keras
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.sequence import pad_sequences

In [None]:
import pandas as pd
import random
import string
import numpy as np

In [None]:
# Build the labelled dataset. For every line of each file, the text before the
# first comma is stored in `words`; entries from 'goodpasswords.txt' are
# labelled 1 and entries from 'badpasswords.txt' are labelled 0.
words = []
labels = []

for path, label in (('goodpasswords.txt', 1), ('badpasswords.txt', 0)):
    # `with` closes the file handle as soon as the file has been read,
    # instead of leaving it open for the lifetime of the kernel.
    with open(path) as password_file:
        for line in password_file:
            # Strip the line ending before splitting: on a line with no comma
            # the first field is the whole line, which would otherwise keep its
            # trailing newline as part of the word.
            words.append(line.rstrip('\r\n').split(',')[0])
            labels.append(label)


In [None]:
def replace2(word):
  """Return a copy of ``word`` with two substitutions applied.

  Every 'A' becomes '#' and every 'H' becomes '?'. All other characters
  (including lowercase 'a' and 'h') are copied through unchanged, so the
  result has the same length as the input.
  """
  substitutions = {'A': '#', 'H': '?'}
  return ''.join(substitutions.get(ch, ch) for ch in word)

def replace1(word):
  """Return a copy of ``word`` in which every 'A' is replaced by '#'.

  All other characters (including 'H' and lowercase 'a') are copied through
  unchanged, so the result has the same length as the input.
  """
  return ''.join('#' if ch == 'A' else ch for ch in word)

In [None]:
# Repeated train/evaluate experiment: for each seed, shuffle the dataset, train a
# character-level CNN on the first 1000 words and evaluate it on the next 500
# words three times: unmodified, after replace1, and after replace2.
# The per-seed accuracies are collected in the four lists below.
val_accuracies = []
test_accuracies = []
test1_accuracies = []
test2_accuracies = []


def _encode_words(word_list, char_to_index, oov_index, maxlen):
    """Turn words into a padded 2-D array of character indices.

    Characters that are missing from ``char_to_index`` are mapped to
    ``oov_index`` instead of raising ``KeyError``. ``pad_sequences`` then
    pads (or truncates) every row to ``maxlen``.
    """
    sequences = [[char_to_index.get(char, oov_index) for char in word] for word in word_list]
    return pad_sequences(sequences, maxlen=maxlen)


def _report_replacements(original_words, replaced_words):
    """Print the number of changed characters and their share (in percent) of all characters."""
    diff = 0
    letters = 0
    for original, replaced in zip(original_words, replaced_words):
        diff += sum(l1 != l2 for l1, l2 in zip(replaced, original))
        letters += len(original)

    print (diff)
    # Guard against a division by zero when there are no characters at all.
    print (diff*100.0/letters if letters else 0.0)


def _evaluate_words(model, eval_words, eval_labels, char_to_index, oov_index, maxlen, num_classes):
    """Evaluate ``model`` on ``eval_words``, print loss/accuracy and return the accuracy."""
    padded = _encode_words(eval_words, char_to_index, oov_index, maxlen)
    categorical = keras.utils.to_categorical(eval_labels, num_classes)
    loss, accuracy = model.evaluate(padded, categorical)
    print(f'Test Loss: {loss:.4f}')
    print(f'Test Accuracy: {accuracy * 100:.2f}%')
    return accuracy


for seed in range(200):

    # A new model is built on every iteration; drop the state Keras keeps for
    # the previous ones so memory does not grow across the 200 runs.
    keras.backend.clear_session()

    # Seed the backend/NumPy generators too (when this Keras version offers the
    # helper) so weight initialisation is tied to the seed, not only the shuffle.
    if hasattr(keras.utils, 'set_random_seed'):
        keras.utils.set_random_seed(seed)
    random.seed(seed)

    # Shuffle into new names: rebinding `words`/`labels` here would make every
    # split depend on all previous iterations (and on re-running this cell).
    combined_data = list(zip(words, labels))
    random.shuffle(combined_data)
    shuffled_words, shuffled_labels = zip(*combined_data)

    train_words = shuffled_words[:1000]
    train_labels = shuffled_labels[:1000]

    test_words = shuffled_words[1000:1500]
    test_labels = shuffled_labels[1000:1500]

    # Remaining items; not used anywhere in this cell.
    valid_words = shuffled_words[1500:]
    valid_labels = shuffled_labels[1500:]

    max_word_length = max(len(word) for word in train_words)
    char_tokens = [list(word) for word in train_words]

    # Create a vocabulary of unique characters; '#' and '?' are added because
    # replace1/replace2 introduce them into the test words.
    char_set = set([char for word in char_tokens for char in word]+['#','?'])
    num_chars = len(char_set)

    # Map characters to indices 1..num_chars (0 is the padding value). Sorting
    # makes the numbering independent of set iteration order.
    ordered_chars = sorted(char_set)
    char_indices = {char: i + 1 for i, char in enumerate(ordered_chars)}
    indices_char = {i + 1: char for i, char in enumerate(ordered_chars)}
    # Extra index for characters never seen in the training words.
    oov_index = num_chars + 1

    # Convert training words to padded sequences of character indices
    padded_sequences = _encode_words(train_words, char_indices, oov_index, max_word_length)

    # Convert labels to categorical
    num_classes = len(set(train_labels))
    categorical_labels = keras.utils.to_categorical(train_labels, num_classes)

    # Build the CNN model; input_dim covers padding (0), the vocabulary and the
    # out-of-vocabulary index.
    model = Sequential()
    model.add(Embedding(input_dim=num_chars + 2, output_dim=50, input_length=max_word_length))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model and record the validation accuracy of the final epoch
    hist = model.fit(padded_sequences, categorical_labels, epochs=100, batch_size=32, validation_split=0.2)
    val_accuracies.append(hist.history['val_accuracy'][-1])

    # Evaluate on the unmodified test words
    test_accuracies.append(
        _evaluate_words(model, test_words, test_labels, char_indices, oov_index, max_word_length, num_classes))

    # Evaluate on the test words after replace1
    test1_words = [replace1(wd) for wd in test_words]
    _report_replacements(test_words, test1_words)
    test1_accuracies.append(
        _evaluate_words(model, test1_words, test_labels, char_indices, oov_index, max_word_length, num_classes))

    # Evaluate on the test words after replace2
    test2_words = [replace2(wd) for wd in test_words]
    _report_replacements(test_words, test2_words)
    test2_accuracies.append(
        _evaluate_words(model, test2_words, test_labels, char_indices, oov_index, max_word_length, num_classes))


In [None]:
# Print each accuracy series followed by its mean and standard deviation.
# The first series (validation accuracies) is printed without a heading line.
accuracy_reports = (
    (None, val_accuracies),
    ('test accuracies', test_accuracies),
    ('one letter replaced test accuracies', test1_accuracies),
    ('two letter replaced test accuracies', test2_accuracies),
)

for heading, scores in accuracy_reports:
    if heading is not None:
        print (heading)
    print (scores)
    print ('mean',np.mean(scores))
    print ('std',np.std(scores))