In [None]:
%pip install -r requirements.txt

## Dictionary Attack Example

A dictionary attack takes a pre-compiled wordlist of commonly used passwords and tries each entry against the target until one of them matches.

reference: 

Alkhwaja, I., Albugami, M., Alkhwaja, A., Alghamdi, M.,
Abahussain, H., Alfawaz, F., Almurayh, A., and Min-Allah,
N. (2023). Password cracking with brute force algorithm
and dictionary attack using parallel programming. Applied
Sciences, 13(10)

In [None]:
# A password dictionary is a list of commonly used passwords.
# The password dictionary we will be working with is called rockyou.txt.
#
# Back in 2009, a company named RockYou was hacked. This wouldn't have been too
# much of a problem if they hadn't stored all of their passwords unencrypted, in
# plain text for an attacker to see. The attackers downloaded a list of all the
# passwords and made it publicly available. This file is commonly used for
# brute-force or dictionary attacks on web applications.
# rockyou.txt contains 14,341,564 unique passwords, used in 32,603,388 accounts.
#
# https://www.kaggle.com/datasets/wjburns/common-password-list-rockyoutxt

# ZipFile (from the standard-library zipfile module) reads zip archives.
# rockyou.txt is stored in a zip file due to its large size.
from zipfile import ZipFile

# Unzipping rockyou.zip. The `with` block closes the archive on exit, so no
# explicit close() call is needed.
with ZipFile('rockyou.zip', 'r') as zObj:
    zObj.extract('rockyou.txt')

# Opening rockyou password dictionary.
# rockyou.txt is known to contain bytes that are not valid UTF-8, so it is
# decoded as latin-1 (which maps every possible byte to a character) instead of
# the platform default encoding, which can raise UnicodeDecodeError part-way
# through the file.
# The handle is intentionally left open: the cracking cell iterates over it
# line by line instead of loading millions of lines into memory.
passwordDict = open('rockyou.txt', 'r', encoding='latin-1')

In [None]:
# Next step: build a password-protected PDF that the dictionary attack will target.
# pikepdf is a Python library for creating, manipulating and repairing PDFs;
# here it is used to write an encrypted copy of a sample document.

import pikepdf

# File name the password-protected copy will be saved under
protected = 'protected.pdf'

# Load the unencrypted sample document that is going to be encrypted
pdf = pikepdf.open('sample.pdf')

In [None]:
# Password that will protect the pdf (used as both the owner and the user password)
passKey = 'ronaldinho10'

# Write the encrypted copy.
# you can change the R from 4 to 6 for 256 aes encryption
pdf.save(
    protected,
    encryption=pikepdf.Encryption(user=passKey, owner=passKey, R=4),
)

# The source document is no longer needed once the encrypted copy is written
pdf.close()

In [None]:
# Colorama produces colored terminal text
from colorama import Fore, Style
import time

# Iterating through our password dictionary to crack the password

# Rewind the wordlist so this cell can be re-run: a file iterator that has
# already been consumed would otherwise yield no guesses at all
passwordDict.seek(0)

# Timing how long it takes
t0 = time.time()

# number of guesses made so far (the successful guess is counted too)
num_guess = 0

# The cracked password; stays None if the dictionary does not contain it
correct = None

# Each password guess in dictionary
for guess in passwordDict:

    # every line ends with a newline, which is not part of the password
    guess = guess.strip()
    num_guess += 1

    try:
        # attempting to open protected pdf; it is closed again immediately
        # because only success or failure matters here
        with pikepdf.open(protected, password=guess):
            pass

    except pikepdf.PasswordError:
        # Only a rejected password means "try the next one". Any other error
        # (missing file, damaged pdf, interrupt) is allowed to surface instead
        # of being silently treated as a wrong guess.
        # printing out the attempted password that was unsuccessful
        print(Fore.RED + 'Trying Passwords: ' + guess + Style.RESET_ALL)
        continue

    # end time
    t1 = time.time()

    # printing out the found password in green, then resetting the color so
    # later output is not tinted
    print(Fore.GREEN + 'Password Found: ' + guess + Style.RESET_ALL)

    # printing out how long it took and number of guesses
    print('It took: ' + str(t1 - t0) + ' seconds with ' + str(num_guess) + ' attempts')
    correct = guess
    break

else:
    # the loop ran through the whole dictionary without hitting `break`
    print(Fore.RED + 'Password not found after ' + str(num_guess) + ' attempts' + Style.RESET_ALL)

In [None]:
# Opening the protected pdf with the now cracked password.
# `correct` is the result recorded by the cracking loop; it is used here instead
# of the leftover loop variable `guess`, which only holds the right value as a
# side effect of where the loop stopped.
# strip() removes a trailing newline in case the stored value still carries one.
cracked = pikepdf.open(protected, password=correct.strip())

# Getting the first page of the cracked pdf
page1 = cracked.pages[0]

In [None]:
# Collect the keys under which the images of the first page are stored
imageKey = [name for name in page1.images.keys()]

In [None]:
# The raw image of the pdf: the image object stored on page 1 under the first
# key collected in imageKey.
# NOTE(review): imageKey[0] raises IndexError when the page holds no images.
rawimage = page1.images[imageKey[0]]

In [None]:
# Extracting the image from the protected pdf.
# Here the extracted content is only a picture of a wave, but it shows how a
# dictionary attack can expose potentially sensitive information.
# PdfImage wraps the raw image object; extract_to is given the file prefix
# 'image' (presumably pikepdf picks the file extension itself - confirm in its docs).
pdfimage = pikepdf.PdfImage(rawimage)
pdfimage.extract_to(fileprefix='image')
# Bare last expression so the notebook displays the image object as the cell output
pdfimage

## Evaluating Password Strength with Machine Learning

Evaluating the strength of your passwords is an important step in protecting personal and sensitive information from unauthorized access.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Logistic Regression classifier
# Logistic regression is easy to implement and interpret, and efficient to train
# Plain logistic regression is a binary classifier, so on its own it cannot handle more than two labels
# Our labels are password-strength scores 0-2 (weak, medium, strong)
# We therefore fit one binary logistic regression per label (score), each estimating the probability that a password belongs to that label


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix



In [None]:
# Password Strength Classifier Dataset
# Two columns are read from the file:
#   password - about 670k unique passwords collected online
#   strength - score 0, 1 or 2 (0 = weak, 1 = medium, 2 = strong)
'''
https://www.kaggle.com/datasets/bhavikbb/password-strength-classifier-dataset?resource=download
'''
# Load the two columns, replace missing values with a single space, then keep
# only the rows whose strength can be parsed as a number
data = (
    pd.read_csv('data.csv', usecols=['password', 'strength'])
    .fillna(' ')
    .loc[lambda frame: pd.to_numeric(frame['strength'], errors='coerce').notna()]
)



In [None]:
# Split the cleaned frame into two parallel lists:
# the password strings and their integer strength scores
passwords = np.array(data)
passKeys = list(passwords[:, 0])
labels = [int(score) for score in passwords[:, 1]]

In [None]:
# Preparing data to train
# Tokenizer handed to the vectorizer: splits a password into its individual characters
def getTokens(inputStr):
    """Return the elements of ``inputStr`` as a list, one token per character."""
    return list(inputStr)

# Vectorizing data
# Converting each password into a TF-IDF weighted vector over its character tokens.
# lowercase=False: the vectorizer lowercases its input by default, which would
#   turn 'A' and 'a' into the same token and throw away the mixed-case signal
#   that matters for password strength.
# token_pattern=None: the pattern is not used when a custom tokenizer is given;
#   setting it to None avoids scikit-learn's warning about the unused parameter.
vectorizer = TfidfVectorizer(tokenizer=getTokens, lowercase=False, token_pattern=None)
X = vectorizer.fit_transform(passKeys)

In [None]:
# Train/test split
# Hold out 20% of the samples for testing and train on the remaining 80%
# (a test share between 20% and 30% is common); random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Creating Logistic Regression Classifier
# penalty='l2': adds an L2 penalty term; it is the default choice
# The L2 penalty uses the sum of the squares of the parameters and, as in Ridge
# Regression, encourages this sum to be small.
# Recall that for each label we build a logistic regression to find the
# probability the observation belongs to that label (one-vs-rest).
# The one-vs-rest scheme is expressed with OneVsRestClassifier because the
# multi_class='ovr' argument of LogisticRegression is deprecated in recent
# scikit-learn releases; the wrapper fits one binary problem per label (score).
from sklearn.multiclass import OneVsRestClassifier

logReg = OneVsRestClassifier(LogisticRegression(penalty='l2'))

# Fitting the Logistic Regression Classifier
logReg.fit(X_train, y_train)

In [None]:
# Prediction
# Predict a strength score for every sample in the held-out test set
logReg_pred = logReg.predict(X_test)

In [None]:
# Confusion Matrix comparing the actual test scores with the predicted ones
cm = confusion_matrix(y_test, logReg_pred)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair,
# save it to disk and show it
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='.0f', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix for Predicted Score Vs Actual Score')
ax.set_xlabel('Predicted Score')
ax.set_ylabel('Actual Score')
fig.savefig('cm.png')
plt.show()

In [None]:
# Accuracy on the test set, as a percentage rounded to 3 decimal places
print(f'Test Accuracy: {round(logReg.score(X_test, y_test) * 100, 3)}')

In [None]:
# Custom Testing
# Score a few hand-picked passwords with the trained model; they go through the
# same fitted vectorizer that produced the training features
X_pred = vectorizer.transform(['jlutz2477', 'Arsenal@45', 'BobaFett2016'])
y_pred = logReg.predict(X_pred)
print(y_pred)

# As the password evaluator shows, it is important to choose a password
# with many characters and numbers for better strength