In [None]:
%pip install -r requirements.txt

## Dictionary Attack Example

A dictionary attack takes a pre-compiled wordlist of commonly used passwords and tries each entry against the targeted password until one matches.

Reference:

Alkhwaja, I., Albugami, M., Alkhwaja, A., Alghamdi, M.,
Abahussain, H., Alfawaz, F., Almurayh, A., and Min-Allah,
N. (2023). Password cracking with brute force algorithm
and dictionary attack using parallel programming. Applied
Sciences, 13(10)

In [None]:
# PikePdf is a Python library allowing creation, manipulation and repair of PDFs
# We will use pikepdf to create a password-protected PDF whose password we will crack later
import pikepdf
import time

# Colorama produces colored terminal text
from colorama import Fore

In [None]:
# The password dictionary we will be working with
# unzip rockyou zip file and open it here

'''Back in 2009, a company named RockYou was hacked. This wouldn't have been too much of a problem if they
 hadn't stored all of their passwords unencrypted, in plain text for an attacker to see. 
 They downloaded a list of all the passwords and made it publically available. 
 This file is commonly used for brute force or dictionary attacks on web applications.'''

# zipfile is the Python standard-library module for reading zip archives
# rockyou.txt is stored in a zip file due to its large size
from zipfile import ZipFile 

# Extract rockyou.txt from the archive. The `with` block closes the zip
# handle on exit, so no explicit close() call is needed.
with ZipFile('rockyou.zip', 'r') as zObj:
    zObj.extract('rockyou.txt')

# Opening rockyou password dictionary
# rockyou.txt is known to contain bytes that are not valid UTF-8, so decoding
# it with the platform default encoding can raise UnicodeDecodeError part-way
# through iteration. latin-1 maps every byte to a character and never fails.
# The handle is intentionally left open: a later cell iterates over it.
passwordDict = open('rockyou.txt', 'r', encoding='latin-1')

In [None]:
# File name the password-protected copy will be written to
protected = 'protected.pdf'

# Load the unprotected sample document with pikepdf; it is encrypted and
# saved under the name above in the next cell
pdf = pikepdf.open('sample.pdf')

In [None]:
# Password used for both the owner and the user role of the protected copy
passKey = 'password123'

# Write the encrypted copy
# you can change the R from 4 to 6 for 256 aes encryption
pdf.save(
    protected,
    encryption=pikepdf.Encryption(user=passKey, owner=passKey, R=4),
)

# The source document is no longer needed once the copy is written
pdf.close()


In [None]:
# Iterating through our password dictionary to crack the password

# Rewind the wordlist so re-running this cell always starts at the first entry
passwordDict.seek(0)

# Stays None when no entry of the dictionary opens the pdf
correct = None

# Timing how long it takes (perf_counter is a monotonic clock meant for
# measuring elapsed time)
t0 = time.perf_counter()
for guess in passwordDict:
    # Each line read from the file still carries its newline
    candidate = guess.strip()
    try:
        # The context manager closes the pdf handle again straight away, so
        # the loop does not accumulate open files
        with pikepdf.open(protected, password=candidate):
            pass
    except pikepdf.PasswordError:
        # Only a wrong password means "try the next word"; any other error
        # (missing file, damaged pdf, Ctrl-C) is allowed to propagate
        print(Fore.RED + 'Trying Passwords: ' + candidate + Fore.RESET)
        continue

    t1 = time.perf_counter()
    print(Fore.GREEN + 'Password Found: ' + candidate + Fore.RESET)
    print('It took: ' + str(t1 - t0) + ' seconds')
    correct = candidate
    break
else:
    # Reached only when the loop ran out of words without hitting `break`
    print(Fore.RED + 'Password not found in the dictionary' + Fore.RESET)

In [None]:
# Opening the protected pdf with the now cracked password
# `guess` is the loop variable left over from the cracking cell; it holds the
# matching line only if that loop stopped at its `break`
cracked = pikepdf.open(protected,password = guess.strip())

# Getting the first page of the cracked pdf (kept open: later cells read from it)
page1 = cracked.pages[0]

In [None]:
# Collect the names under which images are registered on the first page
imageKey = [key for key in page1.images.keys()]
imageKey

In [None]:
# Look up the raw image object stored under the first key found on page one;
# this indexes imageKey[0], so it assumes the page exposes at least one image
rawimage = page1.images[imageKey[0]]

In [None]:
# Extracting the image from the protected pdf
# Wrap the raw image object in pikepdf's PdfImage helper
pdfimage = pikepdf.PdfImage(rawimage)
# Write the image to disk using 'image' as the file name prefix
pdfimage.extract_to(fileprefix='image')
# Last expression of the cell, so the notebook displays the image object
pdfimage

## Predicting Password Strength with Machine Learning

It is important to evaluate the strength of your passwords in order to protect against unwanted access to personal and sensitive information.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier



In [None]:
def getTokens(inputStr):
    """Split a password into its individual characters.

    Args:
        inputStr: the text to tokenize (any iterable is accepted).

    Returns:
        A new list holding each element of ``inputStr`` in order, so a
        string yields one single-character token per position.
    """
    return list(inputStr)

In [None]:
# Password Strength Classifier Dataset
# Contains Password and Score from 0-2 
'''
https://www.kaggle.com/datasets/bhavikbb/password-strength-classifier-dataset?resource=download
'''
# Only the two columns used by the model are read from the csv
data = pd.read_csv('data.csv', usecols=['password', 'strength'])

# Replace missing cells with a single space, then keep only the rows whose
# strength value parses as a number
# NOTE(review): read_csv's default NA parsing turns literal passwords such as
# 'null' or 'NA' into NaN, which then become ' ' here - confirm this is intended
data = data.fillna(' ')
data = data[pd.to_numeric(data['strength'], errors='coerce').notna()]


In [None]:
# Turn the cleaned frame into an array of (password, strength) rows
passwords = np.array(data)

# First column: the password text fed to the vectorizer
passKeys = [row[0] for row in passwords]

# Second column: the strength score, converted to a plain int label
labels = [int(row[1]) for row in passwords]

In [None]:
# Vectorizing data
# Each password is split into single characters by getTokens and weighted with TF-IDF.
# lowercase=False: TfidfVectorizer lowercases its input by default, which would
#   make 'Password' and 'password' identical and discard the upper/lower-case
#   mix as a feature.
# token_pattern=None: the default regex pattern is unused once a custom
#   tokenizer is supplied; passing None avoids scikit-learn's warning about it.
vectorizer = TfidfVectorizer(tokenizer=getTokens, lowercase=False, token_pattern=None)
X = vectorizer.fit_transform(passKeys)

In [None]:
# Train Test Split
# 80% of the samples are used for fitting, the remaining 20% are held out for
# evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic Regression classifier
# Logistic regression is easy to implement and interpret, and efficient to train
# On its own a logistic regression separates two classes; to handle the three
# strength scores (0-2) it is wrapped in a one-vs-rest scheme: one binary
# model is fitted per label and the label with the highest score is predicted

# Regularisation is left at scikit-learn's default L2 penalty, which penalises
# the sum of the squared coefficients (as ridge regression does) to keep them small
# OneVsRestClassifier replaces the `multi_class='ovr'` argument of
# LogisticRegression, which recent scikit-learn releases deprecate
logReg = OneVsRestClassifier(LogisticRegression())

# Fitting the Logistic Regression Classifier
logReg.fit(X_train, y_train)

In [None]:
# Prediction
# Predict a strength label for every password in the held-out test split
logReg_pred = logReg.predict(X_test)

In [None]:
# Accuracy on the held-out split, shown as a percentage rounded to 3 decimals
print(f'Test Accuracy: {round(logReg.score(X_test, y_test) * 100, 3)}')

In [None]:
# Confusion Matrix: rows are the actual scores, columns the predicted scores
cm = confusion_matrix(y_test, logReg_pred)

# Draw the matrix as an annotated heatmap (counts printed without decimals)
sns.heatmap(
    cm,
    annot=True,
    fmt='.0f',
    cmap='Blues',
)
plt.xlabel('Predicted Score')
plt.ylabel('Actual Score')
plt.title('Confusion Matrix for Predicted Score Vs Actual Score')

# Save the figure to disk before showing it in the notebook
plt.savefig('cm.png')
plt.show()




In [None]:
# Custom Testing
# Vectorize a few hand-picked passwords with the already fitted vectorizer
X_pred = vectorizer.transform(['jlutz2477', 'Arsenal@45', 'BobaFett2016'])

# Predicted strength score for each of the passwords above
y_pred = logReg.predict(X_pred)
print(y_pred)