In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession, functions
import pyspark.sql.types as spark_types
import operator
from collections import Counter
import re
import csv
# Reuse the running SparkContext when this cell is executed again in the same
# kernel; calling the SparkContext constructor a second time raises an error
# because only one context may be active at once.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Excursion"))

In [2]:
def extract_password(uname_pwd):
    """Return the password part of a ``username:password`` line.

    Only the first ':' is treated as the separator, so a password that itself
    contains ':' is returned whole rather than being cut at its first colon.

    Args:
        uname_pwd: one raw input line.

    Returns:
        Everything after the first ':' (possibly an empty string), or None
        when the line contains no ':' at all.
    """
    _, separator, password = uname_pwd.partition(":")
    # An empty separator means no ':' was found in the line.
    return password if separator else None
    
def is_ascii_128(password):
    """Tell whether ``password`` is a non-blank, pure-ASCII string.

    Returns False for None, for the empty string, for whitespace-only strings
    and for any string whose encoded bytes cannot be decoded as ASCII;
    returns True otherwise.
    """
    # Reject missing / empty values first, then whitespace-only ones.
    if not password:
        return False
    if not password.strip():
        return False
    try:
        password.encode().decode('ascii')
    except UnicodeDecodeError:
        return False
    return True

# Full data set (currently disabled):
# pws = sc.textFile("/Users/albin/Completed\ torrents/BreachCompilation/data/*/*").cache()
# Build (password, occurrences) pairs from the sample file, keeping only
# non-blank ASCII passwords; the result is cached because later cells reuse it.
rdd_pwd_cnt = (
    sc.textFile('test_text.txt')
    .map(extract_password)
    .filter(is_ascii_128)
    .map(lambda pwd: (pwd, 1))
    .reduceByKey(operator.add)
    .cache()
)

The Password extractor should extract unique passwords with an associated password-count.

In [3]:
# Preview up to 20 (password, count) pairs.
rdd_pwd_cnt.take(20)

[('password123$$', 1),
 ('^^^qwert123', 1),
 ('banana', 5),
 ('password1', 1),
 ('password\\1', 1),
 ('password123', 2),
 ('$$pass123%%', 1),
 ('876\\DSL', 1)]

In [4]:
# Only the 128 ASCII characters and symbols are considered.
# Character classes used to label each run of a password.
DIGITS = [str(digit) for digit in range(10)]
ALPHAS = (
    [chr(code) for code in range(ord('a'), ord('z') + 1)]
    + [chr(code) for code in range(ord('A'), ord('Z') + 1)]
)
SPECIALS = list(' !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
# Matches maximal runs of digits, of letters, or of special characters.
PATTERN = r'[0-9]+|[a-zA-Z]+|[ !"#$%&\'()*+,-./:;<=>?@\[\]\^_`{\|}~\\\\]+'

def get_base_structure_format(word, string_len=True):
    """Describe ``word`` as a '-'-delimited sequence of character-class tags.

    The word is split with PATTERN into runs; each run is tagged 'L' when its
    first character is in ALPHAS, 'D' when it is in DIGITS and 'S' otherwise.
    With ``string_len`` truthy the run length follows the tag (e.g. '-L8-D3-'),
    otherwise only the tags are emitted (e.g. '-L-D-'). A word with no
    matching run yields '-'.
    """
    # Copies main_script.py function
    pieces = []
    for run in re.findall(PATTERN, word):
        first_char = run[0]
        if first_char in ALPHAS:
            tag = 'L'
        elif first_char in DIGITS:
            tag = 'D'
        else:
            tag = 'S'
        length = str(len(run)) if string_len else ''
        pieces.append(tag + length + '-')
    return '-' + ''.join(pieces)

# Count each *distinct* password once per base structure, then flip the pairs
# to (count, structure) so they can be sorted by count, highest first.
rdd_base_struc_form = (
    rdd_pwd_cnt
    .map(lambda pwd_cnt: (get_base_structure_format(pwd_cnt[0], string_len=False), 1))
    .reduceByKey(operator.add)
    .map(lambda struct_cnt: (struct_cnt[1], struct_cnt[0]))
    .sortByKey(ascending=False)
)

Base structure analysis should yield a list of the most frequently occurring patterns.

In [33]:
# Preview up to 20 (count, structure) pairs, highest count first.
rdd_base_struc_form.take(20)

[(2, '-L-D-'),
 (1, '-L-D-S-'),
 (1, '-S-L-D-'),
 (1, '-L-'),
 (1, '-L-S-D-'),
 (1, '-S-L-D-S-'),
 (1, '-D-S-L-')]

In [34]:
# Write one "structure,count" text line per base structure.
rdd_base_struc_form.map(
    lambda cnt_struct: ','.join((cnt_struct[1], str(cnt_struct[0])))
).saveAsTextFile("rdd_base_struc_form_123")

In [6]:
# Same as the per-distinct-password analysis, but each structure is weighted
# by how many times its password occurred; sorted by total, highest first.
rdd_base_struc_form_cnt = (
    rdd_pwd_cnt
    .map(lambda pwd_cnt: (get_base_structure_format(pwd_cnt[0], string_len=False), pwd_cnt[1]))
    .reduceByKey(operator.add)
    .map(lambda struct_cnt: (struct_cnt[1], struct_cnt[0]))
    .sortByKey(ascending=False)
)

Base structure analysis that accounts for duplicate passwords should return higher counts.

In [7]:
# Preview up to 20 (weighted count, structure) pairs, highest count first.
rdd_base_struc_form_cnt.take(20)

[(5, '-L-'),
 (3, '-L-D-'),
 (1, '-L-D-S-'),
 (1, '-S-L-D-'),
 (1, '-L-S-D-'),
 (1, '-S-L-D-S-'),
 (1, '-D-S-L-')]

In [8]:
def get_base_structures(word_count, use_count=True):
    """Yield a (run, weight) pair for every PATTERN run found in a password.

    Args:
        word_count: a (password, count) pair.
        use_count: when truthy each run is weighted by the password's count,
            otherwise every run gets weight 1.
    """
    # copies main_script.py
    runs = re.findall(PATTERN, word_count[0])
    weight = word_count[1] if use_count else 1
    for run in runs:
        yield (run, weight)

# Break every password into its runs and total the password counts per run;
# cached because the per-type analysis below filters it several times.
rdd_base_struc_data = (
    rdd_pwd_cnt
    .flatMap(lambda pwd_cnt: get_base_structures(pwd_cnt, use_count=True))
    .reduceByKey(operator.add)
    .cache()
)

We show the decomposition of string types by frequency.

In [10]:
# Preview up to 20 (run, total count) pairs.
rdd_base_struc_data.take(20)

[('password', 5),
 ('$$', 2),
 ('banana', 5),
 ('1', 2),
 ('pass', 1),
 ('%%', 1),
 ('DSL', 1),
 ('123', 5),
 ('^^^', 1),
 ('qwert', 1),
 ('\\', 2),
 ('876', 1)]

In [14]:
def structure_filter(structure, s_type):
    """Return True when ``structure`` belongs to the character class ``s_type``.

    The class is decided by the first character only: 'alpha' if it is in
    ALPHAS, 'digit' if it is in DIGITS, and 'special' otherwise.
    """
    # shadows main_script.py
    first_char = structure[0]
    if first_char in ALPHAS:
        actual_type = 'alpha'
    elif first_char in DIGITS:
        actual_type = 'digit'
    else:
        actual_type = 'special'
    return s_type == actual_type

# Print the 20 most frequent runs for each character class.
# Note: after the loop, rdd_filter still refers to the last class ('special').
for s_type in ['alpha', 'digit', 'special']:
    print('Analysing {}'.format(s_type))
    # Bind the current s_type as a default argument. Spark evaluates the
    # lambda lazily, so a plain closure over the loop variable would see
    # whatever value s_type holds when the RDD is finally computed.
    rdd_filter = rdd_base_struc_data.filter(
        lambda w, s_type=s_type: structure_filter(w[0], s_type=s_type)).cache()
    # Flip to (count, run) so sortByKey orders by count, highest first.
    for k in rdd_filter.map(lambda w: (w[1], w[0])).sortByKey(ascending=False).take(20):
        print('{}: {}'.format(k[1], k[0]))

Analysing alpha
password: 5
banana: 5
pass: 1
DSL: 1
qwert: 1
Analysing digit
123: 5
1: 2
876: 1
Analysing special
$$: 2
\: 2
%%: 1
^^^: 1


In [24]:
# rdd_filter is whatever the last loop iteration left behind, i.e. the 'special' runs.
rdd_filter.take(10)

[('$$', 2), ('%%', 1), ('^^^', 1), ('\\', 2)]

In [28]:
# Write one "run,count" text line per entry of rdd_filter.
rdd_filter.map(
    lambda run_cnt: ','.join((run_cnt[0], str(run_cnt[1])))
).saveAsTextFile('csv_out.csv')