In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/bad-password/users.csv
/kaggle/input/bad-password/google-10000-english.txt
/kaggle/input/bad-password/10_million_password_list_top_10000.txt
/kaggle/input/pwneddataset/data.csv


The master dataset used here is a list of exposed passwords and their frequency of exposure. It can be retrieved at: https://github.com/robinske/password-data/blob/master/passwords/part-00000-abca9f4b-5795-47ee-8382-f523480a532f.csv

This filtering/processing of data is based on a Kaggle notebook that can be retrieved here: https://www.kaggle.com/kingabzpro/bad-password-nist

In [2]:
# Importing the pandas module
import pandas as pd

# Loading in datasets/users.csv 
users = pd.read_csv("../input/pwneddataset/data.csv")

# Printing out how many users we've got
print(users.size)
# Taking a look at the 12 first users
users.head(12)

13690


Unnamed: 0,password,count
0,1980289,13.0
1,tornadof,13.0
2,vova87653,13.0
3,XpKvShrO,13.0
4,tvjgtl,13.0
5,stava0,13.0
6,keannn,13.0
7,xoPTDK,13.0
8,supagordon,13.0
9,XQyz7B4c7,13.0


In [3]:
# Calculating the lengths of users' passwords
users['length'] = users['password'].str.len()

# Taking a look at the 12 first rows
users.head(12)

Unnamed: 0,password,count,length
0,1980289,13.0,7
1,tornadof,13.0,8
2,vova87653,13.0,9
3,XpKvShrO,13.0,8
4,tvjgtl,13.0,6
5,stava0,13.0,6
6,keannn,13.0,6
7,xoPTDK,13.0,6
8,supagordon,13.0,10
9,XQyz7B4c7,13.0,9


In [4]:
# Reading in the top 10000 passwords
common_passwords = pd.read_csv("../input/bad-password/10_million_password_list_top_10000.txt",
                              header=None,squeeze=True)

# Taking a look at the top 20
common_passwords.head(20)

0        123456
1      password
2      12345678
3        qwerty
4     123456789
5         12345
6          1234
7        111111
8       1234567
9        dragon
10       123123
11     baseball
12       abc123
13     football
14       monkey
15      letmein
16       696969
17       shadow
18       master
19       666666
Name: 0, dtype: object

In [5]:
# Flagging the users with passwords that are common passwords
users['common_password'] = users['password'].str.lower().isin( common_passwords)

# Taking a look at the 12 first rows
users.head(12)

Unnamed: 0,password,count,length,common_password
0,1980289,13.0,7,False
1,tornadof,13.0,8,False
2,vova87653,13.0,9,False
3,XpKvShrO,13.0,8,False
4,tvjgtl,13.0,6,False
5,stava0,13.0,6,False
6,keannn,13.0,6,False
7,xoPTDK,13.0,6,False
8,supagordon,13.0,10,False
9,XQyz7B4c7,13.0,9,False


In [6]:
# Reading in a list of the 10000 most common words
words = pd.read_csv("../input/bad-password/google-10000-english.txt",
                              header=None,squeeze=True)

# Flagging the users with passwords that are common words
users['common_word'] = users['password'].str.lower().isin(words)

# Taking a look at the 12 first rows
users.head(12)

Unnamed: 0,password,count,length,common_password,common_word
0,1980289,13.0,7,False,False
1,tornadof,13.0,8,False,False
2,vova87653,13.0,9,False,False
3,XpKvShrO,13.0,8,False,False
4,tvjgtl,13.0,6,False,False
5,stava0,13.0,6,False,False
6,keannn,13.0,6,False,False
7,xoPTDK,13.0,6,False,False
8,supagordon,13.0,10,False,False
9,XQyz7B4c7,13.0,9,False,False


In [7]:
### Flagging the users with passwords with >= 4 repeats
users['too_many_repeats'] = users['password'].str.contains(r'(.)\1\1\1\1\1')

# Taking a look at the users with too many repeats

users.head(10)

  return func(self, *args, **kwargs)


Unnamed: 0,password,count,length,common_password,common_word,too_many_repeats
0,1980289,13.0,7,False,False,False
1,tornadof,13.0,8,False,False,False
2,vova87653,13.0,9,False,False,False
3,XpKvShrO,13.0,8,False,False,False
4,tvjgtl,13.0,6,False,False,False
5,stava0,13.0,6,False,False,False
6,keannn,13.0,6,False,False,False
7,xoPTDK,13.0,6,False,False,False
8,supagordon,13.0,10,False,False,False
9,XQyz7B4c7,13.0,9,False,False,False


In [8]:
# Flagging all passwords that are bad - if frequency more than 30
users['bad_password'] = users['count'] > 30
users.head(10)

Unnamed: 0,password,count,length,common_password,common_word,too_many_repeats,bad_password
0,1980289,13.0,7,False,False,False,False
1,tornadof,13.0,8,False,False,False,False
2,vova87653,13.0,9,False,False,False,False
3,XpKvShrO,13.0,8,False,False,False,False
4,tvjgtl,13.0,6,False,False,False,False
5,stava0,13.0,6,False,False,False,False
6,keannn,13.0,6,False,False,False,False
7,xoPTDK,13.0,6,False,False,False,False
8,supagordon,13.0,10,False,False,False,False
9,XQyz7B4c7,13.0,9,False,False,False,False


In [9]:
#Check if all letters are upper or lowercase or all digits or all special chars
users['all_lower_case'] = users['password'].str.islower()
users['all_upper_case'] = users ['password'].str.isupper()
users['all_digits'] = users['password'].str.isdigit()
users['all_special']= ~(users['password'].str.isalnum())

users.head(100)

Unnamed: 0,password,count,length,common_password,common_word,too_many_repeats,bad_password,all_lower_case,all_upper_case,all_digits,all_special
0,1980289,13.0,7,False,False,False,False,False,False,True,False
1,tornadof,13.0,8,False,False,False,False,True,False,False,False
2,vova87653,13.0,9,False,False,False,False,True,False,False,False
3,XpKvShrO,13.0,8,False,False,False,False,False,False,False,False
4,tvjgtl,13.0,6,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
95,wemy,14.0,4,False,False,False,False,True,False,False,False
96,w2342m,14.0,6,False,False,False,False,True,False,False,False
97,phkphk,14.0,6,False,False,False,False,True,False,False,False
98,whackbal,14.0,8,False,False,False,False,True,False,False,False


In [10]:
#including the number of uppercase,lowercase, and special chars
users['upper'] = users['password'].str.findall(r'[A-Z]').str.len()
users['lower'] = users['password'].str.findall(r'[a-z]').str.len()
users['digits'] = users ['password'].str.findall(r'[0-9]').str.len()
users["symbols"] = users["password"].apply(lambda p: sum( not p.isalnum() for q in p ))
users.head(10)

Unnamed: 0,password,count,length,common_password,common_word,too_many_repeats,bad_password,all_lower_case,all_upper_case,all_digits,all_special,upper,lower,digits,symbols
0,1980289,13.0,7,False,False,False,False,False,False,True,False,0,0,7,0
1,tornadof,13.0,8,False,False,False,False,True,False,False,False,0,8,0,0
2,vova87653,13.0,9,False,False,False,False,True,False,False,False,0,4,5,0
3,XpKvShrO,13.0,8,False,False,False,False,False,False,False,False,4,4,0,0
4,tvjgtl,13.0,6,False,False,False,False,True,False,False,False,0,6,0,0
5,stava0,13.0,6,False,False,False,False,True,False,False,False,0,5,1,0
6,keannn,13.0,6,False,False,False,False,True,False,False,False,0,6,0,0
7,xoPTDK,13.0,6,False,False,False,False,False,False,False,False,4,2,0,0
8,supagordon,13.0,10,False,False,False,False,True,False,False,False,0,10,0,0
9,XQyz7B4c7,13.0,9,False,False,False,False,False,False,False,False,3,3,3,0


In [11]:
#Run this cell if you would like to use the dataset. Credits would be nice :)
users.to_csv('new_dataset.csv')