In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Reading Dataset

In [3]:
data=pd.read_csv('E:\📚\DS\Predict Password Strength using Natural Language Processing\data.csv') #Enter your dataset 
data.head()                                                                                       #location on your computer.


Unnamed: 0,password,strength
0,lkaj8899,1
1,cgsu5858,1
2,xyfy3y1qw,1
3,dyui159,1
4,qwertyuiop0,1


In [4]:
data['strength'].unique() # Here 1 refers average strength, 2 refers strong strength and 0 refers weak strength.

array([1, 2, 0], dtype=int64)

# Code to remove all the missing(Null/NaN) values in my Dataset

In [5]:
data.isna().sum()

password    1
strength    0
dtype: int64

In [6]:
data[data['password'].isnull()]

Unnamed: 0,password,strength
367579,,0


In [7]:
data.dropna(inplace=True)

In [8]:
data.isnull().sum()

password    0
strength    0
dtype: int64

In [9]:
password_arr=np.array(data)

In [10]:
password_arr

array([['lkaj8899', 1],
       ['cgsu5858', 1],
       ['xyfy3y1qw', 1],
       ...,
       ['184520socram', 1],
       ['marken22a', 1],
       ['fxx4pw4g', 1]], dtype=object)

# Random shuffling will give my model Robustness

In [11]:
import random
random.shuffle(password_arr)

In [12]:
x=[passwords[0] for passwords in password_arr]
y=[strength[1] for strength in password_arr]


In [13]:
x

['lkaj8899',
 'cgsu5858',
 'lkaj8899',
 'lkaj8899',
 'xyfy3y1qw',
 'xyfy3y1qw',
 'u6c8vhow',
 'cgsu5858',
 'v1118714',
 'universe2908',
 'asv5o9yu',
 'as326159',
 'cgsu5858',
 'qwertyuiop0',
 'v1118714',
 'dyui159',
 'g067057895',
 'cgsu5858',
 'qwertyuiop0',
 'idofo673',
 'lkaj8899',
 'asv5o9yu',
 'qwertyuiop0',
 'asv5o9yu',
 'qwertyuiop0',
 'WUt9IZzE0OQ7PkNE',
 'qwertyuiop0',
 'idofo673',
 'asv5o9yu',
 'prisonbreak1',
 '612035180tok',
 'schalke04',
 'gaymaids1',
 'cigicigi123',
 'idofo673',
 'universe2908',
 'jytifok873',
 'czuodhj972',
 '612035180tok',
 'v1118714',
 'exitos2009',
 'elyass15@ajilent-ci',
 'memjan123',
 'WUt9IZzE0OQ7PkNE',
 'fk9qi21m',
 'cgsu5858',
 'asgaliu11',
 'elyass15@ajilent-ci',
 'xyfy3y1qw',
 'fahad123',
 'kswa2mrv',
 'juliel009',
 'ejeko677',
 'sbl571017',
 'WUt9IZzE0OQ7PkNE',
 'a2531106',
 'WUt9IZzE0OQ7PkNE',
 'v1118714',
 'tamanagung6',
 'calcifer32',
 'trabajonet9',
 'trabajonet9',
 'yqugu927',
 'kswa2mrv',
 'jerusalem393',
 'faranumar91',
 'bozoxik602',
 

# Create a function to split the input into characters of list

In [14]:
def split(inputs):
    character=[]
    for i in inputs:
        character.append(i)
    return character

In [15]:
split('CodeSpeedy')

['C', 'o', 'd', 'e', 'S', 'p', 'e', 'e', 'd', 'y']

# Importing TF-IDF Vectorizer 

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
vectorizer=TfidfVectorizer(tokenizer=split)

# Applying TF-IDF vectorizer on my data

In [18]:
Matrix=vectorizer.fit_transform(x) # Transform the data: X

In [19]:
print(vectorizer.vocabulary_)# We can see that all words were made lowercase by default and that the punctuation was ignored if 
                             #it is there.
                             # Returns a dictionary comprised of the tokens and their respective indices in the array.

{'l': 62, 'k': 61, 'a': 51, 'j': 60, '8': 37, '9': 38, 'c': 53, 'g': 57, 's': 69, 'u': 71, '5': 34, 'x': 74, 'y': 75, 'f': 56, '3': 32, '1': 30, 'q': 67, 'w': 73, '6': 35, 'v': 72, 'h': 58, 'o': 65, '7': 36, '4': 33, 'n': 64, 'i': 59, 'e': 55, 'r': 68, '2': 31, '0': 29, 't': 70, 'p': 66, 'd': 54, 'z': 76, 'b': 52, 'm': 63, '@': 44, '-': 26, '>': 42, '.': 27, '&': 21, '?': 43, '<': 40, '!': 16, '_': 49, '%': 20, '$': 19, '(': 22, ')': 23, '"': 17, '~': 80, '+': 25, '^': 48, '/': 28, ';': 39, ' ': 15, '#': 18, '*': 24, '±': 87, '`': 50, '[': 45, ']': 47, '\\': 46, '=': 41, 'ú': 120, '\x1c': 13, '³': 89, 'ô': 115, '{': 77, '}': 79, '¿': 97, 'þ': 124, 'ó': 114, '\x19': 11, 'ä': 103, '\x16': 9, 'ò': 113, '·': 92, '\x1e': 14, 'ß': 99, 'å': 104, '‚': 128, '´': 90, '\x7f': 81, 'ð': 111, 'â': 102, '¾': 96, 'à': 100, '|': 78, 'ÿ': 125, 'õ': 116, '\x05': 2, '\x1b': 12, '«': 85, 'í': 109, '÷': 118, '\x10': 6, '\x17': 10, '°': 86, 'µ': 91, '\x08': 4, 'ê': 108, 'á': 101, '¡': 83, 'ý': 123, 'ü': 122,

In [20]:
Matrix.shape #We have 669639 rows (669639 passwords) and 124 columns (124 unique words).

(669639, 132)

In [21]:
vectorizer.get_feature_names() # Print all the features(unique words) of the vectorizer.

['\x02',
 '\x04',
 '\x05',
 '\x06',
 '\x08',
 '\x0e',
 '\x10',
 '\x11',
 '\x12',
 '\x16',
 '\x17',
 '\x19',
 '\x1b',
 '\x1c',
 '\x1e',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 '+',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '\x7f',
 '\x81',
 '¡',
 '¨',
 '«',
 '°',
 '±',
 '²',
 '³',
 '´',
 'µ',
 '·',
 'º',
 '»',
 '¼',
 '¾',
 '¿',
 '×',
 'ß',
 'à',
 'á',
 'â',
 'ä',
 'å',
 'æ',
 'ç',
 'é',
 'ê',
 'í',
 'ï',
 'ð',
 'ñ',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 '÷',
 'ù',
 'ú',
 'û',
 'ü',
 'ý',
 'þ',
 'ÿ',
 'œ',
 '—',
 '‚',
 '…',
 '‹',
 '›']

In [22]:
FirstDocVector=Matrix[0]
FirstDocVector

<1x132 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [23]:
FirstDocVector.T.todense() # Print the sparse matrix of the test data.

matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.59724979],
        [0.56603192],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [24]:
df=pd.DataFrame(FirstDocVector.T.todense(),index=vectorizer.get_feature_names(),columns=['TF-IDF'])
df.sort_values(by=['TF-IDF'],ascending=False)

Unnamed: 0,TF-IDF
8,0.597250
9,0.566032
j,0.338087
k,0.301651
l,0.281704
...,...
;,0.000000
7,0.000000
6,0.000000
5,0.000000


# Split the data into Training set and Test set
    Training set-> A subset to train my model
    Test set-> A subset to test my trained model

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
Matrix_train, Matrix_test, y_train, y_test=train_test_split(Matrix,y,test_size=0.3) 
#X->independent data, x->dependent data
#0.3->30% of my data is considered for the testing purpose. Therefore the rest 70% of my data will be condidered for the 
#training purpose.

In [27]:
Matrix_train.shape

(468747, 132)

# Applying Logistic Regression as we are doing Classification of Passwords

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
clf=LogisticRegression(random_state=0,multi_class='multinomial')

In [30]:
clf.fit(Matrix_train,y_train)

LogisticRegression(multi_class='multinomial', random_state=0)

## Performing predictions on the strength of passwords which are outside of my data

In [31]:
dt=np.array(['CodeSpeedy123@#$'])
prediction=vectorizer.transform(dt)
clf.predict(prediction)

array([2])

# Performing prediction on Matrix-Test data

In [32]:
y_prediction=clf.predict(Matrix_test)
y_prediction

array([0, 1, 1, ..., 1, 1, 1])

### Checking accuracy of my model using confusion_matrix,accuracy_score

In [33]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [34]:
cm=confusion_matrix(y_test,y_prediction)
print(cm)
print(accuracy_score(y_test,y_prediction))

[[  8106  18714     27]
 [  5929 139107   3974]
 [    53   7581  17401]]
0.8194154072835155


# Report of my model

In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_prediction))

              precision    recall  f1-score   support

           0       0.58      0.30      0.40     26847
           1       0.84      0.93      0.88    149010
           2       0.81      0.70      0.75     25035

    accuracy                           0.82    200892
   macro avg       0.74      0.64      0.68    200892
weighted avg       0.80      0.82      0.80    200892

