## Import necessary libraries

In [78]:
import numpy as np
import pandas as pd
import joblib
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

## Data Exploration
Next, we'll import our dataset which is a csv with two columns: [password, strength]
The strength is represented as an integer between 0-2 (0 - weak, 1 - medium, 2 - strong).
Our dataset contains almost 670k passwords which will be used to train and test our model.

In [79]:
dataset = pd.read_csv("password_strength_dataset.csv", on_bad_lines='skip')
dataset.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [80]:
dataset.shape

(669640, 2)

In [81]:

def character_tokenizer(password):
    return list(password)

# Drop NaN rows
dataset.dropna(inplace=True)
x = np.array(dataset["password"])
y = np.array(dataset["strength"])

vectorizer = TfidfVectorizer(tokenizer=character_tokenizer)
x_vectorized = vectorizer.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_vectorized, y, random_state=2, test_size=0.1)



In [82]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
joblib.dump({"model": model, "vectorizer": vectorizer}, 'password_detector_model.pkl')

print(model.score(x_test, y_test))


0.9577683531449734


In [83]:
print(model.predict(vectorizer.transform(["123", "12Jsk39asIIfm01gh"])).tolist())

[0, 2]
