## Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

## Data Exploration
Next, we'll import our dataset which is a csv with two columns: [password, strength]
The strength is represented as an integer between 0-2 (0 - weak, 1 - medium, 2 - strong).
Our dataset contains almost 670k passwords which will be used to train and test our model.

In [2]:
dataset = pd.read_csv("cleaned_password_strength_dataset.csv", on_bad_lines='skip')
dataset.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [3]:
dataset.shape

(669641, 2)

In [4]:
# Drop NaN rows
dataset.dropna(inplace=True)
x = np.array(dataset["password"])
y = np.array(dataset["strength"])

vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), min_df=100)
x_vectorized = vectorizer.fit_transform(x)
print(x_vectorized.shape)
x_train, x_test, y_train, y_test = train_test_split(x_vectorized, y, random_state=2, test_size=0.25)

(669640, 11767)


In [5]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
joblib.dump({"model": model, "vectorizer": vectorizer}, 'password_detector_model.pkl')

print(model.score(x_test, y_test))


0.9308703183800251


In [6]:
print(model.predict(vectorizer.transform(["123", "12Jsk39asIIfm01gh"])).tolist())

[0, 1]
