In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb


In [2]:
# Location of the input CSV; the path is relative, so it is resolved against
# the kernel's current working directory.
file_path = 'data.csv'
# on_bad_lines='skip' makes pandas drop rows it cannot parse instead of raising.
# NOTE(review): skipped rows are not counted or reported anywhere in this cell.
df = pd.read_csv(file_path, on_bad_lines='skip')

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [3]:
def feature_creation(df : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with hand-crafted password features appended.

    The ``password`` column is cast to ``str`` and the following columns are
    added (in this order):

    * ``length``       - number of characters in the password.
    * ``caps_alpbts``  - share of characters matching ``[A-Z]``.
    * ``small_alpbts`` - share of characters matching ``[a-z]``.
    * ``num``          - share of characters matching ``[0-9]``.
    * ``comon_chars``  - share of characters in the listed punctuation set.
    * ``unique_chars`` - share of characters outside letters, digits and the
      second punctuation set.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``password`` column. It is not modified; the features are
        written to a copy.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the string-cast ``password`` column and the six
        feature columns.

    Notes
    -----
    A zero-length password gets ``0.0`` for every ratio instead of the
    ``NaN`` that 0/0 would produce.
    """
    # Same regexes as before, written as raw strings so that ``\|`` is no
    # longer an invalid escape sequence in a normal string literal.
    # NOTE(review): inside the last class ``+-<`` is a *range* ('+' through
    # '<'), so ',', '-', '.', ';' are excluded from ``unique_chars`` as well.
    # Kept as-is so the feature definition does not change.
    ratio_patterns = {
        'caps_alpbts': r'[A-Z]',
        'small_alpbts': r'[a-z]',
        'num': r'[0-9]',
        'comon_chars': r'[@_!#$%^&*()<>?/\|{ }~:\[\]]',
        'unique_chars': r'[^a-zA-Z0-9@_!#$%^&*()+-<>?/\|{ }~:\[\]]',
    }

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()
    passwords = out['password'].astype(str)
    length = passwords.str.len()
    # Empty passwords have zero matches for every pattern; dividing by 1
    # instead of 0 turns their ratios into 0.0 rather than NaN.
    denominator = length.where(length > 0, 1)

    out['password'] = passwords
    out['length'] = length
    for column, pattern in ratio_patterns.items():
        # Each pattern matches a single character, so the match count is the
        # number of characters of that kind.
        out[column] = passwords.str.count(pattern) / denominator
    return out

# Attach the hand-crafted feature columns to the frame.
df = feature_creation(df)

# Pull the raw password strings (vectorized by the TF-IDF cell) and the
# strength labels out as standalone NumPy arrays.
x_temp = df["password"].to_numpy(copy=True)
y = df["strength"].to_numpy(copy=True)

In [4]:
df.head()

Unnamed: 0,password,strength,length,caps_alpbts,small_alpbts,num,comon_chars,unique_chars
0,kzde5577,1,8,0.0,0.5,0.5,0.0,0.0
1,kino3434,1,8,0.0,0.5,0.5,0.0,0.0
2,visi7k1yr,1,9,0.0,0.777778,0.222222,0.0,0.0
3,megzy123,1,8,0.0,0.625,0.375,0.0,0.0
4,lamborghin1,1,11,0.0,0.909091,0.090909,0.0,0.0


In [5]:
# Character-level TF-IDF over the raw password strings. lowercase=False keeps
# upper- and lower-case characters as distinct features; token_pattern is only
# consulted by the word analyzer, so it is set to None here.
tfidf = TfidfVectorizer(analyzer="char", lowercase=False, token_pattern=None)

# Fit from the password column itself rather than from x_temp: the previous
# form (x_temp = fit_transform(x_temp)) overwrote its own input, so re-running
# this cell handed the numeric matrix back to the vectorizer instead of strings.
# .toarray() densifies the sparse result so it can be stacked with the
# hand-crafted columns.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its IDF weights also see the held-out passwords - confirm this is
# acceptable.
x_temp = tfidf.fit_transform(df["password"].to_numpy()).toarray()

In [6]:
# Sanity check before stacking the two feature blocks side by side: both must
# have the same number of rows.
print('Shape x_temp :', x_temp.shape)
# iloc[:,3:] is a positional slice: every column from position 3 onward.
print('Shape df :', df.iloc[:,3:].shape)

Shape x_temp : (669639, 200)
Shape df : (669639, 5)


In [7]:
# Final design matrix: TF-IDF columns first, followed by the frame's columns
# from position 3 onward (a positional slice, not a selection by name).
x = np.hstack((x_temp, df.iloc[:, 3:].to_numpy()))
x.shape

(669639, 205)

In [8]:
# Labels for the columns of x, in the same order they were stacked: TF-IDF
# vocabulary entries first, then the frame's column names from position 3 on.
feature_names = np.concatenate([tfidf.get_feature_names_out(), df.columns[3:].to_numpy()])

In [9]:
# Hold out 20% of the rows for evaluation; random_state=0 pins the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [10]:
# Random forest on the stacked features, using all available cores (n_jobs=-1).
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap sampling and feature sub-sampling are random; without a
# seed the fitted model and the reported score change from run to run.
rf_model = RandomForestClassifier(n_jobs=-1, random_state=0)
rf_model.fit(x_train, y_train)
# score() reports mean accuracy on the held-out split.
print("Score of the model is", rf_model.score(x_test, y_test), ".")

Score of the model is 0.9920852995639448 .


In [None]:
def extract_features(password: str, tfidf: TfidfVectorizer, feature_names: np.ndarray) -> np.ndarray:
    """Build the single-row feature matrix for one password.

    The row consists of the character-level TF-IDF vector produced by
    ``tfidf`` followed by the hand-crafted features listed in
    ``feature_names`` after the TF-IDF block, in that order. Driving the
    layout from ``feature_names`` keeps the row aligned with the matrix the
    model was trained on. The previous version ignored ``feature_names`` and
    always appended all six values (including ``length``), which does not
    match a training matrix built from only some of those columns (the shapes
    recorded in this notebook show 200 TF-IDF columns plus 5 others).

    Parameters
    ----------
    password : str
        Password to featurize; it is cast with ``str()`` first.
    tfidf : TfidfVectorizer
        Already-fitted vectorizer.
    feature_names : np.ndarray
        Names of all model features: the TF-IDF names first, then the
        hand-crafted column names. If ``None``, all six hand-crafted features
        are appended in the order ``length, caps_alpbts, small_alpbts, num,
        comon_chars, unique_chars``.

    Returns
    -------
    np.ndarray
        Array of shape ``(1, n_tfidf + n_handcrafted)``.

    Raises
    ------
    ValueError
        If ``feature_names`` asks for a hand-crafted feature this function
        does not know how to compute.
    """
    password = str(password)
    password_tfidf = tfidf.transform([password]).toarray()

    length = len(password)
    # An empty password has zero matches for every pattern; dividing by 1
    # yields 0.0 ratios instead of raising ZeroDivisionError.
    denominator = length or 1

    def ratio(pattern: str) -> float:
        """Share of characters in ``password`` matching a one-character pattern."""
        return len(re.findall(pattern, password)) / denominator

    # Raw strings avoid the invalid ``\|`` escape of a normal string literal;
    # the compiled patterns are unchanged.
    # NOTE(review): ``+-<`` in the last class is a range ('+' through '<').
    handcrafted = {
        'length': length,
        'caps_alpbts': ratio(r'[A-Z]'),
        'small_alpbts': ratio(r'[a-z]'),
        'num': ratio(r'[0-9]'),
        'comon_chars': ratio(r'[@_!#$%^&*()<>?/\|{ }~:\[\]]'),
        'unique_chars': ratio(r'[^a-zA-Z0-9@_!#$%^&*()+-<>?/\|{ }~:\[\]]'),
    }

    if feature_names is None:
        extra_names = list(handcrafted)
    else:
        # Everything after the TF-IDF block names a hand-crafted column.
        n_tfidf = password_tfidf.shape[1]
        extra_names = [str(name) for name in feature_names[n_tfidf:]]

    unknown = [name for name in extra_names if name not in handcrafted]
    if unknown:
        raise ValueError(f"Cannot compute hand-crafted feature(s): {unknown}")

    additional_features = np.array([[handcrafted[name] for name in extra_names]], dtype=float)
    features = np.concatenate([password_tfidf, additional_features], axis=1)
    return features

def predict_strength(password: str, model: RandomForestClassifier, tfidf: TfidfVectorizer, feature_names: np.ndarray) -> int:
    """Predict the strength class of a single password.

    Parameters
    ----------
    password : str
        Password to score.
    model : RandomForestClassifier
        Fitted classifier whose ``predict`` accepts the row built by
        ``extract_features``.
    tfidf : TfidfVectorizer
        Fitted vectorizer, forwarded to ``extract_features``.
    feature_names : np.ndarray
        Feature-name array, forwarded to ``extract_features``.

    Returns
    -------
    int
        The predicted label for the password.
    """
    features = extract_features(password, tfidf, feature_names)
    strength_prediction = model.predict(features)
    # predict() returns an array with one entry for the single input row;
    # unwrap it to a builtin int so the result matches the annotated return
    # type instead of leaking a NumPy scalar.
    return int(strength_prediction[0])

# Demo: score one hard-coded password with the fitted forest and vectorizer.
random_password = "YourPassword123!"
predicted_strength = predict_strength(
    password=random_password,
    model=rf_model,
    tfidf=tfidf,
    feature_names=feature_names,
)
print(f"The predicted strength for the password '{random_password}' is:", predicted_strength)