In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb


In [2]:
# Path of the CSV file to load; a relative path is resolved against the
# notebook's working directory.
file_path = 'data.csv'
# on_bad_lines='skip' tells pandas to drop rows it cannot parse instead of
# raising, and it does so without any warning.
# NOTE(review): the number of rows dropped this way is not reported here —
# confirm the loss is acceptable for this dataset.
df = pd.read_csv(file_path, on_bad_lines='skip')

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [4]:
def feature_creation(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with per-password character features.

    The ``password`` column is converted to ``str`` and the following columns
    are appended, in this order:

    * ``length``       - number of characters in the password.
    * ``caps_alpbts``  - fraction of characters matching ``[A-Z]``.
    * ``small_alpbts`` - fraction of characters matching ``[a-z]``.
    * ``num``          - fraction of characters matching ``[0-9]``.
    * ``comon_chars``  - fraction of characters that are one of
      ``@ _ ! # $ % ^ & * ( ) < > ? / | { } ~ : [ ]`` or a space.
    * ``unique_chars`` - fraction of characters that are neither ASCII
      letters, digits, ``+``, ``-``, nor one of the ``comon_chars`` symbols.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``password`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns plus the feature columns.
        For an empty password every ratio is ``0.0`` (not ``NaN``).
    """
    # Raw strings: backslashes reach the regex engine untouched, so there are
    # no invalid Python escapes such as '\|'.
    char_classes = {
        'caps_alpbts': r'[A-Z]',
        'small_alpbts': r'[a-z]',
        'num': r'[0-9]',
        'comon_chars': r'[@_!#$%^&*()<>?/\|{ }~:\[\]]',
        # The hyphen is escaped on purpose: an unescaped '+-<' is a character
        # range (0x2B-0x3C) that silently swallows ',', '.' and ';'.
        'unique_chars': r'[^a-zA-Z0-9@_!#$%^&*()+\-<>?/\|{ }~:\[\]]',
    }

    # Work on a copy so the caller's frame (and any earlier cell output that
    # displayed it) stays valid.
    features = df.copy()
    passwords = features['password'].astype(str)
    features['password'] = passwords

    lengths = passwords.str.len()
    features['length'] = lengths
    # Divide by 1 instead of 0 for empty passwords: their counts are all 0,
    # so the ratio becomes 0.0 rather than NaN.
    denominators = lengths.where(lengths > 0, 1)

    for column, pattern in char_classes.items():
        regex = re.compile(pattern)
        counts = passwords.map(lambda pw, regex=regex: len(regex.findall(pw)))
        features[column] = counts / denominators

    return features

# Add the engineered character-ratio columns to the frame.
df = feature_creation(df)

# Raw password strings; the TF-IDF cell below vectorizes this array.
x_temp = np.array(df["password"])
# Target labels taken from the `strength` column.
y = np.array(df["strength"])

In [5]:
df.head()

Unnamed: 0,password,strength,length,caps_alpbts,small_alpbts,num,comon_chars,unique_chars
0,kzde5577,1,8,0.0,0.5,0.5,0.0,0.0
1,kino3434,1,8,0.0,0.5,0.5,0.0,0.0
2,visi7k1yr,1,9,0.0,0.777778,0.222222,0.0,0.0
3,megzy123,1,8,0.0,0.625,0.375,0.0,0.0
4,lamborghin1,1,11,0.0,0.909091,0.090909,0.0,0.0


In [6]:
# Character-level TF-IDF: analyzer="char" makes every single character a
# token, and lowercase=False keeps upper- and lower-case letters distinct.
# token_pattern only applies to the word analyzer, so it is set to None.
tfidf = TfidfVectorizer(analyzer="char", lowercase=False, token_pattern=None)
# Vectorize from df["password"] rather than from x_temp itself, so this cell
# can be re-run safely: the original `x_temp = f(x_temp)` form fed the
# already-vectorized float matrix back into the vectorizer on a second run.
# .toarray() densifies the sparse result; with the shape printed further down
# (669639 x 200, float64) that is roughly 1 GB of memory.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so IDF statistics include the test rows — confirm acceptable.
x_temp = tfidf.fit_transform(df["password"]).toarray()

In [7]:
# Sanity check before joining: both pieces must have the same number of rows.
print('Shape x_temp :', x_temp.shape)
# df.iloc[:, 3:] is a positional slice (every column from position 3 onward).
print('Shape df :', df.iloc[:,3:].shape)

Shape x_temp : (669639, 200)
Shape df : (669639, 5)


In [8]:
# Build the final design matrix: TF-IDF columns on the left, the positionally
# sliced columns of df (position 3 onward) on the right.
# NOTE(review): the shape printed for this slice is (…, 5), which suggests it
# starts at `caps_alpbts` and leaves `length` out of the model inputs —
# confirm that excluding `length` is intended.
x = np.hstack((x_temp, df.iloc[:, 3:].to_numpy()))
x.shape

(669639, 205)

In [9]:
# Column labels for x, in the same order the matrix was assembled:
# TF-IDF character tokens first, then the df columns from position 3 onward.
np.concatenate([tfidf.get_feature_names_out(), df.columns[3:].to_numpy()], axis=0)

array(['\x01', '\x02', '\x04', '\x05', '\x06', '\x08', '\x0e', '\x0f',
       '\x10', '\x11', '\x12', '\x13', '\x16', '\x17', '\x18', '\x19',
       '\x1b', '\x1c', '\x1d', '\x1e', ' ', '!', '"', '#', '$', '%', '&',
       '(', ')', '*', '+', '-', '.', '/', '0', '1', '2', '3', '4', '5',
       '6', '7', '8', '9', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C',
       'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
       'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']',
       '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
       'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
       'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '\x81', '\x8d', '\xa0',
       '¡', '¢', '¤', '¦', '§', '¨', '«', '¯', '°', '±', '²', '³', '´',
       'µ', '¶', '·', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â',
       'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'É', 'Ê', 'Í', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó',
       'Ô', 'Õ', 'Ö', '×', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß',

In [10]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle so
# the split is the same on every run.
# NOTE(review): no `stratify=` is passed — if the strength classes are
# imbalanced, consider stratifying on y. Class balance is not shown here.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
# Random forest on the combined TF-IDF + character-ratio features.
# n_jobs=-1 uses all available CPU cores. random_state is fixed (same seed as
# the train/test split) so the bootstrap samples and feature sub-sampling,
# and therefore the reported score, are reproducible across runs.
rf_model = RandomForestClassifier(n_jobs=-1, random_state=0)
rf_model.fit(x_train, y_train)
# For a classifier, .score() returns mean accuracy on the given data.
print("Score of the model is", rf_model.score(x_test, y_test), ".")