In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Load the labelled dataset; the target is the column named 'y'.
file_path = "D:\\save\\fraudulent.csv"
data = pd.read_csv(file_path)
# Per-column count of missing values, printed for inspection.
print(data.isnull().sum())

# Select features and target by name rather than by position, so the code
# does not depend on 'y' being the last column of the CSV.
features = data.drop(columns=['y'])
y = data['y']

# Split BEFORE imputing: the imputer must learn its fill values from the
# training rows only, otherwise information from the test rows leaks into
# the training data and the test score is optimistically biased.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=1
)

imputer = SimpleImputer(strategy='most_frequent')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw),
    columns=features.columns,
    index=X_train_raw.index,
)
# The test set is only transformed with the values learned on the train set.
X_test = pd.DataFrame(
    imputer.transform(X_test_raw),
    columns=features.columns,
    index=X_test_raw.index,
)

# Full imputed views in the original row order, kept under their previous
# names in case later cells refer to them.
X = pd.concat([X_train, X_test]).sort_index()
data_imputed = X.to_numpy()
data_imputed_df = X.copy()
data_imputed_df['y'] = y

# Candidate classifiers, each wrapped in a pipeline that standardises the
# features before fitting. Keys are the display names used when reporting.
models = {
    'KNN': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier()),
    ]),
    'DecisionTree': Pipeline([
        ('scaler', StandardScaler()),
        # Seeded so that repeated runs give the same tree: without a fixed
        # random_state the tree's score can differ between runs, which can
        # change which model is reported as best.
        ('classifier', DecisionTreeClassifier(random_state=1)),
    ]),
    'LogisticRegression': Pipeline([
        ('scaler', StandardScaler()),
        # Generous iteration cap so the solver is not stopped early.
        ('classifier', LogisticRegression(max_iter=10000)),
    ]),
    'SVC': Pipeline([
        ('scaler', StandardScaler()),
        # probability=True enables predict_proba on the fitted model.
        ('classifier', SVC(probability=True)),
    ]),
}

# Fit every candidate on the training split, score it with F1 on the test
# split, and remember the highest-scoring one.
# NOTE(review): the winner is chosen using the test split itself, so the
# reported best score is an optimistic estimate; consider selecting with
# cross-validation on the training split instead.
best_f1 = 0
best_model = None
best_model_name = ""
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name} F1 Score: {f1}")
    # Always accept the first evaluated model so that best_model is never
    # left as None when every candidate scores 0.0; after that, only a
    # strictly better score replaces the current best (ties keep the
    # earlier model).
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_model_name = model_name
print(f"Best Model: {best_model_name} with F1 Score: {best_f1}")

contain_IP                 90
is_long                    89
is_tinyurl                 88
contain_at                 82
contain_double_slash      116
contain_dash               94
contain_subdomain          97
is_SSL                     96
with_long_history        2795
contain_icon             1358
contain_ext_domain       1527
contain_email_to         2079
allow_right_click        3407
contain_pop_up_window     279
contain_Iframe            659
has_DNSRecord            1201
traffic                  1507
google_rank               664
y                           0
dtype: int64
KNN F1 Score: 0.839718530101642
DecisionTree F1 Score: 0.8648648648648648
LogisticRegression F1 Score: 0.85
SVC F1 Score: 0.8614649681528662
Best Model: DecisionTree with F1 Score: 0.8648648648648648
