In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import polars as pl
import pandas as pd
import gc
# Lift pandas' display limits so wide frames and long string cells are shown in full.
for option_name in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)

In [2]:
# Lazily scan the raw CSV with polars; the data is materialised later via .collect().
csv_path = 'rba-dataset.csv'
df = pl.scan_csv(csv_path)

In [3]:
# Materialise the lazy polars query and convert it to a pandas DataFrame.
# The isinstance guard keeps this cell safe to re-run: once `df` is already a
# pandas frame it has no `.collect()`, so an unguarded second run would raise.
if isinstance(df, pl.LazyFrame):
    df = df.collect().to_pandas()

In [4]:
# --- Feature engineering on the login log -----------------------------------

# Parse the timestamp column so the .dt accessors below are available.
df['Login Timestamp'] = pd.to_datetime(df['Login Timestamp'])

# Split the dotted address into four parts and store each as an integer column.
octet_cols = ['ip_part1', 'ip_part2', 'ip_part3', 'ip_part4']
df[octet_cols] = df['IP Address'].str.split('.', expand=True)
df = df.astype({col: 'int' for col in octet_cols})

# Calendar features, followed by their sine/cosine encodings so that the
# end of a period sits next to its start (day 365 ~ day 1, hour 23 ~ hour 0).
df['DoY'] = df['Login Timestamp'].dt.dayofyear
df['hour'] = df['Login Timestamp'].dt.hour
df['sin_time_id_day'] = np.sin(2 * np.pi * df['DoY'] / 365)
df['cos_time_id_day'] = np.cos(2 * np.pi * df['DoY'] / 365)
df['sin_hour'] = np.sin(2 * np.pi * df['hour'] / 24)
df['cos_hour'] = np.cos(2 * np.pi * df['hour'] / 24)

# Remove the raw columns that are not fed to the models.
unused_cols = ["Round-Trip Time [ms]", 'Login Timestamp', 'IP Address', 'Region', 'City']
df = df.drop(columns=unused_cols)

# Replace the free-text client descriptors with integer codes
# (pd.factorize numbers the distinct values in order of first appearance).
for text_col in ('User Agent String', 'Browser Name and Version', 'OS Name and Version'):
    df[text_col] = pd.factorize(df[text_col])[0]

In [5]:
# Label column(s); every other column of `df` is used as a model input.
targets = ['Is Account Takeover']
features = df.columns.drop(targets)

# 80/20 hold-out split, stratified on the label so both partitions keep the
# same class ratio. The seed is fixed to make the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[targets],
    test_size=0.2,
    random_state=7,
    stratify=df[targets],
)

In [6]:
# Column groups are derived from the training *features* rather than the full
# `df`: `df` still contains the target column, and if the label were ever stored
# as a number it would be listed here and the ColumnTransformer would fail at
# fit time because X_train has no such column.
numeric_cols = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_cols = ['Country','Device Type']

# Scale the numeric inputs and one-hot encode the categorical ones; categories
# unseen during fit are encoded as all zeros (handle_unknown='ignore').
# NOTE(review): any feature that is neither numeric nor listed in
# `categorical_cols` is discarded by ColumnTransformer's default
# remainder='drop' — confirm that is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


# Registry of candidate models, keyed by the name passed to make_pipeline().
# The tree/boosting models are stochastic, so they are seeded (same seed as the
# train/test split) to make results repeatable across kernel restarts.
classifiers = {
    'logistic_regression': LogisticRegression(max_iter=1000),
    'random_forest': RandomForestClassifier(random_state=7),
    'adb' : AdaBoostClassifier(random_state=7),
    'xgb': XGBClassifier(random_state=7)
}

def make_pipeline(classifier_key):
    """Build an unfitted preprocessing + classifier pipeline.

    Parameters
    ----------
    classifier_key : str
        Key into the module-level ``classifiers`` dict.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two-step pipeline: ``'preprocessor'`` followed by ``'classifier'``.

    Raises
    ------
    ValueError
        If ``classifier_key`` is not a key of ``classifiers``.
    """
    # Function-scope import keeps this cell self-contained.
    from sklearn.base import clone

    if classifier_key not in classifiers:
        raise ValueError(f"Classifier {classifier_key} is not defined")

    # Clone both steps so each pipeline owns independent, unfitted estimators.
    # Without this, every pipeline would share the same `preprocessor` and
    # classifier objects, and fitting one pipeline would silently overwrite the
    # fitted state used by another.
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', clone(classifiers[classifier_key])),
    ])

In [7]:
classifier_key = 'logistic_regression'
pipeline = make_pipeline(classifier_key)
# y_train is a single-column DataFrame (it was selected with a list of column
# names); flatten it to 1-D so scikit-learn does not emit a
# DataConversionWarning from column_or_1d during fit.
pipeline.fit(X_train, y_train.to_numpy().ravel())

# Evaluation
lrpredictions = pipeline.predict(X_test)
# Column 1 of predict_proba is the probability of the second class in classes_.
probs = pipeline.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, probs)

print(f"AUC Score: {auc_score}")

  y = column_or_1d(y, warn=True)


AUC Score: 0.8616147260733573
