In [3]:
import pandas as pd

folder = "Datasets/"

# (heading, CSV file) pairs, previewed in this order. The leading "\n" on the
# later headings puts a blank line between consecutive previews.
sample_sources = (
    ("Phishing sample:", "phishing_site_urls.csv"),
    ("\nTrain Transaction sample:", "train_transaction.csv"),
    ("\nTrain Identity sample:", "train_identity.csv"),
)

for heading, csv_name in sample_sources:
    print(heading)
    # nrows=5 limits parsing to the first five data rows of each file.
    print(pd.read_csv(folder + csv_name, nrows=5))

Phishing sample:
                                                 URL Label
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...   bad
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...   bad
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....   bad
3  mail.printakid.com/www.online.americanexpress....   bad
4  thewhiskeydregs.com/wp-content/themes/widescre...   bad

Train Transaction sample:
   TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0        2987000        0          86400            68.5         W  13926   
1        2987001        0          86401            29.0         W   2755   
2        2987002        0          86469            59.0         W   4663   
3        2987003        0          86499            50.0         W  18132   
4        2987004        0          86506            50.0         H   4497   

   card2  card3       card4  card5  ... V330  V331  V332  V333  V334 V335  \
0    NaN  150.0    discover  142.0  ...  NaN   NaN   NaN   NaN

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

folder = "Datasets/"

# --- Phishing URL classifier -------------------------------------------------
# TF-IDF features over the raw URL strings feed a logistic regression.
# The fitted model and vectorizer are both pickled to the working directory.
print("Training phishing model...")
phishing_df = pd.read_csv(folder + "phishing_site_urls.csv")
urls = phishing_df["URL"]
labels = phishing_df["Label"].map({"good": 0, "bad": 1})

# .map() produces NaN for any label other than "good"/"bad"; fail early with a
# clear message instead of an opaque NaN-target error raised later during fit.
if labels.isna().any():
    unexpected = sorted(phishing_df.loc[labels.isna(), "Label"].astype(str).unique())
    raise ValueError(f"Unexpected values in Label column: {unexpected}")

# Split the raw URLs first and fit the vectorizer on the training portion only,
# so vocabulary and IDF statistics from held-out URLs cannot leak into the
# features the model is trained on.
urls_train, urls_test, y_train, y_test = train_test_split(
    urls, labels, test_size=0.2, random_state=42
)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(urls_train)
X_test = vectorizer.transform(urls_test)

phishing_model = LogisticRegression(max_iter=1000)
phishing_model.fit(X_train, y_train)
accuracy = phishing_model.score(X_test, y_test)
print(f"Phishing accuracy: {accuracy:.2f}")
joblib.dump(phishing_model, "phishing_model.pkl")
joblib.dump(vectorizer, "phishing_vectorizer.pkl")
print("Phishing model saved!")

# --- Transaction fraud classifier --------------------------------------------
# Logistic regression on the transaction amount plus three one-hot encoded
# categorical columns; the fitted model is pickled to the working directory.
print("Training fraud model...")
df_trans = pd.read_csv(folder + "train_transaction.csv")
df_id = pd.read_csv(folder + "train_identity.csv")
# Left join keeps every transaction row; identity columns are NaN where no
# matching TransactionID exists in the identity table.
df = df_trans.merge(df_id, on="TransactionID", how="left")
features = ["TransactionAmt", "card4", "P_emaildomain", "DeviceType"]
# Encode on the full frame (before splitting) so train and test share exactly
# the same columns; dummy_na=True adds a NaN indicator column per categorical.
X = pd.get_dummies(df[features], columns=["card4", "P_emaildomain", "DeviceType"], dummy_na=True)
y = df["isFraud"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute remaining gaps with column means computed from the training rows only,
# then reuse those same means for the test rows (no test statistics in training).
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

fraud_model = LogisticRegression(max_iter=1000)
fraud_model.fit(X_train, y_train)
# Store the training column order on the estimator so it travels with the pickle.
fraud_model.feature_names_in_ = X_train.columns
accuracy = fraud_model.score(X_test, y_test)
print(f"Fraud accuracy: {accuracy:.2f}")
# NOTE(review): if isFraud is heavily imbalanced (typical for fraud labels —
# confirm with y.mean()), plain accuracy sits close to the majority-class rate;
# consider also reporting ROC AUC or recall before relying on this number.
joblib.dump(fraud_model, "ieee_fraud_model.pkl")
print("Fraud model saved!")

Training phishing model...
Phishing accuracy: 0.94
Phishing model saved!
Training fraud model...
Fraud accuracy: 0.96
Fraud model saved!


In [7]:
import joblib

# Reload the pickled artifacts from the working directory.
# joblib.load unpickles arbitrary objects, so only point it at trusted files.
vectorizer = joblib.load("phishing_vectorizer.pkl")
phishing_model = joblib.load("phishing_model.pkl")
fraud_model = joblib.load("ieee_fraud_model.pkl")

# Report the input width each estimator recorded when it was fitted.
feature_reports = (
    ("Phishing model trained on:", phishing_model),
    ("Fraud model trained on:", fraud_model),
)
for report_label, fitted_estimator in feature_reports:
    print(report_label, fitted_estimator.n_features_in_, "features")

Phishing model trained on: 5000 features
Fraud model trained on: 69 features


In [9]:
import pandas as pd

folder = "Datasets/"

# Load the three datasets in full (same order as the names on the left) and
# keep each frame under its own name.
phishing, trans, ident = (
    pd.read_csv(folder + csv_name)
    for csv_name in (
        "phishing_site_urls.csv",
        "train_transaction.csv",
        "train_identity.csv",
    )
)

# shape[0] is the number of rows in each frame.
print("Phishing rows:", phishing.shape[0])
print("Transaction rows:", trans.shape[0])
print("Identity rows:", ident.shape[0])

Phishing rows: 549346
Transaction rows: 590540
Identity rows: 144233
