In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib
import os

print("--- ML Model Trainer (using local CSV) ---")

# --- 1. Define Paths ---
# All paths are relative to the current working directory: the dataset is read
# from ../data/, while the trained artifacts are written to the working directory.
DATA_PATH = os.path.join("..", "data", "transaction_dataset.csv")
MODEL_PATH = "phishing_classifier.pkl"
SCALER_PATH = "feature_scaler.pkl"

# --- 2. Load Dataset ---
# TODO: Make sure your CSV file is in the /data/ folder
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Explain how to fix the problem, then re-raise so execution stops here
    # instead of continuing without a dataset.
    for hint in (
        f"ERROR: Dataset not found at {DATA_PATH}",
        "Please download/move your dataset and save it there.",
    ):
        print(hint)
    raise
else:
    print(f"Successfully loaded dataset from {DATA_PATH}")

# Show the raw schema so the column names used below can be checked by eye.
print("Original columns:", list(df.columns))

# --- 3. Feature Engineering ---
# Derive the numeric model inputs from the raw (mostly string-typed) columns.

# a. 'Value': coerce to a number; anything unparseable becomes NaN and is then set to 0.
df['Value'] = pd.to_numeric(df['Value'], errors='coerce').fillna(0)

# b. 'Input': length of the input data with the '0x' marker removed.
#    Scams often have unusually large/small inputs.
#    Missing values are blanked out explicitly: a plain astype(str) would turn NaN
#    into the literal text 'nan', which would be counted as 3 characters of input.
df['Input'] = df['Input'].astype(str).mask(df['Input'].isna(), '')
#    Vectorized string ops replace the row-by-row apply(lambda ...); the result is
#    the same for every non-missing input.
df['input_data_length'] = df['Input'].str.replace('0x', '', regex=False).str.len()

# c. 'ContractAddress': binary flag, 1 when a contract address is present, 0 when it is missing.
df['is_contract_interaction'] = df['ContractAddress'].notna().astype(int)

# d. 'Class': This is our target. Assuming 1 for Phishing, 0 for Safe.
# TODO: Verify this! If 'Class' is "Phishing" and "Safe" strings, uncomment the next line
# df['Class'] = df['Class'].apply(lambda x: 1 if x == 'Phishing' else 0)

print("Engineered features 'input_data_length' and 'is_contract_interaction'.")

# --- 4. Feature Selection & Preprocessing ---
# These are the "inputs" for the model. We use our *newly engineered* features.
features = ['Value', 'input_data_length', 'is_contract_interaction']
target = 'Class' # This is your 'is_phishing' column

print(f"Using Features: {features}")
print(f"Using Target: {target}")

X = df[features]
y = df[target]

# --- 5. Split Data ---
# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# mean/variance of the held-out rows leak into preprocessing, which makes the
# evaluation optimistic and bakes test-set statistics into the saved scaler.
# The fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling parameters from the training rows only, then apply the
# same (already fitted) transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so that any later code expecting the full scaled matrix still finds it;
# it is produced with the train-only statistics learned above.
X_scaled = scaler.transform(X)

print("Features selected and scaled.")
print(f"Data split: {len(X_train)} training, {len(X_test)} testing.")

# --- 6. Train the Model ---
# 100-tree random forest; the fixed seed makes training reproducible.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
print("Model training complete.")

# --- 7. Evaluate the Model ---
# Predict on the held-out rows and print the per-class report.
y_pred = model.predict(X_test)
print("\n--- Model Evaluation ---")
report_text = classification_report(y_test, y_pred)
print(report_text)

# --- 8. Save the Model and Scaler ---
# Both artifacts are written first; the confirmations are printed afterwards,
# so a failed write surfaces before any "saved" message appears.
artifacts = (
    ("Model", model, MODEL_PATH),
    ("Scaler", scaler, SCALER_PATH),
)
for _, artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)
for label, _, artifact_path in artifacts:
    print(f"{label} saved to {artifact_path}")
print("\nSUCCESS! You can now run the backend API.")

--- ML Model Trainer (using local CSV) ---
Successfully loaded dataset from ..\data\transaction_dataset.csv
Original columns: ['TxHash', 'BlockHeight', ' TimeStamp', 'From', 'To', 'Value', 'ContractAddress', 'Input', 'Class']
Engineered features 'input_data_length' and 'is_contract_interaction'.
Using Features: ['Value', 'input_data_length', 'is_contract_interaction']
Using Target: Class
Features selected and scaled.
Data split: 93887 training, 23472 testing.
Model training complete.

--- Model Evaluation ---
              precision    recall  f1-score   support

           0       0.90      0.83      0.86     15989
           1       0.69      0.81      0.74      7483

    accuracy                           0.82     23472
   macro avg       0.80      0.82      0.80     23472
weighted avg       0.84      0.82      0.83     23472

Model saved to phishing_classifier.pkl
Scaler saved to feature_scaler.pkl

SUCCESS! You can now run the backend API.
