In [16]:
from random import random
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [17]:
# Load the raw training CSV. If the file is missing, report it and fall back
# to an empty DataFrame so that `df` is always defined after this cell.
try:
    print("Loading data")
    df = pd.read_csv("../data/raw/fraudTrain.csv")
except FileNotFoundError:
    print("CSV not found")
    df = pd.DataFrame()
else:
    # 'Unnamed: 0' is dropped right after loading, exactly as before
    # (presumably a saved index column — verify against the CSV).
    df = df.drop(columns=["Unnamed: 0"])

Loading data


In [18]:
# Feature engineering: derive time-of-day features from the transaction timestamp.
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
df["hour"] = df["trans_date_trans_time"].dt.hour

# Normal hours are hour values 5..21 inclusive (05:00 through 21:59);
# anything earlier or later is abnormal.
# The flag is 1 for NORMAL hours and 0 for abnormal ones, so that its value
# agrees with the column name. `between` is inclusive on both ends, which keeps
# the same boundaries as the `hour < 5` / `hour > 21` abnormal-hour checks.
# Rows with a missing hour compare False and therefore get 0.
df["is_normal_hour"] = df["hour"].between(5, 21).astype(int)

In [19]:
# Model inputs: the transaction amount and the time-of-day flag.
features = ["amt", "is_normal_hour"]

X = df[features]
y = df["is_fraud"]

# 80/20 train/validation split, stratified on the target so both parts keep
# the same class proportions; the fixed seed makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
    stratify=y,
)

In [20]:
# Hyperparameters: a small forest (10 trees) with depth capped at 5.
params = {"n_estimators": 10, "max_depth": 5}

# Fixed seed so that repeated runs build the same forest.
model = RandomForestClassifier(random_state=1, **params)
model.fit(X_train, y_train)

# Score the held-out validation split.
predictions = model.predict(X_valid)

# normalize=True reports accuracy as a fraction of correct predictions.
acc = accuracy_score(y_valid, predictions, normalize=True)
f1_sc = f1_score(y_valid, predictions)

print(f"Accuracy: {acc}")
print(f"F1 score: {f1_sc}")

Accuracy: 0.9956002853452098
F1 score: 0.4768454837230627
