In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import time

# End-to-end fraud-detection experiment on the first 100000 rows of
# fraudTest.csv: load, derive date-time features, one-hot encode the
# categorical columns, train a random forest, and report evaluation metrics.
start_time = time.time()

print("Loading data...")
# Read only the rows that are actually used instead of parsing the whole file
# and discarding the rest afterwards.
data = pd.read_csv("fraudTest.csv", nrows=100000)
print("Data loaded successfully.")

print("Converting date-time column...")
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
print("Date-time column converted to datetime.")

print("Extracting date-time features...")
timestamps = data['trans_date_trans_time'].dt
data['year'] = timestamps.year
data['month'] = timestamps.month
data['day'] = timestamps.day
data['hour'] = timestamps.hour
data = data.drop(columns='trans_date_trans_time')
print("Date-time features extracted and column dropped.")

print("Dropping non-relevant columns...")
data = data.drop(columns=['first', 'last', 'street', 'dob', 'trans_num'])
print("Non-relevant columns dropped.")

print("Encoding categorical variables...")
data_encoded = pd.get_dummies(data, columns=['merchant', 'category', 'gender', 'city', 'state', 'job'])
print("Categorical variables encoded.")

print("Separating features and target variable...")
# NOTE(review): the printed table shows an 'Unnamed: 0' column (looks like a
# saved row index) and 'cc_num' (an identifier) still present among the
# features - confirm whether they should be excluded from X.
X = data_encoded.drop('is_fraud', axis=1)
y = data_encoded['is_fraud']
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split into training and testing sets.")

print("Training the model...")
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print("Model trained successfully.")

print("Making predictions...")
y_pred = clf.predict(X_test)
print("Predictions made successfully.")

accuracy = accuracy_score(y_test, y_pred)
# Pin the label order so the matrix is always 2x2, even if one class is absent
# from y_test / y_pred; otherwise the 4-way unpacking of ravel() below fails.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Evaluation metrics computed successfully.")

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

# X_test.index holds index *labels*, so select with .loc; .iloc would only be
# correct while the index happens to be an untouched 0..n-1 range.
fraud_indices = X_test[y_test == 1].index
fraudulent_transactions = data.loc[fraud_indices]

print("\nDatapoints where fraud has happened:")
print(fraudulent_transactions)

num_frauds = len(fraud_indices)
print("\nNumber of frauds:", num_frauds)

percentage_frauds = (num_frauds / len(y_test)) * 100
print("Percentage of frauds:", percentage_frauds)

# With labels=[0, 1] the flattened order is TN, FP, FN, TP.
true_negatives, false_positives, false_negatives, true_positives = conf_matrix.ravel()

print("\nConfusion Matrix Insights:")
print("True Negatives:", true_negatives)
print("False Positives:", false_positives)
print("False Negatives:", false_negatives)
print("True Positives:", true_positives)

# Fraud-class precision/recall, guarded against empty denominators.
predicted_frauds = true_positives + false_positives
actual_frauds = true_positives + false_negatives
precision = true_positives / predicted_frauds if predicted_frauds else 0.0
recall = true_positives / actual_frauds if actual_frauds else 0.0

# Derive the conclusions from the measured errors rather than hard-coding them.
print("\nConclusions:")
print("The model performed well in terms of accuracy.")
print(f"However, frauds make up only {percentage_frauds:.2f}% of the test set, so accuracy alone is misleading.")
print(f"Precision (fraud class): {precision:.4f}")
print(f"Recall (fraud class): {recall:.4f}")
if false_negatives > false_positives:
    print("The dominant error is false negatives, which indicates that it incorrectly classified fraudulent transactions as non-fraudulent.")
    print("Further optimization of the model may be necessary to reduce false negatives.")
elif false_positives > false_negatives:
    print("The dominant error is false positives, which indicates that it incorrectly classified non-fraudulent transactions as fraudulent.")
    print("Further optimization of the model may be necessary to reduce false positives.")
else:
    print("False positives and false negatives occur equally often.")

end_time = time.time()
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")


Loading data...
Data loaded successfully.
Converting date-time column...
Date-time column converted to datetime.
Extracting date-time features...
Date-time features extracted and column dropped.
Dropping non-relevant columns...
Non-relevant columns dropped.
Encoding categorical variables...
Categorical variables encoded.
Separating features and target variable...
Splitting data into training and testing sets...
Data split into training and testing sets.
Training the model...
Model trained successfully.
Making predictions...
Predictions made successfully.
Evaluation metrics computed successfully.
Accuracy: 0.99745
Confusion Matrix:
 [[19911     0]
 [   51    38]]

Datapoints where fraud has happened:
       Unnamed: 0               cc_num                            merchant  \
2495         2495     3524574586339330              fraud_Skiles-Ankunding   
86000       86000     4874017206859125                    fraud_Stark-Koss   
89798       89798  4797297220948468262        fraud_Cole,