In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib
import plotly.express as px

In [None]:
# Load the raw training and test workbooks (paths are relative to the notebook).
train, test = (
    pd.read_excel(workbook)
    for workbook in ('2.1. fraudTrain.xlsx', '2.2. fraudTest.xlsx')
)

In [None]:
# Pandas display limits: show every column and up to 150 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)

# Seaborn grid style; applied before the rcParams overrides below.
sns.set_style('whitegrid')

# plt.rcParams and matplotlib.rcParams are the same object, so a single
# update covers every plotting default set in this cell.
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # zero alpha: no figure background colour
    'xtick.color': 'white',
    'ytick.color': 'white',
})

In [None]:
from sklearn.model_selection import train_test_split

# Draw a seeded 10% sample of the training data, then split that sample
# into 75% training rows and 25% validation rows.
train_sample, train_val = train_test_split(
    train.sample(frac=0.1, random_state=42),
    test_size=0.25,
    random_state=42,
)

In [None]:
# (rows, columns) of the training and validation splits, in that order.
tuple(split.shape for split in (train_sample, train_val))

In [None]:
# Preview the first five rows of the training split.
train_sample.head(n=5)

In [None]:
# Number of distinct values per column, before any columns are removed.
train_sample.nunique(axis=0)

In [None]:
# Columns removed from every split: the index artefact, transaction id,
# timestamps, merchant coordinates and cardholder name fields.
columns_to_drop = [
    'Unnamed: 0', 'trans_num', 'unix_time', 'merch_lat', 'merch_long',
    'trans_date_trans_time', 'first', 'last',
]

# Rebind each name to a new frame instead of mutating in place, and ignore
# columns that are already gone, so this cell can be re-run without a KeyError.
# NOTE: errors='ignore' also hides a misspelled column name; check the
# resulting columns if this list is edited.
train_sample = train_sample.drop(columns=columns_to_drop, errors='ignore')
test = test.drop(columns=columns_to_drop, errors='ignore')
train_val = train_val.drop(columns=columns_to_drop, errors='ignore')

In [None]:
from datetime import datetime

# Capture the reference year once so all three splits use the same value.
# NOTE(review): ages depend on the date the notebook is run; consider pinning
# a fixed reference year if raw ages need to be reproducible.
reference_year = datetime.now().year


def add_age_drop_dob(frame, year):
    """Replace the `dob` column of `frame` with an `age` column.

    `age` is `year` minus the birth year taken from `dob`, which must be
    datetime-typed because the `.dt` accessor is used. A new frame is
    returned with `age` appended and `dob` removed. A frame that has no
    `dob` column is returned as-is, so the cell is safe to re-run.
    """
    if 'dob' not in frame.columns:
        return frame
    return frame.assign(age=year - frame['dob'].dt.year).drop(columns=['dob'])


train_sample = add_age_drop_dob(train_sample, reference_year)
test = add_age_drop_dob(test, reference_year)
train_val = add_age_drop_dob(train_val, reference_year)

In [None]:
# Distinct values per column for the columns that remain after the drops.
train_sample.nunique(axis=0)

In [None]:
# Grouped histogram of city population, coloured by the fraud label.
# Change `x` to another column to inspect its distribution instead.
px.histogram(
    data_frame=train_sample,
    x='city_pop',
    color='is_fraud',
    barmode='group',
)

In [None]:
# Seeded 5% sample of the test data.
test_sample = test.sample(random_state=42, frac=0.05)

In [None]:
# (rows, columns) of the training split, the test sample and the validation split.
tuple(split.shape for split in (train_sample, test_sample, train_val))

In [None]:
# Partition the training columns by dtype: numeric vs. object (string-like).
numerical_cols = train_sample.select_dtypes(include=np.number).columns
categorical_cols = train_sample.select_dtypes(include='object').columns

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
correlation_matrix = train_sample.loc[:, numerical_cols].corr()
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)

In [None]:
# First five rows, numeric columns only.
train_sample.loc[:, numerical_cols].head(n=5)

In [None]:
# First five rows, categorical columns only.
train_sample.loc[:, categorical_cols].head(n=5)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The encoder is fitted on the training sample only. With the default
# handle_unknown='error', transforming any other split that contains a
# category absent from this sample raises; 'ignore' encodes such values as
# all zeros instead. The encoding of the training sample itself is unchanged.
encoder = OneHotEncoder(handle_unknown='ignore').fit(train_sample[categorical_cols])

# One output column name per (feature, category) pair, in transform order.
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [None]:
# Build the one-hot block as its own frame and attach it with a single concat.
# Assigning a wide 2-D array to many new column labels inserts the columns one
# at a time, which can trigger pandas' "highly fragmented" PerformanceWarning.
encoded_frame = pd.DataFrame(
    encoder.transform(train_sample[categorical_cols]).toarray(),
    columns=encoded_cols,
    index=train_sample.index,
)

# Drop one-hot columns left over from a previous run so re-running the cell
# replaces them instead of duplicating them.
train_sample = pd.concat(
    [train_sample.drop(columns=encoded_cols, errors='ignore'), encoded_frame],
    axis=1,
)

In [None]:
# (rows, columns) of the training split after the one-hot columns were added.
(len(train_sample.index), len(train_sample.columns))

In [None]:
target_col = 'is_fraud'

# numerical_cols was built from every numeric-dtype column, so it also picks
# up the target when it is stored as a number. Leaving it in would standardise
# the labels and feed them to the model as a feature. Remove it together with
# the cc_num identifier. errors='ignore' keeps the cell safe to re-run.
numerical_cols = numerical_cols.drop(['cc_num', target_col], errors='ignore')

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same transformation to every split. Fitting a separate scaler per split
# puts each one on a different scale and uses held-out data statistics.
scaler = StandardScaler().fit(train_sample[numerical_cols])

train_sample[numerical_cols] = scaler.transform(train_sample[numerical_cols])
train_val[numerical_cols] = scaler.transform(train_val[numerical_cols])
# NOTE(review): this scales `test`; `test_sample` was drawn from `test` in an
# earlier cell and is not touched here.
test[numerical_cols] = scaler.transform(test[numerical_cols])

In [None]:
# Display the scaled numeric columns of the training split.
train_sample.loc[:, numerical_cols]

In [None]:
# (rows, columns) of the one-hot encoded block.
train_sample.loc[:, encoded_cols].shape

In [None]:
# (rows, columns) of the numeric block.
train_sample.loc[:, numerical_cols].shape

In [None]:
# Feature matrix: one-hot columns first, followed by the numeric columns.
x_train = train_sample[[*encoded_cols, *numerical_cols]]

In [None]:
# (rows, features) of the feature matrix.
np.shape(x_train)

In [None]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (here 95%).
pca = PCA(n_components=0.95).fit(x_train)

# Number of components PCA actually kept; displayed as the cell output.
n_components = pca.n_components_
n_components

In [None]:
# Fit a PCA with no component limit to inspect the full explained-variance curve.
# NOTE: this rebinds `pca`; `n_components` from the 95%-variance fit is unaffected.
pca = PCA().fit(x_train)
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Position i of the cumulative array covers i + 1 components, so plot against
# 1..N; otherwise the "Number of Components" axis is off by one.
component_counts = np.arange(1, len(cumulative_variance_ratio) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance_ratio, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Components')
plt.grid(True)

# Smallest component count whose cumulative explained variance reaches the target.
desired_variance = 0.95
components_for_desired_variance = np.flatnonzero(cumulative_variance_ratio >= desired_variance)[0] + 1
print(f"Number of components to retain {desired_variance*100}% variance: {components_for_desired_variance}")

In [None]:
from sklearn.decomposition import IncrementalPCA

# Keep the fitted reducer so other splits can later be projected onto the same
# components with ipca.transform(...); the original discarded it after use.
ipca = IncrementalPCA(n_components=n_components, batch_size=n_components)
reduced_cols = ipca.fit_transform(x_train)

In [None]:
# (rows, retained components) of the reduced feature matrix.
np.shape(reduced_cols)

In [None]:
from sklearn.ensemble import IsolationForest

# Keep the fitted model so it can score other data or provide continuous
# anomaly scores later; the original discarded it after fit_predict.
iso_forest = IsolationForest(n_estimators=200, random_state=42, contamination=0.01)
raw_labels = iso_forest.fit_predict(reduced_cols)

# fit_predict labels outliers as -1 and inliers as 1. Convert to the target's
# convention in one step: 1 = flagged as anomalous, 0 = not flagged.
predictor = np.where(raw_labels == -1, 1, 0)

In [None]:
# Distinct predicted labels and how many rows received each one.
np.unique(predictor, return_index=False, return_inverse=False, return_counts=True)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the anomaly flags against the labels
# of the same training rows the model was fitted on.
print(
    classification_report(
        y_true=train_sample[target_col],
        y_pred=predictor,
    )
)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
confusion_matrix(
    y_true=train_sample[target_col],
    y_pred=predictor,
)

In [None]:
from sklearn.metrics import roc_auc_score

# NOTE(review): `predictor` holds hard 0/1 labels rather than continuous
# anomaly scores, so this value reflects one operating point, not a ranking
# across thresholds.
auc_roc_score = roc_auc_score(
    y_true=train_sample[target_col],
    y_score=predictor,
)
print("AUC-ROC Score:", auc_roc_score)