In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib as plt

In [3]:
# Load the raw sample into a DataFrame.
# A forward slash is used as the path separator: it resolves on every OS
# (Windows included), avoids the invalid "\s" escape sequence that a
# backslash inside a normal string literal produces, and matches the
# "Resources/..." paths used for the CSV outputs in this notebook.
file = "Resources/sample_base.csv"

df = pd.read_csv(file)

In [4]:
# Split the frame into the label (`y`, the "fraud_bool" column) and the
# feature matrix (`X`, every other column of `df`).
y = df["fraud_bool"]
X = df.drop(columns=["fraud_bool"])

In [5]:
# Group the feature column labels by dtype so each group can be routed to
# its own preprocessing pipeline. A column whose dtype is in neither list
# ends up in neither group.
numerical = X.select_dtypes(include=["int64", "float64"]).columns
categorical = X.select_dtypes(include=["object", "bool"]).columns

In [6]:
# Numeric features: fill missing values with the column mean, then
# standardize each column with StandardScaler.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [7]:
# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes categories that were not
# seen during fit encode as all zeros instead of raising at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [8]:
# Route each column group through its pipeline and concatenate the results
# (numeric block first, one-hot block second).
#
# sparse_threshold=0 forces a dense ndarray result. The one-hot step emits a
# sparse matrix, and with the default threshold (0.3) ColumnTransformer would
# return a sparse matrix whenever the stacked output is sparse enough, which
# breaks the later `pd.DataFrame(X_cleaned, columns=...)` construction that
# expects a dense 2-D array.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical),
        ("cat", categorical_pipeline, categorical),
    ],
    sparse_threshold=0,
)

In [9]:
# Apply preprocessing to the data: fit the imputers, scaler and one-hot
# encoder on X and return the transformed feature matrix in one step.
# NOTE(review): the preprocessor is fitted on every row of X; if a train/test
# split is performed downstream, confirm that fitting on all rows is intended.
X_cleaned = preprocessor.fit_transform(X)


In [10]:
# Rebuild the output column labels in the order the ColumnTransformer emits
# them: the numeric columns first, then the names generated by the fitted
# one-hot encoder for the categorical columns.
onehot_encoder = preprocessor.named_transformers_["cat"]["onehot"]
onehot_names = onehot_encoder.get_feature_names_out(categorical)
cleaned_columns = list(numerical) + onehot_names.tolist()

In [14]:
# Wrap the preprocessed matrix in a labelled frame and append the target.
# `y.values` is used so the label is attached by position, not by index.
df_cleaned = pd.DataFrame(X_cleaned, columns=cleaned_columns).assign(
    fraud_bool=y.values
)
# Optional export of the preprocessed (non-PCA) data; uncomment to write it.
#df_cleaned.to_csv("Resources/normalized.csv")

In [12]:


# Unrestricted PCA over the preprocessed features (all components kept).
# NOTE(review): `X_pca` is not used by the cells visible here and `pca` is
# rebound immediately below; confirm this first fit is still needed.
pca = PCA()
X_pca = pca.fit_transform(X_cleaned)

# Refit, keeping only as many components as are needed to explain 90% of
# the variance.
pca = PCA(n_components=0.90)
X_reduced = pca.fit_transform(X_cleaned)

# Attach the label (by position) to the projected features and persist them.
# to_csv is called without index=False, so the row index is written as the
# first CSV column.
df_reduced = pd.DataFrame(X_reduced).assign(fraud_bool=y.values)
df_reduced.to_csv("Resources/PCA.csv")

In [13]:
# Display the PCA-reduced frame: one column per retained component plus the
# "fraud_bool" label (pandas truncates the rendering of large frames).
df_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,fraud_bool
0,-0.282102,1.099428,1.481313,-0.442914,-0.876998,3.858568,-1.904171,0.735522,0.644899,-0.091246,...,-1.778545,-0.976760,-0.633680,1.006806,0.146943,-0.229311,0.045608,-0.457344,-1.744343,0
1,2.959391,-0.715304,0.027905,-0.137106,-0.248278,-1.432453,0.625891,-0.423915,0.137788,0.213532,...,1.089601,0.673118,-0.338153,0.866623,-0.090366,-0.029705,-0.032232,-0.775828,-0.365946,0
2,0.147841,0.131806,-0.894444,-0.658209,0.233696,-1.498730,-0.338417,0.528867,2.255751,-1.096259,...,-0.310140,0.551301,-2.087260,-1.164296,-0.950288,0.255750,0.319637,-0.236910,0.611304,0
3,0.731221,1.148620,2.093847,-0.064521,0.673564,-0.961079,1.585175,0.844304,-0.443269,-0.557694,...,-0.284487,-0.394006,0.443098,0.521741,-0.457607,0.455234,-0.913233,0.815290,0.542470,0
4,3.386940,1.476763,-1.407237,0.157372,-0.535172,-0.560178,-0.641337,-0.304162,0.459441,-0.328488,...,-0.795742,0.106112,0.410353,1.062500,-0.153537,1.097380,-3.415004,0.568089,0.495064,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,-0.756309,-0.286599,-0.628150,-0.302614,-2.806092,1.360697,-0.916936,-0.724435,-0.785748,0.521542,...,0.124222,-1.590328,-0.297200,0.063299,2.173663,0.164401,-0.888895,-0.003652,0.147667,0
199996,-0.407315,4.346276,1.965272,-1.786176,-1.110015,-3.970034,-0.651512,-0.409299,-1.881502,2.403750,...,-2.075251,-0.011331,-0.532898,0.832757,-0.684709,1.202885,-0.708609,-0.905013,-1.839066,0
199997,-3.067658,-1.641912,0.121962,0.602266,-1.562479,-0.435911,-1.543554,0.663270,0.386304,-0.468547,...,0.143248,0.282250,1.318135,0.453252,-1.186537,-0.589364,-0.411417,-0.032493,-0.197299,0
199998,-1.003560,0.455019,-1.910426,-2.153091,1.877976,3.224863,1.348033,-0.497913,0.668164,-2.468295,...,1.926232,-1.283602,1.129442,-1.321086,-0.941903,-0.006127,0.317133,0.919984,-0.764721,0
