# Importing necessary libs, data and preprocessing

## Import libs and data

In [2]:
import numpy as np
import pandas as pd
import seaborn as  sns
import matplotlib.pyplot as plt

# Read the transactions table from the CSV stored next to the notebook.
df = pd.read_csv(filepath_or_buffer='dataset/creditcard.csv')

# Ten-colour "RdPu" seaborn palette.
colors = sns.color_palette(palette="RdPu", n_colors=10)


In [10]:
# Features are every column except the label; the target is the 'Class' column.
# NOTE(review): this cell carries execution count 10 while the cells below it
# carry 4-8, so it looks like it was last run after them. On a top-to-bottom
# run X is built from `df` as it stands at this point (before the scaling
# cell) -- confirm which version of the features is intended.
X, y = df.drop(columns='Class'), df['Class']

## Scaling amount and time features

In [4]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler (median / IQR based by default) is less sensitive to outliers
# than StandardScaler, so it is the one applied to Amount and Time below.
std_scaler = StandardScaler()
rob_scaler = RobustScaler()

# Scale each raw column on its own. The same scaler object is refit for each
# column, so after this cell `rob_scaler` holds the statistics of 'Time' only.
scaled_amount = pd.Series(
    rob_scaler.fit_transform(df['Amount'].values.reshape(-1, 1)).ravel(),
    index=df.index,
    name='scaled_amount',
)
scaled_time = pd.Series(
    rob_scaler.fit_transform(df['Time'].values.reshape(-1, 1)).ravel(),
    index=df.index,
    name='scaled_time',
)

# Replace the raw columns with their scaled versions, placed at the front of
# the frame (positions 0 and 1). `df` is modified in place.
df.drop(columns=['Time', 'Amount'], inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

## Random Under-Sampling

In [5]:
# Shuffle the data before creating the subsamples. The shuffle is seeded so
# that the non-fraud rows selected below are the same on every run.
df = df.sample(frac=1, random_state=42)

# Keep every fraud row and an equally sized slice of the shuffled non-fraud
# rows. The size is taken from the data instead of being hard-coded (the
# original cell assumed 492 fraud rows).
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0].iloc[:len(fraud_df)]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle the combined rows so the two classes are interleaved.
undersampled_df = normal_distributed_df.sample(frac=1, random_state=42)

## Removing outliers

In [6]:
def remove_fraud_iqr_outliers(frame, feature, label_col='Class', fraud_label=1, whisker=1.5):
    """Drop rows whose `feature` value lies outside the fraud-class IQR fences.

    The quartiles are computed only from rows where
    ``frame[label_col] == fraud_label``, but every row (whatever its label)
    whose value falls outside ``[q25 - whisker * IQR, q75 + whisker * IQR]``
    is removed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing `feature` and `label_col`. It is not modified.
    feature : str
        Column to trim.
    label_col : str, default 'Class'
        Column holding the class label.
    fraud_label : default 1
        Label value identifying the rows used to compute the quartiles.
    whisker : float, default 1.5
        Multiplier applied to the IQR to obtain the cut-off distance.

    Returns
    -------
    trimmed : pandas.DataFrame
        `frame` without the out-of-fence rows.
    bounds : tuple
        ``(lower, upper)`` fences that were applied.
    outliers : list
        Fraud-class values of `feature` that fell outside the fences.
    """
    fraud_values = frame.loc[frame[label_col] == fraud_label, feature].values
    if fraud_values.size == 0:
        # Without fraud rows there are no quartiles to derive; keep everything.
        return frame, (float('nan'), float('nan')), []

    q25, q75 = np.percentile(fraud_values, [25, 75])
    cut_off = (q75 - q25) * whisker
    lower, upper = q25 - cut_off, q75 + cut_off

    outliers = [x for x in fraud_values if x < lower or x > upper]
    # Same row selection as the original drop-by-index: strictly outside the fences.
    outside = (frame[feature] > upper) | (frame[feature] < lower)
    return frame.loc[~outside], (lower, upper), outliers


# Trim V14 (noted by the original author as the feature most negatively
# correlated with the label), then V12, then V10. The passes are sequential:
# each one computes its quartiles on the frame left by the previous pass.
# NOTE: `undersampled_df` is reassigned, so re-running this cell trims again.
outlier_bounds = {}
for feature in ('V14', 'V12', 'V10'):
    undersampled_df, bounds, outliers = remove_fraud_iqr_outliers(undersampled_df, feature)
    outlier_bounds[feature] = bounds

# Keep the per-feature fence names that the original cell defined.
v14_lower, v14_upper = outlier_bounds['V14']
v12_lower, v12_upper = outlier_bounds['V12']
v10_lower, v10_upper = outlier_bounds['V10']

## Dimensionality Reduction and Clustering for undersampled data

In [7]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD

# Split the under-sampled frame into the label and the remaining features.
y_undersampled = undersampled_df['Class']
X_undersampled = undersampled_df.drop(columns='Class')

# All three reducers project the same feature array onto 2 components and
# share the same seed.
undersampled_values = X_undersampled.values
reducer_kwargs = dict(n_components=2, random_state=42)

# t-SNE embedding
X_reduced_tsne = TSNE(**reducer_kwargs).fit_transform(undersampled_values)

# PCA projection
X_reduced_pca = PCA(**reducer_kwargs).fit_transform(undersampled_values)

# Truncated SVD projection (randomized solver)
X_reduced_svd = TruncatedSVD(algorithm='randomized', **reducer_kwargs).fit_transform(undersampled_values)



In [8]:
from sklearn.model_selection import train_test_split

# Select features and label by column name rather than by position. The
# scaling cell moves 'scaled_amount' to column 0, so a positional slice such
# as iloc[:, 1:-1] would silently leave that feature out.
# 80/20 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Class'),
    df['Class'],
    test_size=0.2,
    random_state=42,
)

## Oversampling imbalanced data using RandomOverSampler

In [11]:
from imblearn.over_sampling import RandomOverSampler
# Resample X / y with a RandomOverSampler seeded with 0; the resampled
# features and labels are kept as X_oversampled_ros / y_oversampled_ros.
# NOTE(review): X and y are built from the whole of `df`, not from a training
# split, so a test set carved out of the resampled result afterwards could
# share rows with the training data -- confirm how later cells split.
ros = RandomOverSampler(random_state=0)
X_oversampled_ros, y_oversampled_ros = ros.fit_resample(X,y) 

## Oversampling using SMOTE Technique

In [12]:
from imblearn.over_sampling import SMOTE

# Resample X / y with SMOTE. The sampler is seeded so the resampled output is
# the same on every run, like the other seeded steps in this notebook.
smote = SMOTE(random_state=42)
X_oversampled_smote, y_oversampled_smote = smote.fit_resample(X, y)

## Split data into training and test sets