# Part A: Data preprocessing

### A1 Importing packages and dataset

In [None]:
from platform import python_version
# Display the version of the Python interpreter running this notebook.
python_version() 

In [None]:
# import packages
import pandas as pd
import numpy as np

# Read the identity and transaction training tables from CSV files
# located in the current working directory (identity first, then transaction).
train_identity, train_transaction = (
    pd.read_csv(csv_name)
    for csv_name in ("train_identity.csv", "train_transaction.csv")
)

In [None]:
# Inner-join the two tables on TransactionID: the transaction table is
# re-indexed by TransactionID and matched against the identity table's
# TransactionID column, so only ids present in both tables are kept.
train_df = train_identity.join(
    train_transaction.set_index('TransactionID'),
    on='TransactionID',
    how='inner',
)
train_df

### A2 Reindexing dataset

In [None]:
# Reorder the columns so that the target 'isFraud' comes last; every other
# column keeps its current relative position.
feature_cols = [name for name in train_df.columns if name != 'isFraud']
train_df2 = train_df.reindex(columns=feature_cols + ['isFraud'])
train_df2

### A3 Delete features with more than 30% missing data

In [None]:
# Identify and drop the features for which more than 30% of the rows are missing.
# The missing-value ratio is computed once and reused for the report, the
# positional indices and the drop.
MISSING_THRESHOLD = 0.30
missing_ratio = train_df2.isnull().sum()/train_df2.shape[0]

# Boolean Series indexed by column name: True where the column exceeds the threshold.
xx = missing_ratio > MISSING_THRESHOLD

print("Features with more than 30% data missing:",int(xx.sum()))
print()
print(xx)
print()
# Positional indices of the flagged columns (np.where returns a 1-tuple of arrays).
delfeat = np.where(xx)
print(delfeat)

# Drop the flagged columns into a new frame; train_df2 itself is not modified.
train_df3 = train_df2.drop(columns=xx.index[xx])
train_df3

### A4 Converting categorical features

In [None]:
# Keep only the non-numeric columns; these are the categorical candidates
# inspected in the next cell.
train_cat = train_df3.select_dtypes(exclude=[np.number])
train_cat

In [None]:
# Print the value counts of every categorical column, one after another, to see
# which ones are binary and can therefore be encoded as 0/1.
# Iterating over the columns themselves (instead of fixed positions 0..16)
# keeps this cell correct if the number of categorical columns changes.
for _, cat_values in train_cat.items():
    print(cat_values.value_counts())
    print()

# Summary of the output above (column names as used by the drop/encode cells below):
# two values: id_12, id_16, id_28, id_29, id_35, id_36, id_37, id_38, DeviceType
# three values: id_15, card6
# more than three values: id_31, DeviceInfo, ProductCD, card4, P_emaildomain, R_emaildomain

In [None]:
# Drop the categorical features that take more than two distinct values.
# drop() without inplace returns a new frame, so train_df3 keeps all of its
# columns and this cell can be re-run without a KeyError on already-removed columns.
multi_valued_cats = ['id_15', 'card6','id_31','DeviceInfo','ProductCD','card4','P_emaildomain','R_emaildomain']
train_df4 = train_df3.drop(columns=multi_valued_cats)
train_df4

In [None]:
# Encode the two-valued categorical features as 1/0.
# Each entry maps a column name to the code assigned to each of its labels;
# any value that is not a key of the mapping (including missing values) becomes NaN.
binary_encodings = {
    "id_12": {"Found":1,"NotFound":0},
    "id_16": {"Found":1,"NotFound":0},
    "id_28": {"Found":1,"New":0},
    "id_29": {"Found":1,"NotFound":0},
    "id_35": {"T":1,"F":0},
    "id_36": {"T":1,"F":0},
    "id_37": {"T":1,"F":0},
    "id_38": {"T":1,"F":0},
    "DeviceType": {"desktop":1,"mobile":0},
}

for col_name, codes in binary_encodings.items():
    # A column that is already numeric has been encoded by an earlier run of
    # this cell; mapping it again would look up 0/1 in the string mapping and
    # turn every value into NaN, so it is left as is.
    if pd.api.types.is_numeric_dtype(train_df4[col_name]):
        continue
    train_df4[col_name] = train_df4[col_name].map(codes)

In [None]:
# Display train_df4 as it stands after the encoding cell above.
train_df4

### A5 Delete (quasi-)constant features

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Split features (everything except isFraud) and target 80/20 with a fixed
# seed; train_x is what the variance filter in the next cell is fitted on.
train_x, test_x, train_y, test_y = train_test_split(
    train_df4.drop(columns="isFraud"),
    train_df4["isFraud"],
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Variance filter with threshold 0.01, used to flag (quasi-)constant features.
qcons_filter = VarianceThreshold(threshold=0.01)

# Fit on the training split; the transformed array contains only the kept features.
data_qcons = qcons_filter.fit_transform(train_x)
print(data_qcons.shape)

# get_support() is a boolean mask of the kept features, aligned with
# train_x.columns. Inverting it once yields the rejected columns directly,
# instead of testing every column name for membership in the kept set.
qcons_columns = train_x.columns[~qcons_filter.get_support()].tolist()

# Number of quasi-constant columns
print(len(qcons_columns))

# Names of the quasi-constant columns
for column in qcons_columns:
    print(column)

In [None]:
# Remove the (quasi-)constant features found by the variance filter from the full frame.
train_df6 = train_df4.drop(columns=qcons_columns)
# Only the last expression of a cell is rendered, so the shape is printed explicitly.
print(train_df6.shape)
train_df6

### A6 Drop rows containing missing values

In [None]:
# Keep only the rows that have no missing value in any column.
final_df = train_df6.dropna(axis=0, how="any")
final_df

### A7 Create dataframe without target column isFraud

In [None]:
# Feature matrix: the cleaned frame without the target column.
final_df1 = final_df.drop(columns=["isFraud"])
final_df1

### A8 Create insight about percentage of fraud in the dataset

In [None]:
import matplotlib.pyplot as plt

# Class balance of the target in the cleaned dataset.
# NOTE(review): assumes isFraud is coded 0 = legal, 1 = fraud — confirm against the data dictionary.
fraud2 = final_df["isFraud"]
print(fraud2.value_counts())
print()

# Derive the counts from the data instead of typing them in by hand.
# reindex pins the order to [label 0, label 1] and reports 0 for an absent class.
class_counts = fraud2.value_counts().reindex([0, 1], fill_value=0)
n_transactions = int(class_counts.sum())
# Share of label-1 rows among ALL rows; guarded so an empty frame gives 0 instead of a division error.
fraud_pct = 100 * float(class_counts.loc[1]) / n_transactions if n_transactions else 0.0
print('Percentage fraud: ',str(round(fraud_pct,1))+'%')

# Bar heights in the same order as the tick labels below, so each bar shows
# the count of the label printed underneath it.
data = class_counts.tolist()
bars = (0, 1)
y_pos = np.arange(len(bars))

# Create bars
plt.bar(y_pos, data)
plt.title('Amount of fraudulent transactions')
plt.xlabel('Legal vs. fraud')
plt.ylabel('Number of transactions')

# Create names on the x-axis
plt.xticks(y_pos, bars)

# Show graphic
plt.show()

# Part B: Modelling

## B1 Logistic regression

### B1.1 Logistic regression before SMOTE

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# Features and target taken from the cleaned dataset
X = final_df1
y = final_df.isFraud

# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Standardise the features; the scaler is fitted on the training part only
# and then applied unchanged to the test part.
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_test)

# Cross-validated (cv=5) logistic regression fitted on the scaled training data
from sklearn.linear_model import LogisticRegressionCV
classifier = LogisticRegressionCV(cv =5, max_iter=10000,random_state = 0)
classifier.fit(xtrain, y_train)

# Hard class labels feed the threshold-based metrics; the predicted probability
# of the positive class (second column of predict_proba) feeds ROC AUC, which
# ranks scores and is degraded when given already-thresholded 0/1 labels.
y_pred = classifier.predict(xtest)
y_score = classifier.predict_proba(xtest)[:, 1]

# Evaluation on the held-out test set
print("Logistic regression before SMOTE")
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(y_test, y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print ("Confusion Matrix : \n", cm)

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
print("ROC_auc: ",roc_auc_score(y_test, y_score))
print("Weighted F1_score: ", f1_score(y_test, y_pred, average='weighted'))

### B1.2 Logistic Regression after SMOTE

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# Features and target taken from the cleaned dataset
X = final_df1
y = final_df.isFraud

# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Standardise the features using statistics of the training part only
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_test)

# Balance the classes with SMOTE. Only the training data is resampled;
# the test set is left untouched.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(xtrain, y_train)

# Cross-validated (cv=5) logistic regression fitted on the resampled training data
classifier2 = LogisticRegressionCV(cv =5, max_iter=10000,random_state = 0)
classifier2.fit(X_res, y_res)

# Hard labels for the threshold-based metrics, positive-class probabilities
# for ROC AUC (which needs a continuous score, not 0/1 predictions).
y_pred1 = classifier2.predict(xtest)
y_score1 = classifier2.predict_proba(xtest)[:, 1]

# Evaluation on the held-out test set
print("Logistic Regression after SMOTE")
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred1))

from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(y_test, y_pred1))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred1)

print ("Confusion Matrix : \n", cm)

from sklearn.metrics import roc_auc_score
print("ROC_auc: ",roc_auc_score(y_test, y_score1))
print("Weighted F1_score: ", f1_score(y_test, y_pred1, average='weighted'))

## B2 Elastic Net

### B2.1 Elastic Net hyperparameter tuning

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

# Features and target taken from the cleaned dataset
X = final_df1
y = final_df.isFraud

# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Standardise the features using statistics of the training part only
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_test)

from sklearn.linear_model import LogisticRegression
# scikit-learn's GridSearchCV; imported here so it replaces the same-named
# class imported from `surprise` at the top of this cell.
from sklearn.model_selection import GridSearchCV
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including convergence warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Parameter grid: elastic-net penalty with the saga solver, searching over the L1/L2 mix
parameters = {
    'penalty' : ['elasticnet'],
    'solver'  : ['saga'],
    'l1_ratio' : [0.1, 0.3, 0.5, 0.7, 0.9]
}

# max_iter matches the final elastic-net models in the following cells and
# random_state makes the stochastic saga solver reproducible.
logreg = LogisticRegression(max_iter=10000, random_state=0)
clf = GridSearchCV(logreg,                    # model
                   param_grid = parameters,   # hyperparameters
                   scoring='accuracy',        # metric for scoring
                   cv=5)                      # number of folds

# Search on the standardised features: the scaled matrix computed above is
# what the final models are trained on.
clf.fit(xtrain, y_train)


print("Best score: %f using parameters %s" % (clf.best_score_, clf.best_params_))

### B2.2 Elastic Net model before SMOTE

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV

# Features and target taken from the cleaned dataset
X = final_df1
y = final_df.isFraud

# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Standardise the features using statistics of the training part only
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_test)

# Elastic-net penalised logistic regression (saga solver, l1_ratio fixed at 0.1)
from sklearn.linear_model import ElasticNetCV  # not used in this cell; import kept as it was
classifier = LogisticRegressionCV(cv=5, max_iter=10000, penalty='elasticnet', l1_ratios=[0.1], solver='saga', random_state=0)
classifier.fit(xtrain, y_train)

# Hard labels for the threshold-based metrics, positive-class probabilities
# for ROC AUC (which needs a continuous score, not 0/1 predictions).
y_pred = classifier.predict(xtest)
y_score = classifier.predict_proba(xtest)[:, 1]

# Score of the fitted classifier on the test set
score = classifier.score(xtest, y_test)
print(score)

# Evaluation on the held-out test set
print("Elastic Net before SMOTE")
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(y_test, y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print ("Confusion Matrix : \n", cm)

# classifier.score() of a classifier is an accuracy, not R2, so it is labelled
# as such. MSE/RMSE are computed on the predicted 0/1 labels.
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print("Accuracy:{0:.3f}, MSE:{1:.2f}, RMSE:{2:.2f}"
      .format(score, mse, np.sqrt(mse)))

from sklearn.metrics import roc_auc_score
# Imported here so the cell does not depend on an earlier cell having imported it.
from sklearn.metrics import f1_score
print("ROC_auc: ",roc_auc_score(y_test, y_score))
print("Weighted F1_score: ", f1_score(y_test, y_pred, average='weighted'))

### B2.3 Elastic Net model after SMOTE

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV

# Features and target taken from the cleaned dataset
X = final_df1
y = final_df.isFraud

# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Standardise the features using statistics of the training part only
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_test)

# Balance the classes with SMOTE. Only the training data is resampled;
# the test set is left untouched.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(xtrain, y_train)

# Elastic-net penalised logistic regression (saga solver, l1_ratio fixed at 0.1)
classifier2 = LogisticRegressionCV(cv=5, max_iter=10000, penalty='elasticnet', l1_ratios=[0.1], solver='saga', random_state=0)

# Fit on the resampled training data
classifier2.fit(X_res, y_res)

# Hard labels for the threshold-based metrics, positive-class probabilities
# for ROC AUC (which needs a continuous score, not 0/1 predictions).
y_pred1 = classifier2.predict(xtest)
y_score1 = classifier2.predict_proba(xtest)[:, 1]

# Evaluation on the held-out test set
print("Elastic Net after SMOTE")
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred1))

from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(y_test, y_pred1))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred1)

print ("Confusion Matrix : \n", cm)

from sklearn.metrics import roc_auc_score
print("ROC_auc: ",roc_auc_score(y_test, y_score1))
from sklearn.metrics import f1_score
print("Weighted F1_score: ", f1_score(y_test, y_pred1, average='weighted'))

## B3 Random Forest

### B3.1 Hyperparameter tuning Random Forest

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

# Features and target taken from the cleaned dataset
X = final_df1
y = final_df.isFraud

# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Standardise the features using statistics of the training part only
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_test)

# Number of trees in random forest
n_estimators = [10,50,70,100]
# Number of features to consider at every split
max_features = ['log2', 'sqrt', None]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,3,5,8]


param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_leaf': min_samples_leaf,}
print(param_grid)

# Seeded like the final random-forest models in the following cells, so the
# search gives the same result on every run.
rf_Model = RandomForestClassifier(random_state=0)

# scikit-learn's GridSearchCV; imported here so it replaces the same-named
# class imported from `surprise` at the top of this cell.
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rf_Model, param_grid = param_grid, cv = 5, verbose=2, n_jobs = 4)

rf_Grid.fit(xtrain, y_train)

print("Best score: %f using parameters %s" % (rf_Grid.best_score_, rf_Grid.best_params_))

### B3.2 Random Forest model before SMOTE

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# Features and target taken from the cleaned dataset
X = final_df1
y = final_df.isFraud

# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Standardise the features using statistics of the training part only
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_test)

# Random forest with fixed hyperparameters and seed
classifier = RandomForestClassifier(max_features= None, min_samples_leaf= 1, n_estimators= 70,random_state=0)
classifier.fit(xtrain, y_train)

# Hard labels for the threshold-based metrics, positive-class probabilities
# for ROC AUC (which needs a continuous score, not 0/1 predictions).
y_pred = classifier.predict(xtest)
y_score = classifier.predict_proba(xtest)[:, 1]

# Evaluation on the held-out test set
print("Random forest before SMOTE")
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(y_test, y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print ("Confusion Matrix : \n", cm)

from sklearn.metrics import mean_squared_error  # not used in this cell; import kept as it was

print("ROC_auc: ",roc_auc_score(y_test, y_score))
print("Weighted F1_score: ", f1_score(y_test, y_pred, average='weighted'))
print()

### B3.3 Random Forest model after SMOTE

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# Features and target taken from the cleaned dataset
X = final_df1
y = final_df.isFraud

# 80/20 train/test split with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Standardise the features using statistics of the training part only
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(X_train)
xtest = sc_x.transform(X_test)

# Balance the classes with SMOTE. Only the training data is resampled;
# the test set is left untouched.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(xtrain, y_train)

# Random forest with fixed hyperparameters and seed, fitted on the resampled data
classifier2 = RandomForestClassifier(max_features= None, min_samples_leaf= 1, n_estimators= 70,random_state=0)
classifier2.fit(X_res, y_res)

# Hard labels for the threshold-based metrics, positive-class probabilities
# for ROC AUC (which needs a continuous score, not 0/1 predictions).
y_pred1 = classifier2.predict(xtest)
y_score1 = classifier2.predict_proba(xtest)[:, 1]

# Evaluation on the held-out test set
print("Random forest after SMOTE")
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred1))

from sklearn.metrics import accuracy_score
print ("Accuracy : ", accuracy_score(y_test, y_pred1))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred1)

print ("Confusion Matrix : \n", cm)

print("ROC_auc: ",roc_auc_score(y_test, y_score1))
print("Weighted F1_score: ", f1_score(y_test, y_pred1, average='weighted'))