In [6]:
# Import Dependencies
import numpy as np
import pandas as pd
from glob import glob
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [1]:
# Merge Dataframes
list_csv = glob('/kaggle/input/ieee-fraud-detection/*')
train_id = list_csv[0]
train_transaction = list_csv[4]

df_train_transaction = pd.read_csv(train_transaction, low_memory=False)
df_train_identity = pd.read_csv(train_id, low_memory=False)
df_train = pd.merge(df_train_transaction, df_train_identity, on='TransactionID', how='left')

One of the first things I decided to do, was check how imbalanced this dataset is, considering that Fraud Detection datasets are known to have a high amount of imbalance between isFraud=0 and isFraud=1.

In [2]:
df_isfraud = df_train[df_train.isFraud==1]
df_notfraud = df_train[df_train.isFraud==0]

imbalance_percentage = len(df_isfraud)/len(df_notfraud)
print('class imbalance percentage: ' + str(imbalance_percentage))

class imbalance percentage: 0.03625870143908247


Wow! Only 3% of the rows of this dataframe are identified as isFraud=1, the other 97% is isFraud=0. Very imbalanced! Be careful to acknowledge that predictions can be dramatically biased based on this fact.

In [3]:
# Feature Selection based on the percentage of NA's in a columns
df_na = pd.DataFrame({"na_count":df_train.isnull().sum().sort_values(ascending=False)})
df_na["percentage"]=df_na["na_count"]/len(df_train)
limit=0.15 
selected_features = df_na[df_na["percentage"]<limit].index 
df = df_train[df_train.columns.intersection(selected_features)]

In [4]:
# Handling our Categorical Variables
# Determine which categorical variables should remain, and what they are.
list_categorical_columns = []
list_categorical_columns_remaining = df[df.select_dtypes(include=['object']).columns].columns.tolist()
print('Remaining Categorical Columns:\n' + ', '.join(list_categorical_columns_remaining))

# Create dummy variable dataframes to encode categorical variables into numbers
for categorical_column_name in list_categorical_columns_remaining:
    list_categorical_columns.append(categorical_column_name)

# Concatenate our dummy dataframes into our primary dataframe, 
# then get rid of the non-encoded variables
df_dummies = pd.get_dummies(df[list_categorical_columns])
df = pd.concat([df, df_dummies], axis=1)
df = df._get_numeric_data()

Remaining Categorical Columns:
ProductCD, card4, card6


In [7]:
# training our model
df2 = df.copy()
df2.fillna(value=df2.median(), inplace=True)
X = np.array(df2.drop(['isFraud'], axis=1))
X = preprocessing.scale(X)
y = np.array(df2['isFraud'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

classifier_logistic = LogisticRegression(solver='lbfgs')
classifier_logistic.fit(X_train, y_train)
confidence_logistic = classifier_logistic.score(X_test, y_test)
print('\tLogistic Confidence: ' + str(confidence_logistic))

	Logistic Confidence: 0.9677921901988011


