In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the train and test data.
# The data is available at https://www.kaggle.com/c/ieee-fraud-detection/data

In [None]:
# Read the raw transaction and identity tables for both splits.
train_transaction = pd.read_csv("train_transaction.csv")
train_identity = pd.read_csv("train_identity.csv")
test_transaction = pd.read_csv("test_transaction.csv")
test_identity = pd.read_csv("test_identity.csv")

# Left-join identity onto transactions so every transaction row is kept,
# whether or not it has a matching identity record.
train = train_transaction.merge(train_identity, how='left', on=['TransactionID'])
test = test_transaction.merge(test_identity, how='left', on=['TransactionID'])

# Quick look at the first rows of the merged training frame.
print(train.head())

In [None]:
# Handle missing values in the train and test datasets.
# Keep only the train columns that contain no missing values at all.
train = train.dropna(axis=1)
print(train.columns)

# Restrict test to the surviving feature columns, i.e. every kept train
# column except the target 'isFraud'.
features_col = list(train.columns)
features_col.remove('isFraud')
# .copy() gives test its own data: later cells add new columns to it, and
# assigning into a bare column-subset selection triggers pandas'
# SettingWithCopyWarning.
test = test[features_col].copy()

# Sanity check: per-column null counts in train (all zero after dropna above).
print(train.isnull().sum())


In [None]:
# Exploratory analysis and data preprocessing start here.
# Split the training frame by dtype: numeric columns on one side, everything
# else on the other. The TransactionID column is left out of the numeric set.
numeric_data = (
    train
    .select_dtypes(include=[np.number])
    .drop(columns=['TransactionID'])
)
cat_data = train.select_dtypes(exclude=[np.number])

In [None]:
# Correlation heatmap of the numeric columns, drawn with seaborn.
# The figure is closed without being shown (plt.show() is left disabled).
corr = numeric_data.corr()
plt.subplot()
sns.heatmap(corr)
#plt.show()
plt.close()

# Rank every numeric column by its correlation with the target, highest first.
fraud_corr = corr['isFraud'].sort_values(ascending=False)
print (fraud_corr.head(10), '\n') # first 10 entries of the ranking
print ('----------------------')
print (fraud_corr.tail(5)) # last 5 entries of the ranking
print('Data was successfully merged!\n')

# The raw per-table frames are no longer referenced below; drop the names.
del train_identity
del train_transaction
del test_identity
del test_transaction

print(f'Train dataset has {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'Test dataset has {test.shape[0]} rows and {test.shape[1]} columns.')

In [None]:
# New feature - day of week in which a transaction happened.
# NOTE(review): the arithmetic assumes TransactionDT is an offset in seconds;
# which calendar weekday index 0 maps to is not established here - confirm.
train['Transaction_day_of_week'] = np.floor((train['TransactionDT'] / (3600 * 24) - 1) % 7)
test['Transaction_day_of_week'] = np.floor((test['TransactionDT'] / (3600 * 24) - 1) % 7)

# New feature - hour of the day in which a transaction happened.
train['Transaction_hour'] = np.floor(train['TransactionDT'] / 3600) % 24
test['Transaction_hour'] = np.floor(test['TransactionDT'] / 3600) % 24

# Bar plot: number of fraudulent transactions per hour of the day.
# Plot kinds are lowercase in pandas ('bar'); 'Bar' is rejected with a
# ValueError. The string aggregator 'sum' replaces the deprecated np.sum.
pd.pivot_table(train, index=['Transaction_hour'], values=['isFraud'], aggfunc='sum').plot(kind='bar')
# #plt.show()
plt.close()

In [None]:
# Integer-encode every object-dtype column of train, and the same column of test.
# Each encoder is fitted on the train and test values together, so both frames
# share one mapping and transform() never meets an unseen label.
object_cols = [name for name in train.columns if train[name].dtype == 'object']
for col in object_cols:
    train_values = list(train[col].astype(str).values)
    test_values = list(test[col].astype(str).values)
    le = LabelEncoder()
    le.fit(train_values + test_values)
    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)


In [None]:
# Build the model inputs.
# The target is split off from train, and TransactionDT is dropped from both
# frames (the hour and day-of-week features were derived from it earlier).
y_train = train['isFraud']
X_train = train.drop(['isFraud', 'TransactionDT'], axis=1)
test = test.drop(['TransactionDT'], axis=1)
# Replace any missing values left in test with the sentinel -999.
X_test = test.fillna(value=-999)
# Exploratory analysis and data preprocessing end here.


In [None]:
# Fit a logistic-regression classifier on the training features and target.
logreg = LogisticRegression()
model = logreg.fit(X_train, y_train)
print(model)
# predict() takes only the feature matrix. Select exactly the training columns:
# this cell appends a 'Predicted' column to X_test below, so on a re-run the
# frame would otherwise carry one column more than the model was fitted on
# and predict() would fail.
predictions = logreg.predict(X_test[X_train.columns])
# Show the first 20 predicted isFraud labels.
print(predictions[0:20])
# Attach the predictions to the test features and save the result to CSV.
X_test['Predicted'] = predictions
X_test.to_csv("fraud_predicted_on_test_data.csv",index=False)