In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [14]:
# Read the cleaned training and test tables; the first CSV column of each
# file is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./Data/cleaned_train_tag.csv', './Data/cleaned_test_tag.csv')
)

In [3]:
# Preview the first five rows of the training table (includes the `Flag` target).
train.head(n=5)

Unnamed: 0,Gender,Age,Marital Status,Education,Years Worked,Commercial,Shareholder,Has App,Owns Car,Owns House,...,Overdue Loans,Overdue Days,Has Credit Cards,Credit Use,Debit Cards,Credit Cards,Debit,Credit,Debit Card Level,Flag
0,M,23,Married,Doctorate,0,0,0,0,0,0,...,0,0,0,0,0,1,-1,325,0,0
1,M,26,Married,Doctorate,0,0,0,1,0,0,...,0,0,0,0,0,1,-1,1683,0,0
2,M,46,Married,Doctorate,0,0,0,0,0,0,...,0,0,0,0,0,1,-1,466,0,0
3,M,30,Married,Doctorate,1,0,0,0,1,0,...,0,0,1,2,0,3,-1,167,0,0
4,M,30,Married,Doctorate,0,0,0,0,0,0,...,0,0,0,0,0,2,-1,1,0,0


In [15]:
# Preview the first five rows of the test table.
test.head(n=5)

Unnamed: 0,Gender,Age,Marital Status,Education,Years Worked,Commercial,Shareholder,Has App,Owns Car,Owns House,...,Total Loans,Overdue Loans,Overdue Days,Has Credit Cards,Credit Use,Debit Cards,Credit Cards,Debit,Credit,Debit Card Level
0,F,39,Never Married,Upper Secondary,0,0,0,1,1,0,...,2,0,0,1,4,3,4,4378,683,20
1,F,40,Never Married,Masters,1,0,0,1,0,0,...,0,0,0,0,1,1,1,1078,1637,10
2,F,48,Never Married,Upper Secondary,0,0,1,1,1,1,...,0,0,0,1,5,5,3,6361,4809,20
3,M,47,Never Married,Bachelors,0,0,1,1,1,0,...,0,0,0,1,5,8,3,8032,5447,40
4,F,36,Never Married,Post-secondary,2,0,0,1,1,0,...,0,0,0,1,5,4,2,5885,1748,10


In [4]:
# Separate the features from the target.
# The training table is loaded as `train`; the previous `data` name is never
# defined in this notebook, so the cell failed on a fresh kernel.
X = train.drop(columns='Flag')  # every column except the label
y = train['Flag']               # label column used as the classification target

In [5]:
# Preview the first five rows of the feature matrix (no `Flag` column).
X.head(n=5)

Unnamed: 0,Gender,Age,Marital Status,Education,Years Worked,Commercial,Shareholder,Has App,Owns Car,Owns House,...,Total Loans,Overdue Loans,Overdue Days,Has Credit Cards,Credit Use,Debit Cards,Credit Cards,Debit,Credit,Debit Card Level
0,M,23,Married,Doctorate,0,0,0,0,0,0,...,0,0,0,0,0,0,1,-1,325,0
1,M,26,Married,Doctorate,0,0,0,1,0,0,...,0,0,0,0,0,0,1,-1,1683,0
2,M,46,Married,Doctorate,0,0,0,0,0,0,...,0,0,0,0,0,0,1,-1,466,0
3,M,30,Married,Doctorate,1,0,0,0,1,0,...,0,0,0,1,2,0,3,-1,167,0
4,M,30,Married,Doctorate,0,0,0,0,0,0,...,0,0,0,0,0,0,2,-1,1,0


In [7]:
# Preview the first five target values.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: Flag, dtype: int64

As we saw in the EDA, the dataset is imbalanced (there are fewer fraudulent cases than non-fraudulent ones), so we begin with two methods of dealing with this:
1. SMOTE (Synthetic Minority Over-sampling Technique)
2. Undersampling the majority class to match the size of the minority class

In [8]:
# Apply random undersampling to reduce the size of the majority class.
# A fixed random_state keeps the sampled subset identical on every
# Restart & Run All, so downstream scores are reproducible.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

In [None]:
# Logistic-regression baseline in two parts:
#   1. a single hold-out evaluation on the randomly undersampled data;
#   2. a 5-fold cross-validated F1 score on the full data set.

# LogisticRegression needs numeric input, so one-hot encode the string-valued
# columns (e.g. Gender, Marital Status, Education) before fitting.
X_rus_encoded = pd.get_dummies(X_rus)
X_encoded = pd.get_dummies(X)

# --- 1. Hold-out evaluation on the undersampled data ------------------------
# random_state pins the split so the report below is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_rus_encoded, y_rus, test_size=0.36, random_state=42
)

# max_iter is raised above the default because the features are not scaled
# and lbfgs may need more iterations to converge.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Score on the validation split created above (the earlier code referenced
# X_test / y_test, which were not defined at this point).
y_pred = lr.predict(X_val)
print(classification_report(y_val, y_pred))

# --- 2. 5-fold cross-validation ---------------------------------------------
# NOTE(review): as in the earlier version of this cell, the folds are drawn
# from the full (non-resampled) data; confirm whether the training part of
# each fold should be undersampled as well.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
f1_lr = []

for train_index, test_index in kf.split(X_encoded):
    # KFold yields positional row indices, so rows must be selected with .iloc
    # (plain [] indexing on a DataFrame looks up column labels instead).
    X_fold_train, X_fold_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    fold_lr = LogisticRegression(solver='lbfgs', max_iter=1000)
    fold_lr.fit(X_fold_train, y_fold_train)
    predictions = fold_lr.predict(X_fold_test)
    f1_lr.append(f1_score(y_fold_test, predictions))

# Summarise the per-fold F1 scores.
mean_lr = np.mean(f1_lr)
std_lr = np.std(f1_lr)
print('Mean F1 score is ', mean_lr, ' with a std. dev of ', std_lr)

SMOTE requires all categorical features to be one-hot encoded.

In [9]:
# One-hot encode each categorical column into its own indicator frame.
# The training table is loaded as `train`; the previous `data` name is never
# defined in this notebook, so the cell failed on a fresh kernel.
gender_dummies = pd.get_dummies(train['Gender'])
education_dummies = pd.get_dummies(train['Education'])
marital_dummies = pd.get_dummies(train['Marital Status'])

In [10]:
# Assemble the feature matrix for SMOTE: remove the raw categorical columns
# and append their one-hot indicator frames side by side.
# NOTE(review): 'Age' is dropped here too although it is numeric - confirm
# this is intentional.
dropped_columns = ['Gender', 'Age', 'Marital Status', 'Education']
feature_parts = [
    X.drop(columns=dropped_columns),
    gender_dummies,
    education_dummies,
    marital_dummies,
]
X_1 = pd.concat(feature_parts, axis=1)
X_1.head()

Unnamed: 0,Years Worked,Commercial,Shareholder,Has App,Owns Car,Owns House,Wages Paid,Last Payment,Investment Risk,Total Asset Code,...,Bachelors,Doctorate,Masters,Post-secondary,Upper Secondary,Divorced,Married,Never Married,Separated,Widowed
0,0,0,0,0,0,0,0,-1,0,-1,...,0,1,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,-1,0,-1,...,0,1,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,-1,0,-1,...,0,1,0,0,0,0,1,0,0,0
3,1,0,0,0,1,0,0,-1,0,-1,...,0,1,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,-1,0,-1,...,0,1,0,0,0,0,1,0,0,0


In [11]:
# Apply SMOTE to oversample the minority class
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X_1, y)