In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, RidgeClassifier #,Ridge,Lasso,ElasticNet
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score,f1_score
from sklearn.model_selection import train_test_split#,cross_val_score,KFold,RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import NearMiss
from sklearn.datasets import make_classification
import numpy as np
from collections import Counter

In [45]:
df = pd.read_parquet('fraud_detection.parquet')

In [46]:
df.drop(columns = ['SK_ID_CURR','AMT_GOODS_PRICE','CODE_GENDER'],inplace = True)

In [47]:
object_cols = df.select_dtypes(include=['object']).columns

In [48]:
df[object_cols] = df[object_cols].apply(lambda x: x.str.upper())

In [49]:
if (df['AMT_ANNUITY'].dtypes == 'int64') | (df['AMT_ANNUITY'].dtypes == 'float64'):
    print('x')

x


In [50]:
def drop_high_nulls_col(df):
    # Grouped median calculation for filling missing values
    
    for cols in df.columns:
        if (df[cols].notnull().sum() / df.shape[0]) < 0.4:
            df.drop(columns=[cols], inplace=True)
            
    return df

In [51]:
df = drop_high_nulls_col(df)

In [52]:
X_col = df.drop(columns = 'TARGET').columns
y_col = ['TARGET']

In [53]:
group_cols = ['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']

In [54]:
X_train,X_test,y_train,y_test = train_test_split(df[X_col],df[y_col],stratify=df[group_cols],train_size=0.8,test_size = 0.2,random_state=37)

In [55]:
X_train.to_parquet('X_train.parquet',index=False)
X_test.to_parquet('X_test.parquet',index=False)
y_train.to_parquet('y_train.parquet',index=False)
y_test.to_parquet('y_test.parquet',index=False)

In [56]:
def null_dict(df,grouped_cols):
    imputation_dict = {}
    # Calculate the sum of null values for each column
    null_sum = df.isnull().sum()

    # Filter columns where the sum of null values is greater than 0
    columns_with_nulls = null_sum[null_sum > 0].index

    # Select only the columns from the original DataFrame that have more than 0 null values
    null_cols = df.loc[:, columns_with_nulls].columns
    for col in null_cols:
        if (df[col].dtypes == 'int64') | (df[col].dtypes == 'float64'):
            imputation_dict.update({col : df.groupby(grouped_cols)[col].median().to_dict()})
        elif (df[col].dtypes == 'O'):
            imputation_dict.update({col : df.groupby(grouped_cols)[col].agg(pd.Series.mode).to_dict()})
                
    return imputation_dict

In [57]:
imput_dict = null_dict(X_train,group_cols)

In [58]:
def fillnulls(data,grouped_cols,imput_dict):
    imputation_dict = imput_dict
    
    col_x,col_y,col_z =grouped_cols
    
    for col in imputation_dict:
        for val in imputation_dict[col]:
            val_x,val_y,val_z = val
            data.loc[(data[col_x] == val_x) & (data[col_z] == val_z) & (data[col_z] == val_z),col] = imputation_dict[col][val]
    return data

In [59]:
X_train = fillnulls(X_train,group_cols,imput_dict)
X_test = fillnulls(X_test,group_cols,imput_dict)

In [60]:
object_cols = df.select_dtypes(include=['object']).columns

In [61]:
oneh = OneHotEncoder(sparse_output=False)
oneh.fit(X_train[object_cols])

In [62]:
col_names = oneh.get_feature_names_out()

In [63]:
X_train_dummies = pd.DataFrame(oneh.transform(X_train[object_cols]),columns = col_names)
X_train_dummies.index = X_train.index
X_train = pd.merge(X_train,X_train_dummies,left_index=True,right_index=True)

X_test_dummies = pd.DataFrame(oneh.transform(X_test[object_cols]),columns = col_names)
X_test_dummies.index = X_test.index
X_test = pd.merge(X_test,X_test_dummies,left_index=True,right_index=True)

In [64]:
X_train.drop(columns = object_cols,inplace = True)
X_test.drop(columns = object_cols,inplace = True)

In [65]:
def oversample_with_adasyn(X, y, target_minority_samples=100000):
    """
    Oversample the minority class using ADASYN until the minority class reaches the target_minority_samples.
    
    Parameters:
    - X: Feature matrix
    - y: Target vector
    - target_minority_samples: Desired number of samples for the minority class after oversampling
    
    Returns:
    - X_resampled, y_resampled: Feature matrix and target vector after resampling
    """
    # Identify minority class
    classes, counts = np.unique(y, return_counts=True)
    minority_class = classes[np.argmin(counts)]
    current_minority_samples = counts[np.argmin(counts)]

    # Calculate how many new samples we need to generate for the minority class
    if current_minority_samples >= target_minority_samples:
        raise ValueError("The minority class already has more or equal samples than the target_minority_samples.")
    
    n_samples_to_generate = target_minority_samples - current_minority_samples

    # Set the sampling strategy for ADASYN
    # This will tell ADASYN to generate the specified number of samples for the minority class
    sampling_strategy = {minority_class: n_samples_to_generate}

    # Create the ADASYN instance and fit it to the data
    adasyn = ADASYN(sampling_strategy=sampling_strategy, n_neighbors=5, random_state=42)
    X_resampled, y_resampled = adasyn.fit_resample(X, y)

    return X_resampled, y_resampled

In [24]:
def undersample_with_nearmiss_imbalanced(X, y, majority_samples=20000, minority_samples=10000):
    """
    Undersample the majority and minority classes separately using NearMiss.
    
    Parameters:
    - X: Feature matrix
    - y: Target vector
    - majority_samples: Desired number of samples for the majority class after undersampling
    - minority_samples: Desired number of samples for the minority class after undersampling
    
    Returns:
    - X_resampled, y_resampled: Feature matrix and target vector after resampling
    """
    # Identify majority and minority classes
    classes, counts = np.unique(y, return_counts=True)
    majority_class = classes[np.argmax(counts)]
    minority_class = classes[np.argmin(counts)]
    
    # Set the sampling strategy for NearMiss
    # This will balance the majority class with the minority class
    sampling_strategy = {majority_class: majority_samples, minority_class: minority_samples}
    nm = NearMiss(sampling_strategy=sampling_strategy)
    
    # Perform the undersampling
    X_resampled, y_resampled = nm.fit_resample(X, y)
    
    return X_resampled, y_resampled

# Perform undersampling with the specified numbers of samples
X_resampled, y_resampled = undersample_with_nearmiss_imbalanced(X_train, y_train, majority_samples=20000, minority_samples=10000)

In [37]:
X_train.shape

(30000, 51)

In [39]:
y_train.shape

(30000, 1)

In [38]:
X_resampled.shape

(303887, 51)

In [67]:
y_resampled.value_counts()

TARGET
0         226047
1          77840
Name: count, dtype: int64

In [66]:
X_resampled,y_resampled = oversample_with_adasyn(X_train,y_train)



In [26]:
X_train,y_train = undersample_with_nearmiss_imbalanced(X_resampled,y_resampled)

In [97]:
X_train.to_parquet('X_train_engineered.parquet',index=False)
X_test.to_parquet('X_test_engineered.parquet',index=False)
y_train.to_parquet('y_train_engineered.parquet',index=False)
y_test.to_parquet('y_test_engineered.parquet',index=False)

In [98]:
scaler = StandardScaler()
scaler.fit(X_train)

In [99]:
X_train_scaled = scaler.transform(X_train)
X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)

In [100]:
X_test_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

In [101]:
# Starting Logistic Regression
logr = LogisticRegression()

#Fitting Logistic Regression
logr.fit(X_train,y_train)

#Printing errors
print('Accuracy of training is '+ str(accuracy_score(y_train,logr.predict(X_train))))b
print('Precision of training is '+ str(precision_score(y_train,logr.predict(X_train))))
print('Recall of training is '+ str(recall_score(y_train,logr.predict(X_train))))
print('F1 of training is '+ str(f1_score(y_train,logr.predict(X_train))))
print('ROC AUC of training is '+ str(roc_auc_score(y_train,logr.predict(X_train))))
print()
print('Accuracy of test is '+ str(accuracy_score(y_test,logr.predict(X_test))))
print('Precision of test is '+ str(precision_score(y_test,logr.predict(X_test))))
print('Recall of test is '+ str(recall_score(y_test,logr.predict(X_test))))
print('F1 of test is '+ str(f1_score(y_test,logr.predict(X_test))))
print('ROC AUC of test is '+ str(roc_auc_score(y_test,logr.predict(X_test))))

SyntaxError: invalid syntax (244354690.py, line 8)

In [102]:
positive_class_probabilities = logr.predict_proba(X_test)[:, 1]

# Create a DataFrame to represent the predicted probabilities
probabilities_df = pd.DataFrame({
    'Actual Class': y_test.values.flatten(),  # Actual class (0 or 1)
    'Predicted Probability (Class 1)': positive_class_probabilities  # Predicted probability for class 1
})

# Display the first 10 rows of the DataFrame
print("Predicted Probabilities for the Positive Class:")
print(probabilities_df.head(10))

Predicted Probabilities for the Positive Class:
   Actual Class  Predicted Probability (Class 1)
0             0                         0.999979
1             0                         0.384625
2             0                         0.999985
3             0                         0.105336
4             0                         0.072132
5             0                         0.999997
6             0                         0.441318
7             0                         0.298207
8             0                         0.405350
9             0                         0.294848


In [108]:
negative_class_probabilities = logr.predict_proba(X_test)[:, 0]

# Create a DataFrame to represent the predicted probabilities
probabilities_df = pd.DataFrame({
    'Actual Class': y_test.values.flatten(),  # Actual class (0 or 1)
    'Predicted Probability (Class 0)': negative_class_probabilities  # Predicted probability for class 0
})

# Display the first 10 rows of the DataFrame
print("Predicted Probabilities for the Negative Class:")
print(probabilities_df.head())

Predicted Probabilities for the Negative Class:
   Actual Class  Predicted Probability (Class 0)
0             0                         0.000021
1             0                         0.615375
2             0                         0.000015
3             0                         0.894664
4             0                         0.927868


In [None]:
Accuracy of training is 0.7359333333333333
Precision of training is 0.7484980660027981
Recall of training is 0.9095
F1 of training is 0.8211818879508825
ROC AUC of training is 0.64915

Accuracy of test is 0.6085556802107215
Precision of test is 0.08430345782663262
Recall of test is 0.4004934210526316
F1 of test is 0.13928711880161596
ROC AUC of test is 0.5134584550839527

In [104]:
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

for threshold in thresholds:
    # Classify based on the adjusted threshold
    y_pred_threshold = (positive_class_probabilities >= threshold).astype(int)
    
    # Calculate metrics
    precision = precision_score(y_test, y_pred_threshold)
    recall = recall_score(y_test, y_pred_threshold)
    f1 = f1_score(y_test, y_pred_threshold)
    
    # Print metrics for each threshold
    print(f"Threshold: {threshold}, Precision: {precision}, Recall: {recall}, F1-score: {f1}")

Threshold: 0.3, Precision: 0.07670182166826463, Recall: 0.2138157894736842, F1-score: 0.11290235032296586
Threshold: 0.4, Precision: 0.08291072060384479, Recall: 0.14453125, F1-score: 0.10537360413700067
Threshold: 0.5, Precision: 0.08953507474946608, Recall: 0.11204769736842106, F1-score: 0.09953428910601772
Threshold: 0.6, Precision: 0.09212827988338193, Recall: 0.09745065789473684, F1-score: 0.09471475671895294
Threshold: 0.7, Precision: 0.09390444810543658, Recall: 0.09375, F1-score: 0.09382716049382717


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear', 'saga']  # Algorithm to use in optimization
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='recall')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate best model
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
train_precision = precision_score(y_train, y_pred_train)
train_recall = recall_score(y_train, y_pred_train)
train_f1 = f1_score(y_train, y_pred_train)
train_roc_auc = roc_auc_score(y_train, y_pred_train)

test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)
test_roc_auc = roc_auc_score(y_test, y_pred_test)

print("Train Metrics:")
print("Accuracy:", train_accuracy)
print("Precision:", train_precision)
print("Recall:", train_recall)
print("F1 Score:", train_f1)
print("ROC AUC Score:", train_roc_auc)

print("\nTest Metrics:")
print("Accuracy:", test_accuracy)
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1 Score:", test_f1)
print("ROC AUC Score:", test_roc_auc)