In [1]:
# Santander Customer Transaction Prediction

In [None]:
# Loading Libraries
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
# import eli5

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from pdpbox import pdp, info_plots
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, roc_curve, auc

# Seed NumPy's global random number generator with a fixed value.
random_state = 42
np.random.seed(random_state)
import warnings
# Silence every warning from here on.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# libraries used below — confirm that suppressing all of them is intended.
warnings.filterwarnings('ignore')

In [None]:
# set working directory
# The data folder can be overridden with the SANTANDER_DATA_DIR environment variable;
# when the variable is not set, the original hard-coded location is used.
data_dir = os.environ.get("SANTANDER_DATA_DIR", "E:/JUPYTER/Data Science/Project 5")
os.chdir(data_dir)

In [None]:
# Display the current working directory.
os.getcwd()

In [None]:
# Lift the pandas column-display limit so wide frames are shown in full,
# then read the training set from the working directory.
pd.set_option("display.max_columns", None)
df_train = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# First rows of the training data, as a quick look at its columns and values.
df_train.head()

In [None]:
# (number of rows, number of columns) of the training data.
df_train.shape

In [None]:
# describe the data: summary statistics of the training columns
df_train.describe()

In [None]:
# Number of rows in each target class
target_class = df_train['target'].value_counts()
print('COUNT OF THE TARGET CLASS :\n', target_class)

# Share of each target class, as a percentage of all rows
per_target_class = target_class.div(len(df_train)).mul(100)
print('PERCENTAGE OF THE TARGET CLASS COUNT :\n',per_target_class)

In [None]:
# count plot & violin plot for target class
# Left panel: number of rows per class.
# Right panel: where each class sits along the row index, as a violin with the
# individual points drawn on top.
target_values = df_train['target'].values
row_index = df_train.index.values

fig, ax = plt.subplots(1, 2, figsize = (20,5))
# Data is passed by keyword so it is bound to the x variable explicitly.
sns.countplot(x = target_values, ax = ax[0], palette = 'spring')
sns.violinplot(x = target_values, y = row_index, ax = ax[1], palette = 'spring')
sns.stripplot(x = target_values, y = row_index, jitter = True, color = 'black', linewidth = 0.5, size = 0.5, ax = ax[1], palette = 'spring')
ax[0].set_xlabel('Target')
ax[1].set_xlabel('Target')
ax[1].set_ylabel('Index')

In [None]:
# Observation
# We have imbalanced data: about 90% of the rows are customers who will not make a transaction (target = 0) and about 10% are customers who will make a transaction (target = 1).
# From the violin plots, there seems to be no relation between the target and the index of the dataframe; the data is dominated by zeros compared to ones.
# From the jitter plots drawn over the violin plots, we can observe that the target looks uniformly distributed over the indexes of the dataframe.

In [None]:
# Distribution of the train attributes

In [None]:
%%time

# Distribution of the train attributes

def plot_train_attribute_distribution(t0, t1, label1, label2, train_attributes):
    """Draw, for every attribute, the density of two data subsets on a shared panel.

    The panels are laid out on a grid that is 10 columns wide, with as many
    rows as the number of attributes requires.

    Parameters
    ----------
    t0, t1 : frames indexable by attribute name (``t0[attribute]``)
        The two subsets whose distributions are compared.
    label1, label2 : str
        Legend labels for ``t0`` and ``t1`` respectively.
    train_attributes : sequence of str
        Names of the attributes to plot, one panel each.
    """
    sns.set_style('darkgrid')

    n_cols = 10
    n_rows = max(1, -(-len(train_attributes) // n_cols))  # ceiling division
    # One figure holding the whole grid; squeeze=False keeps `axes` two-dimensional
    # even when a single row is needed.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(22,18), squeeze=False)
    panels = axes.ravel()

    for ax, attribute in zip(panels, train_attributes):
        sns.kdeplot(t0[attribute], label = label1, ax = ax)
        sns.kdeplot(t1[attribute], label = label2, ax = ax)
        ax.legend()
        # Label the panel with the attribute it shows.
        ax.set_xlabel(attribute)

    # Hide the grid cells that did not receive an attribute.
    for ax in panels[len(train_attributes):]:
        ax.set_visible(False)

    # Same style state as before on exit: figures created afterwards use "ticks".
    sns.set_style("ticks", {"xticks.major.size" : 8, "yticks.major.size" : 8})
    fig.tight_layout()
    plt.show()

In [None]:
# observing first 100 train attributes

In [None]:
%%time
# Rows of the negative class (target == 0)
t0 = df_train.loc[df_train['target'] == 0]

# Rows of the positive class (target == 1)
t1 = df_train.loc[df_train['target'] == 1]

# First hundred attributes: column positions 2 to 101
train_attributes = df_train.columns.values[2:102]

# Per-class density of every selected attribute
plot_train_attribute_distribution(t0, t1, label1 = '0', label2 = '1', train_attributes = train_attributes)

In [None]:
# plotting next 100 train attributes

# Column positions 102 to 201, i.e. the attributes after the first hundred
next_hundred = slice(102, 202)
train_attributes = df_train.columns.values[next_hundred]

# Per-class density of every selected attribute
plot_train_attribute_distribution(t0, t1, label1 = '0', label2 = '1', train_attributes = train_attributes)

In [None]:
# Observation: a considerable number of features have a significantly different distribution for the two classes, e.g. var_0, var_1, var_6, var_109.
# There is also a considerable number of features that have the same distribution for both classes, e.g. var_101, var_4, var_5.

In [None]:
# importing the test dataset

In [None]:
# Read the test set from the working directory.
df_test = pd.read_csv("test.csv")

In [None]:
# First rows of the test data, as a quick look at its columns and values.
df_test.head()

In [None]:
# (number of rows, number of columns) of the test data.
df_test.shape

In [None]:
%%time

# Distribution of test attributes

def plot_test_attribute_distribution(test_attributes):
    """Draw the density of every given attribute of ``df_test``, one panel each.

    The panels are laid out on a grid that is 10 columns wide, with as many
    rows as the number of attributes requires.

    Parameters
    ----------
    test_attributes : sequence of str
        Names of the ``df_test`` columns to plot.
    """
    sns.set_style('darkgrid')

    n_cols = 10
    n_rows = max(1, -(-len(test_attributes) // n_cols))  # ceiling division
    # One figure holding the whole grid; squeeze=False keeps `axes` two-dimensional
    # even when a single row is needed.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(22,18), squeeze=False)
    panels = axes.ravel()

    for ax, attribute in zip(panels, test_attributes):
        sns.kdeplot(df_test[attribute], ax = ax)
        # Label the panel with the attribute it shows.
        ax.set_xlabel(attribute)

    # Hide the grid cells that did not receive an attribute.
    for ax in panels[len(test_attributes):]:
        ax.set_visible(False)

    # Same style state as before on exit: figures created afterwards use "ticks".
    sns.set_style("ticks", {"xticks.major.size" : 8, "yticks.major.size" : 8})
    fig.tight_layout()
    plt.show()

In [None]:
# observing first 100 test attributes

In [None]:
# First hundred test attributes: column positions 1 to 100
first_hundred = slice(1, 101)
test_attributes = df_test.columns.values[first_hundred]

# Density of every selected test attribute
plot_test_attribute_distribution(test_attributes = test_attributes)

In [None]:
# plotting next 100 test attributes

# test attributes from 101 to 201: the columns right after the first hundred,
# which ended at position 100
test_attributes = df_test.columns.values[101:201]

# plot distribution of the test attribute
plot_test_attribute_distribution(test_attributes)

In [None]:
# Distribution of mean values per rows and columns

In [None]:
%%time
# Mean of the attributes, per column and per row, compared between train and test

# train attributes
train_attributes = df_train.columns.values[2:202]

# test attributes
test_attributes = df_test.columns.values[1:201]

train_features = df_train[train_attributes]
test_features = df_test[test_attributes]

# axis 0 gives one mean per column, axis 1 one mean per row; one figure for each
for axis, unit in ((0, 'Column'), (1, 'Row')):
    plt.figure(figsize=(16,8))
    sns.distplot(train_features.mean(axis = axis), color = 'red', kde = True, bins = 150, label = 'train')
    sns.distplot(test_features.mean(axis = axis), color = 'blue', kde = True, bins = 150, label = 'test')
    plt.title(f'Distribution of Mean Values per {unit} in Train and Test Dataset')
    plt.legend()
    plt.show()

In [None]:
# Distribution of Standard Deviation Values per rows and columns

In [None]:
%%time
# Standard deviation of the attributes, per column and per row, compared between train and test

# train attributes
train_attributes = df_train.columns.values[2:202]

# test attributes
test_attributes = df_test.columns.values[1:201]

train_features = df_train[train_attributes]
test_features = df_test[test_attributes]

# axis 0 gives one value per column, axis 1 one value per row; one figure for each
for axis, unit in ((0, 'Column'), (1, 'Row')):
    plt.figure(figsize=(16,8))
    sns.distplot(train_features.std(axis = axis), color = 'red', kde = True, bins = 150, label = 'train')
    sns.distplot(test_features.std(axis = axis), color = 'blue', kde = True, bins = 150, label = 'test')
    plt.title(f'Distribution of Standard Deviation Values per {unit} in Train and Test Dataset')
    plt.legend()
    plt.show()


In [None]:
# Distribution of Kurtosis Values per Rows and Columns

In [None]:
%%time
# Kurtosis of the attributes, per column and per row, compared between train and test

# train attributes
train_attributes = df_train.columns.values[2:202]

# test attributes
test_attributes = df_test.columns.values[1:201]

train_features = df_train[train_attributes]
test_features = df_test[test_attributes]

# axis 0 gives one value per column, axis 1 one value per row; one figure for each
for axis, unit in ((0, 'Column'), (1, 'Row')):
    plt.figure(figsize=(16,8))
    sns.distplot(train_features.kurtosis(axis = axis), color = 'red', kde = True, bins = 150, label = 'train')
    sns.distplot(test_features.kurtosis(axis = axis), color = 'blue', kde = True, bins = 150, label = 'test')
    plt.title(f'Distribution of Kurtosis Values per {unit} in Train and Test Dataset')
    plt.legend()
    plt.show()


In [None]:
# missing value analysis
# Total number of missing cells in the train and in the test dataset
train_missing = df_train.isna().sum().sum()
test_missing = df_test.isna().sum().sum()

print("MISSING VALUES IN TRAIN DATASET : ",train_missing)
print("MISSING VALUES IN TEST DATASET : ",test_missing)

In [None]:
# Observation - No missing value is present in both train and test

In [None]:
# correlation between attributes

In [None]:
%%time
# Absolute pairwise correlations between the train attributes, in long form
train_attributes = df_train.columns.values[2:202]
abs_corr = df_train[train_attributes].corr().abs()
corr_pairs = abs_corr.unstack().sort_values(kind = 'quicksort').reset_index()
# Drop the pairs that match an attribute with itself
is_self_pair = corr_pairs['level_0'] == corr_pairs['level_1']
train_correlation = corr_pairs[~is_self_pair]
# Weakest and strongest correlations
print(train_correlation.head(10))
print(train_correlation.tail(10))

In [None]:
%%time
# correlation in test attribute
# Test attributes occupy column positions 1 to 200, the same slice used for the
# other test-set statistics in this notebook; starting at 2 would skip the first one.
test_attributes = df_test.columns.values[1:201]
test_correlation = df_test[test_attributes].corr().abs().unstack().sort_values(kind = 'quicksort').reset_index()
# Drop the pairs that match an attribute with itself
test_correlation = test_correlation[test_correlation['level_0'] != test_correlation['level_1']]
# Weakest and strongest correlations
print(test_correlation.head(10))
print(test_correlation.tail(10))

In [None]:
# Observation - the correlations among the train attributes and among the test attributes are very small

In [None]:
# correlation plot for train and test dataset

In [None]:
# Attribute columns of each frame, selected here so the cell does not depend on
# whichever slice an earlier cell last stored in these names.
train_attributes = df_train.columns.values[2:202]
test_attributes = df_test.columns.values[1:201]


def _off_diagonal_values(corr_frame):
    """Return every entry of a square correlation frame except its diagonal, as a 1-D array."""
    # Masking by position is exact; comparing the values with 1 depends on the
    # diagonal being computed as exactly 1.0.
    keep = ~np.eye(len(corr_frame), dtype = bool)
    return corr_frame.values[keep]


train_correlation = _off_diagonal_values(df_train[train_attributes].corr())
test_correlation = _off_diagonal_values(df_test[test_attributes].corr())

plt.figure(figsize = (20, 5))
sns.distplot(train_correlation, color = 'blue', label = 'train')
sns.distplot(test_correlation, color = 'red', label = 'test')
plt.xlabel("CORRELATION VALUES FOUND IN TRAIN AND TEST DATA")
plt.ylabel("DENSITY")
plt.title("CORRELATION VALUES IN TRAIN AND TEST DATA")
plt.legend()

In [None]:
# correlation values found in train and test data are very small, its completely visible from the graph

In [None]:
# Feature engineering - performing feature engineering by using permutations importance - partial dependence plot

In [None]:
# Training features: the train frame without its identifier and label columns
x = df_train.drop(columns = ['ID_code', 'target'])
# Training labels
y = df_train['target']
# Test features: the test frame without its identifier column
test = df_test.drop(columns = ['ID_code'])

In [None]:
# build a simple model to find the features which are more important

In [None]:
# split the train data
# train_test_split returns, for each input array, its train part followed by its
# test part, so for (x, y) the order is: x_train, x_test, y_train, y_test.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)

In [None]:
# random forest classifier

In [None]:
# Keep only the first len(y_train) rows of x_train, then print the resulting shape.
# NOTE(review): features and labels produced by one train_test_split call have the same
# number of rows, so a length mismatch here would point at the split outputs being
# unpacked in the wrong order — confirm whether this truncation is needed at all.
x_train = x_train[:len(y_train)]
print(x_train.shape)

In [None]:
# Look at the labels before fitting: a sample of values and the dtype information.
print(y_train[:10])  # Inspect first 10 values
print(y_train.dtypes)  # Check the data type

In [None]:
# Map every value of y_train to 1 when it is greater than the threshold, otherwise 0.
# NOTE(review): if y_train already holds 0/1 class labels this leaves it unchanged;
# if it holds anything else it is not the label column — confirm what y_train contains.
threshold = 0.5  # Example threshold value
y_train = (y_train > threshold).astype(int)  # Convert to binary classes (0 or 1)

In [None]:
%%time
# Small random forest (10 trees, fixed seed) used as the simple model for ranking features.
rf_model = RandomForestClassifier(n_estimators = 10, random_state = 42)

# fitting the model
rf_model.fit(x_train, y_train)

In [None]:
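# Calculating weights and observing some important features using permutation importance, here computed with scikit-learn's permutation_importance. The eli5 library, a Python library that helps to debug machine learning classifiers and explain their predictions, offers the same technique.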

In [None]:
from sklearn.inspection import permutation_importance

# Number of top-ranked features to draw; a single axis cannot show every feature legibly.
TOP_N_FEATURES = 30

# Compute permutation importance of the random forest fitted in the previous cell.
# random_state fixes the shuffles so the ranking is reproducible.
result = permutation_importance(rf_model, x_train, y_train, scoring='accuracy', random_state=42)

# Visualize results: argsort is ascending, so the last entries are the most important
sorted_idx = result.importances_mean.argsort()[-TOP_N_FEATURES:]
plt.figure(figsize=(10, 10))
plt.barh(range(len(sorted_idx)), result.importances_mean[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), [x_train.columns[i] for i in sorted_idx])
plt.xlabel('Permutation Importance')
plt.title('Feature Importance via Permutation')
plt.show()


In [None]:
# Observation - the chart orders the features by their importance. Features with a higher positive importance have a higher impact on the prediction, and features with an importance of zero have no impact on it.
# Partial dependence plots - a PDP gives a graphical depiction of the marginal effect of a variable on the class probability or classification. It shows how a feature affects the predictions.

In [None]:
# handling of imbalanced data - multiple approaches can be used for dealing with it.
# 1 change of performance metric
# 2 oversample minority class
# 3 undersample majority class
# 4 SMOTE(synthetic minority oversampling techniques)
# 5 change of algorithm

# logistic regression model

In [None]:
# Splitting the data with a stratified k-fold cross validator
# Training features and labels
X = df_train.drop(['ID_code', 'target'], axis = 1)
Y = df_train['target']

# Five shuffled, stratified folds. Every split is generated, but only the indices of
# the final one are kept and used for the train/validation partition.
skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
*_, (train_index, valid_index) = skf.split(X, Y)
x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
y_train, y_valid = Y.iloc[train_index], Y.iloc[valid_index]

for name, part in (("x_train", x_train), ("x_valid", x_valid), ("y_train", y_train), ("y_valid", y_valid)):
    print(f"SHAPE OF {name} : ", part.shape)

In [None]:
%%time
# Logistic regression with a fixed seed, fitted on the training partition
lr_model = LogisticRegression(random_state = 42)
lr_model.fit(X = x_train, y = y_train)

In [None]:
# Accuracy of the fitted logistic regression, measured on the data it was trained on
lr_score = lr_model.score(X = x_train, y = y_train)
print("ACCURACY OF THE lr_model : ", lr_score)

In [None]:
%%time
# 5-fold cross-validated class predictions of the logistic regression on the validation partition
cv_predict = cross_val_predict(estimator = lr_model, X = x_valid, y = y_valid, cv = 5)
# Score of each of the 5 folds on the same data
cv_score = cross_val_score(estimator = lr_model, X = x_valid, y = y_valid, cv = 5)
print("CROSS VALIDATION SCORE : ", cv_score)

In [None]:
# How many times each class was predicted, as {class: count}
unique, counts = np.unique(cv_predict, return_counts = True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# we wont be using accuracy as the performance metric because we cant apply it on an imbalanced dataset

In [None]:
# confusion matrix

In [None]:
# Cross-tabulation of the true labels (rows) against the cross-validated predictions (columns)
cm = pd.crosstab(y_valid, cv_predict, rownames = ['actual'], colnames = ['predicted'])
cm

In [None]:
# Observation - on comparing the roc_auc_score and the model accuracy, the model is not performing well on the imbalanced data

In [None]:
# ROC analysis needs a continuous score rather than hard 0/1 labels: with labels the
# curve has a single operating point. Score the validation partition with the
# positive-class probability of the fitted logistic regression.
valid_scores = lr_model.predict_proba(x_valid)[:, 1]

# Compute ROC curve and AUC
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_valid, valid_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)

# Plot ROC curve
plt.figure()
plt.title('RECEIVER OPERATING CHARACTERISTIC (ROC)')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='ROC (area = %0.3f)' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # Dashed diagonal line
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('RECALL (TRUE POSITIVE RATE)')
plt.xlabel('FALSE POSITIVE RATE')
plt.show()

# Print AUC
print('AUC :', roc_auc)


In [None]:
# Classification report of the cross-validated predictions against the true validation labels
classification_scores = classification_report(y_true = y_valid, y_pred = cv_predict)
print(classification_scores)

In [None]:
# Observation - the f1 score is high for the customers who will not make a transaction compared to those who will make a transaction, so we are going to change the algorithm.

In [None]:
# Test features: the test frame without its identifier column
x_test = df_test.drop(columns = 'ID_code')

# Class predictions of the logistic regression for the test features
lr_pred = lr_model.predict(X = x_test)

# Show the predictions
print(lr_pred)


In [None]:
# type markdown and latex a2

In [None]:
# oversample minority class
# Oversample Minority Class

# • Adding more copies of minority class

# It can be a good option we don't have that much large data to work

# • Drawback of this process is that we are adding information which may lead to overfitting or poor performance on test data.
# Undersample Majority Class

# • Removing some copies of majority class

# • It can be a very good option if we have very large amount of data say in millions to work.
# Drawback of this process is we are removing some valuable information, this can lead to underfitting and poor perfromance on test data.

# Given the drawbacks of both approaches, we will use SMOTE (Synthetic Minority Oversampling Technique), which is a better option than the two above.

# SMOTE (Synthetic Minority Oversampling Technique) - This is a statistical technique for increasing the number of cases in your dataset in a balanced way.

# It uses a nearest neighbours algorithm to generate new, synthetic data to use for training the model

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed and sampling_strategy = 1.0
sm = SMOTE(sampling_strategy=1.0, random_state=42)

# Resample the training partition and, separately, the validation partition.
# NOTE(review): the validation partition is resampled as well, so anything measured on
# x_smote_v / y_smote_v is measured on partly synthetic data — confirm this is intended.
x_smote, y_smote = sm.fit_resample(x_train, y_train)
x_smote_v, y_smote_v = sm.fit_resample(x_valid, y_valid)

resampled_parts = (("x_smote", x_smote), ("y_smote", y_smote), ("x_smote_v", x_smote_v), ("y_smote_v", y_smote_v))
for name, part in resampled_parts:
    print(f"Shape of {name}:", part.shape)


In [None]:
# building logistic regression model on synthetic datapoints

In [None]:
%%time
# Logistic regression with a fixed seed, fitted on the SMOTE-resampled training data
smote = LogisticRegression(random_state = 42)
smote.fit(X = x_smote, y = y_smote)

In [None]:
# Builds a new, unfitted LogisticRegression; as the last expression of the cell it is only displayed.
# NOTE(review): the object is not assigned to any name — this cell looks like a leftover; confirm it can be removed.
LogisticRegression(random_state = 42)

In [None]:
# Accuracy of the SMOTE logistic regression, measured on the resampled data it was trained on
smote_score = smote.score(X = x_smote, y = y_smote)
print("ACCURACY OF THE SMOTE_MODEL : ", smote_score)

In [None]:
%%time
# 5-fold cross-validated class predictions of the SMOTE model on the resampled validation data
cv_pred = cross_val_predict(estimator = smote, X = x_smote_v, y = y_smote_v, cv = 5)
# Score of each of the 5 folds on the same data
cv_score = cross_val_score(estimator = smote, X = x_smote_v, y = y_smote_v, cv = 5)
print("CROSS VALIDATION SCORE : ", cv_score)

In [None]:
%%time
# confusion matrix
# Cross-tabulation of the true labels (rows) against the cross-validated predictions (columns)
cm = pd.crosstab(y_smote_v, cv_pred, rownames = ['actual'], colnames = ['predicted'])
cm

In [None]:
# roc auc score
# The AUC is computed from the positive-class probability of the fitted SMOTE model;
# hard 0/1 predictions would reduce the ROC curve to a single operating point.
smote_valid_scores = smote.predict_proba(x_smote_v)[:, 1]
roc_score = roc_auc_score(y_smote_v, smote_valid_scores)
print("ROC SCORE : ", roc_score)

In [None]:
# ROC analysis needs a continuous score rather than hard 0/1 labels: with labels the
# curve has a single operating point. Use the positive-class probability of the
# fitted SMOTE model on the resampled validation data.
smote_valid_scores = smote.predict_proba(x_smote_v)[:, 1]

# Compute ROC curve and AUC
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_smote_v, smote_valid_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)

# Plot ROC curve
plt.figure()
plt.title('RECEIVER OPERATING CHARACTERISTIC (ROC)')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='ROC (area = %0.3f)' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # Dashed diagonal line
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('RECALL (TRUE POSITIVE RATE)')
plt.xlabel('FALSE POSITIVE RATE')
plt.show()

# Print AUC
print('AUC:', roc_auc)


In [None]:
# classification report

In [None]:
# Classification report of the SMOTE model's cross-validated predictions against the resampled validation labels
scores = classification_report(y_true = y_smote_v, y_pred = cv_pred)
print(scores)

In [None]:
# We can observe that the f1 score is high for the customers who did not make the transaction as well as for the customers who made the transaction

In [None]:
# model performance on the test data

In [None]:
%%time
# Class predictions of the SMOTE logistic regression for the test features
# (the test frame without its identifier column)
x_test = df_test.drop(columns = ['ID_code'])
smote_pred = smote.predict(X = x_test)
print(smote_pred)

In [None]:
# observation - we can observe that smote model is performing better than simple logistic regression

In [None]:
# LightGBM - it is a gradient boosting framework that uses tree based learning algorithms

In [None]:
# training data
lgb_train = lgb.Dataset(x_train, label = y_train)
# validation data; a validation Dataset takes the training Dataset as its reference
lgb_valid = lgb.Dataset(x_valid, label = y_valid, reference = lgb_train)

In [None]:
# selecting the hyperparameter by tuning of different parameters
# Every setting is given once, under a name LightGBM recognises. Where an alias and
# its primary name were both present, the value stored under the primary name
# (bagging_fraction, bagging_freq, feature_fraction) is the one kept.
params = {
    'boosting_type' : 'gbdt',
    'max_depth' : -1,
    'objective' : 'binary',
    'boost_from_average' : False,
    'nthread' : 20,
    'metric' : 'auc',
    'num_leaves' : 50,
    'learning_rate' : 0.01,
    'max_bin' : 100,
    # NOTE(review): 100 rows for building the histogram bins is far below LightGBM's
    # documented default (200000) — confirm this value.
    'subsample_for_bin' : 100,
    'bagging_fraction' : 0.5,
    'bagging_freq' : 5,
    'feature_fraction' : 0.08,
    'min_split_gain' : 0.45,
    'min_child_weight' : 1,
    'min_child_samples' : 5,
    # Spelled `is_unbalance`; `is_unbalanced` is not a recognised name.
    'is_unbalance' : True,
}

In [None]:
# Prepare the datasets; the validation set takes the training set as its reference
lgb_train = lgb.Dataset(x_train, y_train)
lgb_valid = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'seed': 42
}

# Train the model with early stopping
num_rounds = 10000
lgbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=num_rounds,
    valid_sets=[lgb_train, lgb_valid],  # Specify training and validation datasets
    callbacks=[
        lgb.early_stopping(stopping_rounds=5000),  # Early stopping
        lgb.log_evaluation(period=1000)  # Log evaluation metrics every 1000 rounds
    ]
)

# Report the outcome of training: the best iteration and the scores recorded for it
print("BEST ITERATION : ", lgbm.best_iteration)
print("BEST SCORE : ", dict(lgbm.best_score))


In [None]:
# Test features: the test frame without its identifier column
x_test = df_test.drop(columns = ['ID_code'])

# Probability predictions of the trained booster for every test row
lgbm_predict_prob = lgbm.predict(x_test)

# Binary predictions: 1 where the probability is above 0.5, otherwise 0
lgbm_predict = (lgbm_predict_prob > 0.5).astype(int)

print(lgbm_predict_prob)
print(lgbm_predict)


In [None]:
# plotting the importance of features

In [None]:
# Importance chart of the trained booster, limited to 50 features and using the "split" importance type.
lgb.plot_importance(lgbm, max_num_features = 50, importance_type = "split", figsize = (20,50))

In [None]:
# Final submission: one row per test identifier with the LightGBM probability and
# the binary prediction, written to SUBMISSION.csv without the index column
df_sub = pd.DataFrame({
    'ID_code' : df_test['ID_code'].values,
    'lgbm_predict_prob' : lgbm_predict_prob,
    'lgbm_predict' : lgbm_predict,
})
df_sub.to_csv('SUBMISSION.csv', index = False)
df_sub.head()