In [3]:
import pandas as pd
df = pd.read_csv('train_transaction.csv')

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# List of columns to apply label encoding
columns_to_encode = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
                     'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

# Apply label encoding for each column
for column in columns_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].astype(str))

# Calculate the number of missing values per column
missing_values_per_column = df.isnull().sum()

# Calculate the median of missing values
median_missing_values = missing_values_per_column.median()

# Exclude columns with missing values surpassing the median
columns_to_exclude = missing_values_per_column[missing_values_per_column > median_missing_values].index
df_cleaned = df.drop(columns=columns_to_exclude)

# Impute missing values using median of respective columns
df_cleaned.fillna(df_cleaned.median(), inplace=True)

fraud_proportion = df['isFraud'].mean()

print("Proportion of fraud cases in the dataset:", fraud_proportion)

# Now df_cleaned contains the preprocessed dataset with label encoded categorical columns and missing values imputed


Proportion of fraud cases in the dataset: 0.03499000914417313


In [5]:
df_cleaned.head

<bound method NDFrame.head of         TransactionID  isFraud  TransactionDT  TransactionAmt  ProductCD  \
0             2987000        0          86400           68.50          4   
1             2987001        0          86401           29.00          4   
2             2987002        0          86469           59.00          4   
3             2987003        0          86499           50.00          4   
4             2987004        0          86506           50.00          1   
...               ...      ...            ...             ...        ...   
590535        3577535        0       15811047           49.00          4   
590536        3577536        0       15811049           39.50          4   
590537        3577537        0       15811079           30.95          4   
590538        3577538        0       15811088          117.00          4   
590539        3577539        0       15811131          279.95          4   

        card1  card2  card3  card4  card5  ...   V312    

In [6]:
x=df_cleaned.drop(columns=['isFraud'])
y=df_cleaned['isFraud']

In [7]:
print(len(df_cleaned))
print(len(x),len(y))

590540
590540 590540


In [8]:
from imblearn.over_sampling import SMOTE

In [9]:
smote=SMOTE(sampling_strategy='minority')
x_sm,y_sm=smote.fit_resample(x,y)

In [10]:
y.value_counts()

isFraud
0    569877
1     20663
Name: count, dtype: int64

In [11]:
y_sm.value_counts()

isFraud
0    569877
1    569877
Name: count, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x_sm,y_sm,test_size=.2,stratify=y_sm)

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=-1, n_neighbors=5, p=2, weights='uniform')
knn.fit(xtrain,ytrain)
ypre_knn=knn.predict(xtest)
print("Knn report\n",classification_report(ytest,ypre_knn,digits=6))

report
               precision    recall  f1-score   support

           0   0.951557  0.802773  0.870856    113975
           1   0.829442  0.959132  0.889585    113976

    accuracy                       0.880952    227951
   macro avg   0.890500  0.880952  0.880220    227951
weighted avg   0.890499  0.880952  0.880220    227951



In [14]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report

lda = LinearDiscriminantAnalysis(covariance_estimator=None, n_components=None, priors=None, shrinkage=None, solver='svd', store_covariance=False, tol=0.0001)
lda.fit(xtrain, ytrain)
ypre_lda = lda.predict(xtest)
print("lda report\n", classification_report(ytest, ypre_lda))


lda report
               precision    recall  f1-score   support

           0       0.82      0.85      0.84    113975
           1       0.84      0.82      0.83    113976

    accuracy                           0.83    227951
   macro avg       0.83      0.83      0.83    227951
weighted avg       0.83      0.83      0.83    227951



In [15]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(xtrain,ytrain)
ypre_lr=lr.predict(xtest)

mse = mean_squared_error(ytest, ypre_lr)
mae = mean_absolute_error(ytest, ypre_lr)


print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 0.1302631217265497
Mean Absolute Error (MAE): 0.29476044818808395


In [16]:
mvl=sum(ypre_lr)/len(ypre_lr)
por=[0]*len(ytest)
for i in range(0,len(ypre_lr)):
    if(ypre_knn[i]==0 or ypre_lda[i]==0):
        if(ypre_lr[i]<mvl):
            por[i]=0
    elif(ypre_knn[i]==1 or ypre_lda[i]==1):
        if(ypre_lr[i]>mvl):
            por[i]=1
    else:
        por[i]=ypre_knn[i]

In [17]:
acc=accuracy_score(ytest,por)
pcc=precision_score(ytest,por)
ff=f1_score(ytest,por)
re=recall_score(ytest,por)

In [18]:
print("acc : ",acc)
print("pre : ",pcc)
print("f1  : ",ff)
print("re  : ",re)

acc :  0.8771578102311461
pre :  0.9603152506799735
f1  :  0.8649607932022261
re  :  0.7868323155752088


In [19]:
print(classification_report(ytest,por))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89    113975
           1       0.96      0.79      0.86    113976

    accuracy                           0.88    227951
   macro avg       0.89      0.88      0.88    227951
weighted avg       0.89      0.88      0.88    227951



In [20]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(xtrain,ytrain)
ydt=clf.predict(xtest)

print("DT report\n", classification_report(ytest, ydt))

DT report
               precision    recall  f1-score   support

           0       0.98      0.98      0.98    113975
           1       0.98      0.98      0.98    113976

    accuracy                           0.98    227951
   macro avg       0.98      0.98      0.98    227951
weighted avg       0.98      0.98      0.98    227951



In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(xtrain, ytrain)
yrf=clf.predict(xtest)

print("RT report\n", classification_report(ytest, yrf))

RT report
               precision    recall  f1-score   support

           0       0.80      0.84      0.82    113975
           1       0.83      0.79      0.81    113976

    accuracy                           0.81    227951
   macro avg       0.81      0.81      0.81    227951
weighted avg       0.81      0.81      0.81    227951



In [22]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(xtrain, ytrain)
yet=clf.predict(xtest)
print("ET report\n", classification_report(ytest, yet))

ET report
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    113975
           1       1.00      0.99      0.99    113976

    accuracy                           0.99    227951
   macro avg       0.99      0.99      0.99    227951
weighted avg       0.99      0.99      0.99    227951



In [23]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=0)
clf.fit(xtrain, ytrain)
yab=clf.predict(xtest)

print("AB report\n", classification_report(ytest, yab))

AB report
               precision    recall  f1-score   support

           0       0.94      0.96      0.95    113975
           1       0.96      0.94      0.95    113976

    accuracy                           0.95    227951
   macro avg       0.95      0.95      0.95    227951
weighted avg       0.95      0.95      0.95    227951

