In [1]:
# Importing libraries

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import pickle
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)

# Loading Dataset which is CSV file in my Drive
# Connecting to the drive for dataset

df_train = pd.read_csv('/content/fraudTest.csv', low_memory=False, index_col=0)
df_test = pd.read_csv('/content/fraudTrain.csv', low_memory=False, index_col=0)

# Prints the shapes of the training and test DataFrames to check the number of rows and columns.

df_train.shape
df_test.shape

(1296675, 22)

In [2]:

# Checks if the columns in the training and test DataFrames are the same.

list(df_train.columns) == list(df_test.columns)

# Prints the shape of the combined DataFrame.

df = pd.concat([df_train, df_test],ignore_index=True)
df.shape

(1852394, 22)

In [3]:
# Counts the occurrences of each unique value in the 'is_fraud' column, providing insight into the class distribution.

df.is_fraud.value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

In [4]:
# specified columns from the DataFrame, removing unnecessary features.

def clean_df(df):
    return df.drop(['cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time'],axis=1)

# clean the DataFrame and prints the first two rows.

df = clean_df(df)
df.head(2)

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,fraud_Kirlin and Sons,personal_care,2.86,M,33.9659,-80.9355,333497,Mechanical engineer,1371816865,33.986391,-81.200714,0
1,fraud_Sporer-Keebler,personal_care,29.84,F,40.3207,-110.436,302,"Sales professional, IT",1371816873,39.450498,-109.960431,0


In [5]:
# Prints information about the DataFrame, including data types and non-null values, to understand the dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 12 columns):
 #   Column      Dtype  
---  ------      -----  
 0   merchant    object 
 1   category    object 
 2   amt         float64
 3   gender      object 
 4   lat         float64
 5   long        float64
 6   city_pop    int64  
 7   job         object 
 8   unix_time   int64  
 9   merch_lat   float64
 10  merch_long  float64
 11  is_fraud    int64  
dtypes: float64(5), int64(3), object(4)
memory usage: 169.6+ MB


In [6]:
# Splits the DataFrame into training and testing sets using train_test_split.

train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((1481915, 12), (370479, 12))

In [7]:
# Defines a function encode to label encode categorical columns in the DataFrame

def encode(df):
    df_obj = df.select_dtypes(include=['object'])
    encoders = {}
    for col in df_obj.columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        encoders[col] = encoder
    with open('LE_mdl_v1.pkl', 'wb') as f:
        pickle.dump(encoders, f)
    return df

train = encode(train)
train.head(2)

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,638,9,1.15,0,32.153,-90.1217,19685,197,1351901098,31.495807,-89.902654,0
1,43,2,65.97,0,34.2651,-77.867,186140,178,1327718138,34.922678,-77.34116,0


In [8]:
x = train.drop(columns=['is_fraud'])
y = train['is_fraud']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=42)

In [9]:
# Initializes three machine learning models: Logistic Regression, Random Forest, and Decision Tree.

model1 = LogisticRegression()
model2 = RandomForestClassifier()
model3 = DecisionTreeClassifier()

In [10]:
# Defines a function model_train to train a given model, make predictions, print accuracy and classification report

def model_train(model, x_train, y_train, x_test, y_test):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    print('Accuracy Score: ',accuracy_score(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    with open(str(model)[:3] + '_mdl.pkl', 'wb') as f:
        pickle.dump(model,f)

In [11]:
#  LogisticRegression
model_train(model1, x_train, y_train, x_val, y_val)

Accuracy Score:  0.9948849967778179
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    294867
           1       0.00      0.00      0.00      1516

    accuracy                           0.99    296383
   macro avg       0.50      0.50      0.50    296383
weighted avg       0.99      0.99      0.99    296383



In [12]:
# DecisionTreeClassifier
model_train(model3, x_train, y_train, x_val, y_val)

Accuracy Score:  0.9961198854185294
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    294867
           1       0.61      0.65      0.63      1516

    accuracy                           1.00    296383
   macro avg       0.81      0.82      0.81    296383
weighted avg       1.00      1.00      1.00    296383



In [14]:
# RandomForestClassifier
model_train(model2, x_train, y_train, x_val, y_val)

Accuracy Score:  0.9975066046298202
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    294867
           1       0.84      0.64      0.72      1516

    accuracy                           1.00    296383
   macro avg       0.92      0.82      0.86    296383
weighted avg       1.00      1.00      1.00    296383

