In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime; the dataset and model paths used later live under this directory
drive_root = '/content/drive'
drive.mount(drive_root)

Mounted at /content/drive


In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

In [None]:
# Load the train and test transaction CSVs from the mounted Drive folder
train_csv = '/content/drive/MyDrive/Fraud Detection/dataset/transactions_train.csv'
test_csv = '/content/drive/MyDrive/Fraud Detection/dataset/transactions_test.csv'
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [None]:
# Columns removed before modelling, and the transaction types that are kept
dropped_cols = ['Unnamed: 0', 'nameOrig', 'nameDest']
kept_types = ['TRANSFER', 'CASH_OUT']

# Non-in-place drop: each statement produces a new frame instead of mutating the loaded one
train_df = train_df.drop(columns=dropped_cols)
test_df = test_df.drop(columns=dropped_cols)

# Keep only TRANSFER / CASH_OUT rows. The explicit .copy() makes each result an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning (the filtered result is otherwise flagged as a slice).
train_df = train_df[train_df['type'].isin(kept_types)].copy()
test_df = test_df[test_df['type'].isin(kept_types)].copy()

In [None]:
# Add two derived columns to both frames (same formulas for train and test):
#   difOrig = amount + newbalanceOrig - oldbalanceOrig
#   difDest = amount + oldbalanceDest - newbalanceDest
for frame in (train_df, test_df):
    frame['difOrig'] = frame['amount'] + frame['newbalanceOrig'] - frame['oldbalanceOrig']
    frame['difDest'] = frame['amount'] + frame['oldbalanceDest'] - frame['newbalanceDest']

In [None]:
# Pull the target column out of each frame first ...
target_col = 'isFraud'
y, y_test = train_df[target_col], test_df[target_col]

# ... then remove it in place so only feature columns remain
for frame in (train_df, test_df):
    frame.drop(columns=target_col, inplace=True)

In [None]:
# Encode data
# Fit the encoder once on the training column and reuse it for the test column.
# Fitting a second, independent encoder on test can assign different integer codes
# to the same category (e.g. when a category is absent from the test split),
# which would silently corrupt the 'type' feature at prediction time.
type_encoder = LabelEncoder()
train_df['type'] = type_encoder.fit_transform(train_df['type'])
# transform() raises ValueError on a category not seen in training instead of mis-encoding it
test_df['type'] = type_encoder.transform(test_df['type'])

In [None]:
# Split train - validate data
# random_state pins the shuffle so the split is the same on every run;
# stratify=y keeps the class proportions of y the same in the train and validation parts.
x_train, x_val, y_train, y_val = train_test_split(
    train_df, y, test_size=0.2, random_state=12, stratify=y
)

In [None]:
# SMOTE oversampling of the minority class.
# `sampling_strategy=` and `fit_resample` are the current imbalanced-learn spellings;
# the older `ratio=` keyword and `fit_sample` method no longer exist in recent releases.
# NOTE(review): this resamples the full training frame (train_df, y), not the
# x_train / y_train split, so rows in x_val are also part of the data the model is
# fitted on -- confirm this is intended before using x_val for validation.
sm = SMOTE(sampling_strategy='minority', random_state=12)

xsm_train, ysm_train = sm.fit_resample(train_df, y)



In [None]:
# Finding the best model: candidate classifiers (default hyper-parameters) keyed by display name
model = dict([
    ("Logistic Regression", LogisticRegression()),
    ("KNN", KNeighborsClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("SGD", SGDClassifier()),
])

def fit_and_evaluate(models, X_train, X_test, y_train, y_test):
    """
    Train every model on the training data and score it on the test data.
    models: dictionary mapping a name to a model object with fit() and score()
    X_train: training data (no labels)
    X_test: testing data (no labels)
    y_train: training labels
    y_test: testing labels
    Returns a dictionary mapping each name to whatever the model's score() returned.
    The model objects are fitted in place.
    """
    def _fit_then_score(estimator):
        # Fit first, then score -- one model at a time, in dictionary order
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _fit_then_score(estimator) for label, estimator in models.items()}

In [None]:
# model_scores = fit_and_evaluate(models = model,
#                                 X_train = xsm_train,
#                                 X_test = test_df,
#                                 y_train = ysm_train,
                                
#                                 y_test = y_test)
# model_scores

In [None]:
# Seed the forest so the fitted model (and therefore the reported metrics and the
# saved artefact) is reproducible across kernel restarts.
random_forest = RandomForestClassifier(random_state=12)
random_forest.fit(xsm_train, ysm_train)
# y_pred = random_forest.predict(x_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
def model_result(clf, x_val, y_val):
    """
    Print a classification report for a fitted classifier.
    clf: fitted classifier exposing predict()
    x_val: feature data to predict on (no labels)
    y_val: true labels for x_val
    Returns None; the report is printed to stdout.
    """
    # Only hard predictions feed the report, so predict_proba is not called;
    # this also lets classifiers without predict_proba be evaluated.
    y_pred = clf.predict(x_val)
    print("classification_report")
    print(classification_report(y_val, y_pred))

In [None]:
# Report the fitted random forest's performance on the test frame and its labels
model_result(clf=random_forest, x_val=test_df, y_val=y_test)

AUPRC : 1.0
F1 - score : 1.0
Confusion_matrix : 
[[315   0]
 [  0 700]]
accuracy_score
1.0
classification_report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       315
           1       1.00      1.00      1.00       700

    accuracy                           1.00      1015
   macro avg       1.00      1.00      1.00      1015
weighted avg       1.00      1.00      1.00      1015



In [None]:
# Zip model: write the fitted random forest as a gzip-compressed pickle on Drive
import gzip
import pickle  # needed by pickle.dump below; it is not imported anywhere else in the notebook

# "wb": pickle writes bytes, so open the gzip stream explicitly in binary write mode
with gzip.open("/content/drive/MyDrive/Fraud Detection/model.pgz", "wb") as f:
    pickle.dump(random_forest, f)