In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pickle

In [2]:
data = pd.read_csv('feat_data.csv')

In [3]:
data

Unnamed: 0,disrict,client_catg,region,target,1transactions_count,consommation_level_1_mean,consommation_level_2_mean,consommation_level_3_mean,consommation_level_4_mean,year,month,day,month_name
0,60,11,101,0.0,35,352.400000,10.571429,0.000000,0.000000,1994,12,31,December
1,69,11,107,0.0,37,557.540541,0.000000,0.000000,0.000000,2002,5,29,May
2,62,11,301,0.0,18,798.611111,37.888889,0.000000,0.000000,1986,3,13,March
3,69,11,105,0.0,20,1.200000,0.000000,0.000000,0.000000,1996,11,7,November
4,62,11,303,0.0,14,663.714286,104.857143,117.357143,36.714286,2014,10,14,October
...,...,...,...,...,...,...,...,...,...,...,...,...,...
135488,62,11,304,0.0,71,1.957746,0.000000,0.000000,0.000000,2004,7,26,July
135489,63,11,311,0.0,41,185.853659,0.756098,0.000000,0.000000,2012,10,25,October
135490,63,11,311,0.0,36,273.083333,0.000000,0.000000,0.000000,2011,11,22,November
135491,60,11,101,0.0,2,300.000000,70.500000,0.000000,0.000000,1993,12,22,December


In [4]:
X = data.drop('target',axis=1)
y = data.target

# Data Preparation

In [5]:
# Devide them to training, validation and test parts (60:20:20): 
X_train_full_df, X_test_df, y_train_full_df, y_test = train_test_split(X, y, test_size = 0.20, random_state = 2022)
X_train_df, X_val_df, y_train, y_val = train_test_split(X_train_full_df, y_train_full_df, test_size = 0.25, random_state = 2022)

# Vectorize feature matrices in the form of dictionary (with renewed indexes):
dv = DictVectorizer(sparse=False)

X_train_df = X_train_full_df.reset_index(drop=True)
X_train_dict = X_train_full_df.to_dict(orient='records')
X_train = dv.fit_transform(X_train_dict)

#X_val_df = X_val_df.reset_index(drop=True)
#X_val_dict = X_val_df.to_dict(orient='records')
#X_val = dv.fit_transform(X_val_dict)

X_test_df = X_test_df.reset_index(drop=True)
X_test_dict = X_test_df.to_dict(orient='records')
X_test = dv.fit_transform(X_test_dict)

# Renew the index of target variables
y_train = y_train_full_df.reset_index(drop=True)
#y_val = y_val.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# Random Forest Classifier

In [6]:
model = RandomForestClassifier(n_estimators=200,
                                    max_depth = 3,min_samples_leaf=3,
                                    random_state =2022)
model.fit(X_train,y_train)

y_pred = model.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred)

0.7254847397831794

# saving the model

In [7]:
import pickle

In [8]:
output_file = 'fraud_model.bin'

In [9]:
with open(output_file, 'wb') as f_out:
    
    pickle.dump((dv,model), f_out)

# Load the model

In [10]:
import pickle

In [11]:
model_file = 'fraud_model.bin'

In [12]:
with open(model_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)
    
    

In [13]:
dv, model

(DictVectorizer(sparse=False),
 RandomForestClassifier(max_depth=3, min_samples_leaf=3, n_estimators=200,
                        random_state=2022))

In [14]:
y_test

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
27094    0.0
27095    0.0
27096    0.0
27097    0.0
27098    0.0
Name: target, Length: 27099, dtype: float64

In [20]:
[X_val_df.iloc[132].to_dict]

[<bound method Series.to_dict of disrict                              60
 client_catg                          11
 region                              101
 1transactions_count                  34
 consommation_level_1_mean    442.735294
 consommation_level_2_mean           0.0
 consommation_level_3_mean           0.0
 consommation_level_4_mean           0.0
 year                               2007
 month                                11
 day                                  30
 month_name                     November
 Name: 46716, dtype: object>]