In [12]:
import pandas as pd
import numpy as np
import re

# machine learning
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn2pmml import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn_pandas import DataFrameMapper

final_columns = ['cardCompany' ,'cardNetwork' ,'cardType' ,'transAmount' ,'transChannel' ,'transCurrency' ,'cardExpDate']

train_df = pd.read_json('../input/transactions_data.txt',lines=True)
train_df['fraud'] = train_df['fraud'].map({False:0,True:1})
train_df.loc[train_df.query('fraud == 0').sample(frac=.1).index,'fraud'] = 1
x=train_df.loc[:,train_df.columns!='fraud']
y=train_df.loc[:,'fraud']

class CustomFilter(BaseEstimator,TransformerMixin):
    def __init__(self, col_list):
        self.col_list = col_list
        
    def transAmountNames(self,name):
        if name <= 0.796:
            return 0
        elif (name > 0.796) & (name <= 1812.738):
            return 1
        elif (name > 1812.738) & (name <= 2925.726):
            return 2
        else:
            return 3

    def companyNames(self,name):
        p1 = re.compile('[a-zA-Z]+-[a-zA-Z]+')
        p2 = re.compile('[a-zA-Z]+, [a-zA-Z]+ and [a-zA-Z]+')
        p3 = re.compile('[a-zA-Z]+ Inc')
        p4 = re.compile('[a-zA-Z]+ Group')
        p5 = re.compile('[a-zA-Z]+ LLC')
        p6 = re.compile('[a-zA-Z]+ and Sons')
        if p1.match(name):
            return name.split('-')[0]    
        elif p2.match(name):
            return name.split(',')[0]
        elif p3.match(name) or p4.match(name) or p5.match(name) or p6.match(name):
            return name.split(' ')[0]
        else:
            return name

    def getCardExpYear(self,date):
        return date.split('/')[1]        

    def transform(self, X, **transform_params):
        result = []
        for index, rowdata in X.iterrows():
            rowdict = {}
            for col in self.col_list:
                if col=="cardCompany":
                    rowdict.update( {col: self.companyNames(rowdata[col]) } )
                elif col =="cardExpDate":
                    rowdict.update( {"cardExpYear": self.getCardExpYear(rowdata[col]) } )
                elif col =="transAmount":
                    rowdict.update( {col: self.transAmountNames(rowdata[col]) } )                    
                else:
                    rowdict.update( {col: rowdata[col] } )
            result.append(rowdict)        
        return pd.DataFrame(result)
    
    def fit(self, X, y=None, **fitparams):
        return self
    
class MyLabelEncoder(TransformerMixin):
    def __init__(self):
        self.encoder = LabelEncoder()
    def fit(self, x, y=0):
        return self
    def transform(self, x, y=0):
        return x.apply(self.encoder.fit_transform)    
 
column_preprocessor = DataFrameMapper([
    ("cardType",LabelEncoder()),
    ("cardNetwork",LabelEncoder()),
    ("transChannel",LabelEncoder()),
    ("transCurrency",LabelEncoder())
])

#D = pd.DataFrame([['Runolfsson-Runolfsson','visa','credit',2019.212078,'MOBILE','pound','06/20'],['KR-KR','visa','credit',2019.212078,'MOBILE','pound','06/20']], columns = ['cardCompany','cardNetwork','cardType','transAmount','transChannel','transCurrency','cardExpDate']);    
#PMMLPipeline([("input_filter",CustomFilter(final_columns)),("label_encoder",MyLabelEncoder())]).transform(D)
model1 = PMMLPipeline([("input_filter",column_preprocessor),("rfc",ensemble.RandomForestClassifier(n_estimators=50, criterion='gini', max_depth=3))])

In [13]:
model1.fit(x,y)

PMMLPipeline(steps=[('input_filter', DataFrameMapper(default=False, df_out=False,
        features=[('cardType', LabelEncoder()), ('cardNetwork', LabelEncoder()), ('transChannel', LabelEncoder()), ('transCurrency', LabelEncoder())],
        input_df=False, sparse=False)),
       ('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [14]:
model1.predict_proba(pd.DataFrame([['Runolfsson-Runolfsson','visa','credit',2019.212078,'MOBILE','yen','06/20'],['KR-KR','visa','credit',2019.212078,'MOBILE','pound','06/20']], columns = ['cardCompany','cardNetwork','cardType','transAmount','transChannel','transCurrency','cardExpDate']))

array([[0.89945419, 0.10054581],
       [0.90158163, 0.09841837]])

In [16]:
sklearn2pmml(model1, "D:/rfc1.pmml")