In [2]:
import pandas as pd
import numpy as np

In [None]:
balanced = pd.read_csv('train_transaction.csv')

In [2]:
balanced.shape

(590540, 394)

# Make Hour feature

In [3]:
def make_hour_feature(df, tname='TransactionDT'):
    """
    Creates an hour of the day feature, encoded as 0-23. 
    
    Parameters:
    -----------
    df : pd.DataFrame
        df to manipulate.
    tname : str
        Name of the time column in df.
    """
    hours = df[tname] / (3600)        
    encoded_hours = np.floor(hours) % 24
    return encoded_hours

In [4]:
balanced['hours'] = make_hour_feature(balanced)

# Create binary columns for nan values

In [5]:
cols = balanced.columns.values

for i in range(len(cols)):
    binary_col_name = cols[i] + '_b'
#     print(binary_col_name)
    balanced[binary_col_name] = np.where(balanced[cols[i]].notnull(), 0, 1)
    
balanced.shape

(590540, 790)

In [6]:
pd.set_option('display.max_rows', 150)
print("Column-wise missing value percentage")
print(balanced.isnull().sum()/len(balanced)*100)

Column-wise missing value percentage
TransactionID      0.000000
isFraud            0.000000
TransactionDT      0.000000
TransactionAmt     0.000000
ProductCD          0.000000
card1              0.000000
card2              1.512683
card3              0.265012
card4              0.267044
card5              0.721204
card6              0.266028
addr1             11.126427
addr2             11.126427
dist1             59.652352
dist2             93.628374
P_emaildomain     15.994852
R_emaildomain     76.751617
C1                 0.000000
C2                 0.000000
C3                 0.000000
C4                 0.000000
C5                 0.000000
C6                 0.000000
C7                 0.000000
C8                 0.000000
C9                 0.000000
C10                0.000000
C11                0.000000
C12                0.000000
C13                0.000000
C14                0.000000
D1                 0.214888
D2                47.549192
D3                44.514851
D4         

# Drop NA columns that have more than 1% NaNs

In [7]:
df_transact_train = balanced.copy()

low_null_columns = (df_transact_train.isnull().sum()/len(df_transact_train)*100) < 1

low_nan_columns = []
index = 0
for column in low_null_columns:
    if column == True:
        low_nan_columns.append(low_null_columns.index[index])
    index += 1
    
df_low_nans = df_transact_train.loc[:, low_nan_columns]
df_no_nans = df_low_nans.dropna()

df_transact_train = df_no_nans

Trans_RAW = df_transact_train

Trans_FE = df_transact_train
Trans_FE['TransactionID'] = df_transact_train['TransactionID']
Trans_FE['isFraud'] = df_transact_train['isFraud']
Trans_RAW.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(584843, 507)

# Rebalance data set to 50% fraud, 50% non fraud

In [8]:
from random import sample

mask = Trans_RAW['isFraud'] == 1

Trans_RAW_Fraud = Trans_RAW[mask]
Trans_RAW_Non = Trans_RAW[-mask]

Trans_RAW_NonFraud = Trans_RAW_Non.sample(n = 20394)

df_train_balanced = pd.concat([Trans_RAW_NonFraud, Trans_RAW_Fraud])

print (len(Trans_RAW_Fraud))
print (len(Trans_RAW_NonFraud))
print (len(df_train_balanced))

20394
20394
40788


In [10]:
df_train_balanced.to_csv('train data 2.0.csv')

# Prepoccessing

In [3]:
training = pd.read_csv('train data 2.0.csv')

In [5]:
# card6: type of card; we drop the 3 unique rows of 'debit or credit' and 'charge card'
training['card6'].value_counts()
mask = training['card6'] != 'debit or credit'
training = training[mask]
mask2 = training['card6'] != 'charge card'
training = training[mask2]
training.shape

(40787, 508)

In [6]:
# create into category type
training['card6'] = training['card6'].astype('category')
training['card4'] = training['card4'].astype('category')
training['ProductCD'] = training['ProductCD'].astype('category')

# encode into numeric categories; we opt for dummy variables (One Hot Encoding)
training_dummies = pd.get_dummies(training, columns=['card4', 'card6', 'ProductCD'])

In [7]:
from sklearn import preprocessing

# initialize scaler
scaler = preprocessing.MinMaxScaler()

# drop target column
training_v1 = training_dummies.drop('TransactionID', axis=1)
training_v2 = training_v1.drop('isFraud', axis=1)
training_y = training_dummies['isFraud'].copy()
training_id = training_dummies['TransactionID'].copy()

# Fit the data with scaler (scales data)
training_v2 = training_v2.set_index('Unnamed: 0')
x = training_v2.values
x_scaled = scaler.fit_transform(x)
training_scaled = pd.DataFrame(x_scaled)
training_scaled = training_scaled.set_axis([x for x in training_v2.columns], axis=1, inplace=False)
training_scaled.insert(loc=0, column='TransactionID', value=training_id)
training_scaled['y'] = training_y
training_scaled = training_scaled.dropna()
training_scaled['TransactionID'] = training_scaled['TransactionID'].astype(int)
training_scaled.shape

(40786, 515)

In [8]:
training_scaled.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card3,card5,C1,C2,C3,C4,...,card4_mastercard,card4_visa,card6_credit,card6_debit,ProductCD_C,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,y
0,3215526,0.339061,0.344635,0.018635,0.381679,0.693431,0.000214,0.000176,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3546914,0.938032,0.014778,0.918502,0.381679,0.919708,0.001068,0.000351,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3384313,0.630945,0.009769,0.239489,0.381679,0.014599,0.000854,0.000351,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3464568,0.780488,0.00687,0.16173,0.381679,0.868613,0.000214,0.000176,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3046448,0.081188,0.005531,0.078679,0.381679,0.481752,0.000641,0.000527,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


# Train test split and random forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# initialize Random Forest model and split data into training and validation
clf = RandomForestClassifier()
y = training_scaled['y'].copy()
training_scaled = training_scaled.drop(['y'], axis=1)
x = training_scaled.drop(['TransactionID'], axis=1)
x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=42)

In [11]:
clf = RandomForestClassifier(max_depth=30, min_samples_split=7, n_estimators=200)
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=30, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=7,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(x_val)
accuracy_score(y_val, y_pred)

0.8278905560458959

In [15]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(clf, x_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

array([[12853,  2433],
       [ 3177, 12126]], dtype=int64)

In [16]:
from sklearn.metrics import precision_score, recall_score
print(precision_score(y_train, y_train_pred))
recall_score(y_train, y_train_pred)

0.8328868740984958


0.7923936483042541

In [18]:
feature_importances = pd.DataFrame(clf.feature_importances_,
                                    index = x_train.columns,
                                    columns= ['importance']).sort_values('importance',ascending=False)
feature_importances[:25]

Unnamed: 0,importance
TransactionAmt,0.0374
TransactionDT,0.036712
C13,0.036271
C14,0.031904
C1,0.029339
card1,0.029143
C2,0.02431
hours,0.021152
C5,0.019976
D1,0.017492
