In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter('ignore')
df = pd.read_csv('Purchase_Fraud_Data.csv')
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,ip_address,class,category,dob
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,65278,QVPSPJUOCKZAR,SEO,Chrome,M,732758400.0,0,home_essentials,22-2-1976
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,96399,EOGFQPIZPYXFZ,Ads,Chrome,F,350311400.0,0,apparels,2-1-1962
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,57296,YSSKYOSJHPPLJ,SEO,Opera,M,2621474000.0,1,electronics,5-3-1962
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,43650,ATGTXKYKUDUQN,SEO,Safari,M,3840542000.0,0,health_care,3-7-1974
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,45016,NAUITBZFJKHWW,Ads,Safari,M,415583100.0,0,home_essentials,25-8-1970


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             146185 non-null  object 
 8   ip_address      151112 non-null  float64
 9   class           151112 non-null  int64  
 10  category        151112 non-null  object 
 11  dob             146188 non-null  object 
dtypes: float64(1), int64(3), object(8)
memory usage: 13.8+ MB


In [3]:
df['class'].value_counts(normalize=True)*100

0    90.635423
1     9.364577
Name: class, dtype: float64

### <font color='green'>As we can see above that given dataset is unbalanced because the percentage for class 0 and 1 is not equal</font>

In [4]:
df.isna().sum()

user_id              0
signup_time          0
purchase_time        0
purchase_value       0
device_id            0
source               0
browser              0
sex               4927
ip_address           0
class                0
category             0
dob               4924
dtype: int64

### <font color='green'>As we can see above  columns: 'sex' and 'dob' of the dataset have the null values</font>

In [5]:
df.describe()

Unnamed: 0,user_id,purchase_value,ip_address,class
count,151112.0,151112.0,151112.0,151112.0
mean,200171.04097,50521.469003,2152145000.0,0.093646
std,115369.285024,28533.667117,1248497000.0,0.291336
min,2.0,1016.0,52093.5,0.0
25%,100642.5,25919.0,1085934000.0,0.0
50%,199958.0,50484.0,2154770000.0,0.0
75%,300054.0,75296.25,3243258000.0,0.0
max,400000.0,100092.0,4294850000.0,1.0


In [6]:
df.describe(include=['O'])

Unnamed: 0,signup_time,purchase_time,device_id,source,browser,sex,category,dob
count,151112,151112,151112,151112,151112,146185,151112,146188
unique,151112,150663,137956,3,5,2,5,13745
top,2015-02-24 22:55:49,2015-09-10 09:04:53,CQTUVBYIWWWBC,SEO,Chrome,M,electronics,24-7-1982
freq,1,3,20,60615,61432,85445,32056,37


In [7]:
len(df)==len(df['user_id'].unique())

True

### <font color='green'>Based on above overview, we can see that there is  missing entries for the features: sex and dob and these are not very insightful later we can remove them, all the user ids are unique in the table. In terms of the percentage of fraudulent activities, current mean rate of fraudulent transactions is 9.36%.</font>

## <font color='blue'>The target variable is class, and there are some features in the table that we can play with for the purpose of feature engineering, this include:</font>
### <font color='green'>Calculate time difference between sign-up time and purchase time;</font>
### <font color='green'>Check whether different users use the same device, which could indicate fake accounts;</font>
### <font color='green'>Check whether different users have the same IP address, which could also be an indicator;</font>
### <font color='green'>Explore other temporal patterns, such as week of the year and day of the week.</font>

In [8]:
df.head(1)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,ip_address,class,category,dob
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,65278,QVPSPJUOCKZAR,SEO,Chrome,M,732758400.0,0,home_essentials,22-2-1976


In [9]:
# Conversion of datatypes for columns: 'signup_time' and 'purchase_time' to datetime64
df['purchase_time'] = pd.to_datetime(df['purchase_time'])
df['signup_time'] = pd.to_datetime(df['signup_time'])

In [10]:
# Create the time difference column
df['time_diff']=(df['purchase_time']-df['signup_time']).apply(lambda x:x.days)

In [12]:
df.groupby('device_id')['user_id'].count().reset_index().rename


<bound method DataFrame.rename of             device_id  user_id
0       AAALBGNHHVMKG        1
1       AAAWIHVCQELTP        1
2       AAAXJHWCLISKY        1
3       AAAXXOZJRZRAO       11
4       AABFGRPBQHWFQ        1
...               ...      ...
137951  ZZZGSIJRNCXBJ        1
137952  ZZZIKLJSVSQMF        1
137953  ZZZKJIZHJEDFN        1
137954  ZZZMVOGBAJVTM        1
137955  ZZZXASJUVUNMV        1

[137956 rows x 2 columns]>

In [13]:
# Check whether different users use the same device and create a new column to count repeated device use
Temp1=df.groupby('device_id')['user_id'].count().reset_index().rename(columns={'user_id':'device_count'})
df=df.merge(Temp1,how='left',on='device_id')

In [14]:
# Check whether different users use the same IP address and create a new column to count repeated IP use
Temp2=df.groupby('ip_address')['user_id'].count().reset_index().rename(columns={'user_id':'ip_count'})
df=df.merge(Temp2,how='left',on='ip_address')

In [15]:
# Create week of year and day of week columns for sign time and purchase time
df['signup_week']=df['signup_time'].apply(lambda x:x.week)
df['signup_day']=df['signup_time'].apply(lambda x:x.dayofweek)
df['purchase_week']=df['purchase_time'].apply(lambda x:x.week)
df['purchase_day']=df['purchase_time'].apply(lambda x:x.dayofweek)

In [16]:
# Show the new data table
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,ip_address,class,category,dob,time_diff,device_count,ip_count,signup_week,signup_day,purchase_week,purchase_day
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,65278,QVPSPJUOCKZAR,SEO,Chrome,M,732758400.0,0,home_essentials,22-2-1976,52,1,1,9,1,16,5
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,96399,EOGFQPIZPYXFZ,Ads,Chrome,F,350311400.0,0,apparels,2-1-1962,0,1,1,23,6,24,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,57296,YSSKYOSJHPPLJ,SEO,Opera,M,2621474000.0,1,electronics,5-3-1962,0,12,12,1,3,1,3
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,43650,ATGTXKYKUDUQN,SEO,Safari,M,3840542000.0,0,health_care,3-7-1974,5,1,1,18,1,19,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,45016,NAUITBZFJKHWW,Ads,Safari,M,415583100.0,0,home_essentials,25-8-1970,50,1,1,30,1,37,2


### <font color='green'>Now, we may select what features to include in our machine learning model. We will drop user_id, device_id and ip_address as the three columns are just identifiers and don't contain much useful information in themselves. We have also engineered new features from time-related features, therefore, we can also drop signup_time and purchase_time.</font>

In [None]:
columns = ['user_id','signup_time','purchase_time','device_id','ip_address']
df_new=df.drop(columns,axis=1)
df_new.head()

In [None]:
# Pre-check correlation between features and our target variable 'class' using heatmap.
fig,ax=plt.subplots(figsize=(10,8))
sns.heatmap(df_new.corr(), annot=True, cmap='YlGnBu', vmin=-1, vmax=1, cbar=True, linewidths=0.5)
plt.show()

### <font color='blue'>As shown above, device count and ip count have high correlation with the target.</font>

### <font color='green'>Next, we will move on to pick and apply a machine learning model. I will use H2O random forest model here because: random forest requires very little time to optimize (its default parameters are often close to the best ones), and it is strong with both continuous and discrete variables. H2O random forest can handle categorical variables without the need to convert them to numerical ones, which will make things easier for us in this case.</font>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, classification_report

In [None]:
#df_new['source']= df_new['source'].astype("category")

In [None]:
df_new.info()

In [None]:
# Define features and target
feature = ['purchase_value', 'source', 'browser', 'sex', 'time_diff', 'device_count', 'ip_count',
           'signup_week', 'signup_day', 'purchase_week','purchase_day']
target = 'class'
X=df_new[feature]
y=df_new[target]

In [None]:
# Split the data into 70% training and 30% test data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [None]:
X_train.dtypes

In [None]:
features_to_encode = list(X_train.select_dtypes(include = ['object']).columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
col_trans = make_column_transformer(
                        (OneHotEncoder(),features_to_encode),
                        remainder = "passthrough"
                        )

In [None]:
#build the classifier
rf_classifier = RandomForestClassifier(
                      min_samples_leaf=50,
                      n_estimators=150,
                      bootstrap=True,
                      oob_score=True,
                      n_jobs=-1,
                      random_state=50,
                      max_features='auto')

In [None]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(col_trans, rf_classifier)
pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_curve, auc, classification_report,accuracy_score
#test the classifier
y_test_pred = pipe.predict(X_test)

#calculate accuracy
accuracy_score(y_test, y_test_pred)
#accuracy_score(y_train, pipe.predict(X_train))

In [None]:
# Print out classification report
print(classification_report(y_test, (y_test_pred > 0.5).astype(int)))

In [None]:
y_train_pred = pipe.predict(X_train)
train_fpr, train_tpr,_=roc_curve(y_train,y_train_pred)
train_auc = np.round(auc(train_fpr, train_tpr),3)
test_fpr, test_tpr, _ = roc_curve(y_test, y_test_pred)
test_auc = np.round(auc(test_fpr, test_tpr), 3)

In [None]:
# Plot ROC curve and AUC
train_fpr = np.insert(train_fpr, 0, 0)
train_tpr = np.insert(train_tpr, 0, 0)
test_fpr = np.insert(test_fpr, 0, 0)
test_tpr = np.insert(test_tpr, 0, 0)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_fpr, train_tpr, label='Train AUC: ' + str(train_auc))
ax.plot(test_fpr, test_tpr, label='Test AUC: ' + str(test_auc))
ax.plot(train_fpr, train_fpr, 'k--', label='Chance Curve')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.grid(True)
ax.legend(fontsize=12)
plt.show()

### <font color='blue'>As shown in above classification report and ROC curve, the recall rate of label 1 is 0.67, meaning out of true fraudulent samples, 67% are correctly predicted. Since this challenge is about classification of fraudulent activities, it is critical to correctly classify every fraudulent activity in order to minimize the risk. This means we may need to reduce the number of false negatives (fraudulent but incorrectly classified as non-fraudulent) by lowering the threshold from the default 0.5 level, even if this may result in more false positives (non-fraudulent activities classfied as fraudulent).</font>

In [None]:
# Plot feature importance
rf_fit = rf_classifier.fit(X_train, y_train)
feature_importances = rf_fit.best_estimator_._final_estimator.feature_importances_

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='scaled_importance',y='variable',data=feature_importances)
plt.show()