# Machine Learning 

In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection  import train_test_split
from sklearn import metrics

In [2]:
df = pd.read_csv("new_df.csv", sep = ",")

In [3]:
df.columns

Index(['purchase_value', 'source', 'browser', 'sex', 'age', 'class', 'country',
       'continent', 'duration', 'device_count'],
      dtype='object')

In [4]:
df.dtypes

purchase_value    float64
source             object
browser            object
sex                object
age                 int64
class               int64
country            object
continent          object
duration          float64
device_count        int64
dtype: object

In [5]:
# Change object to category variables 
numeric_df = df
for i in numeric_df.columns:
    if numeric_df[i].dtypes == 'object':
        numeric_df[i] = numeric_df[i].astype('category').cat.codes

In [6]:
numeric_df

Unnamed: 0,purchase_value,source,browser,sex,age,class,country,continent,duration,device_count
0,34.0,2,0,1,39,0,84,1,52.160671,1
1,16.0,0,0,0,53,0,171,3,0.207685,1
2,15.0,2,3,1,53,1,171,3,0.000012,12
3,44.0,2,4,1,41,0,172,6,5.695428,1
4,39.0,0,4,1,45,0,171,3,50.479873,1
5,42.0,0,0,1,18,0,32,3,49.084850,1
6,11.0,0,0,0,19,0,172,6,25.206308,1
7,27.0,0,3,1,34,0,171,3,49.406586,1
8,30.0,2,2,0,43,0,36,1,41.599537,1
9,62.0,0,2,1,31,0,171,3,57.219363,1


In [7]:
numeric_df.dtypes

purchase_value    float64
source               int8
browser              int8
sex                  int8
age                 int64
class               int64
country             int16
continent            int8
duration          float64
device_count        int64
dtype: object

In [8]:
# define X and y 
y = numeric_df['class'].astype('category').cat.codes
X = numeric_df.loc[:,numeric_df.columns !='class']

## Logit Regression

- LogisticRegressionCV(): can use k-fold, but can not produce model with inner method. 
- sm.Logit(): k-fold?, can produce model with model.summary()


### Model 1 - full dataset 

In [9]:
# Logit regression 
import statsmodels.api as sm
from sklearn import metrics



In [10]:
X['intercept'] = 1.0 
# random sampling 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=109)

In [11]:
# model
logit_model = sm.Logit(y_train,X_train).fit()
pred_prob = logit_model.predict(X_test)
print("Model:",logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.188835
         Iterations 7
Model:                            Logit Regression Results                           
Dep. Variable:                      y   No. Observations:               120889
Model:                          Logit   Df Residuals:                   120879
Method:                           MLE   Df Model:                            9
Date:                Tue, 21 Jan 2020   Pseudo R-squ.:                  0.3913
Time:                        10:20:46   Log-Likelihood:                -22828.
converged:                       True   LL-Null:                       -37505.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
purchase_value     0.0002      0.001      0.254      0.799      -0.001       0.002
source    

In [12]:
X_test.loc[:,'prediction']=0
X_test.loc[pred_prob > 0.5,'prediction']=1
print(pd.crosstab(y_test,X_test['prediction'],rownames =['actual'],colnames=['predicted']))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


predicted      0     1
actual                
0          27226   132
1           1320  1545


Accuracy:

In [13]:
print(metrics.accuracy_score(y_test, X_test['prediction']))

0.9519571187506204


Precision:

In [14]:
print(metrics.precision_score(y_test, X_test['prediction']))

0.9212880143112702


recall:

In [15]:
print(metrics.recall_score(y_test, X_test['prediction']))

0.5392670157068062


F1-score

In [16]:
print(metrics.f1_score(y_test, X_test['prediction']))

0.6803170409511229


The model performs not well, as recall rate is relatively low.  

## Model 2 - reduced dataset  - 7 variables 

In [17]:
#Backward selection
""" 
ref: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
    
>>> from sklearn.datasets import make_friedman1
>>> from sklearn.feature_selection import RFE
>>> from sklearn.svm import SVR
>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
>>> estimator = SVR(kernel="linear")
>>> selector = RFE(estimator, 5, step=1)
>>> selector = selector.fit(X, y)
>>> selector.support_
array([ True,  True,  True,  True,  True, False, False, False, False,
       False])
>>> selector.ranking_
array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])
"""

' \nref: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html\n    \n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.feature_selection import RFE\n>>> from sklearn.svm import SVR\n>>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n>>> estimator = SVR(kernel="linear")\n>>> selector = RFE(estimator, 5, step=1)\n>>> selector = selector.fit(X, y)\n>>> selector.support_\narray([ True,  True,  True,  True,  True, False, False, False, False,\n       False])\n>>> selector.ranking_\narray([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])\n'

In [18]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
selector = RFE(estimator,8,step =1)
selector = selector.fit(X,y)
print(X.columns)
selector.ranking_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Index(['purchase_value', 'source', 'browser', 'sex', 'age', 'country',
       'continent', 'duration', 'device_count', 'intercept'],
      dtype='object')


array([2, 1, 1, 1, 1, 3, 1, 1, 1, 1])

So we decide to abandon purchase value and country

In [19]:
X1 = X[['source', 'browser', 'sex', 'age', 'continent', 'duration',
       'device_count', 'intercept']]

In [20]:
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.20,random_state=109)
logit_model2 = sm.Logit(y_train,X1_train).fit()
pred_prob2 = logit_model2.predict(X1_test)
X1_test.loc[:,'prediction'] = 0 
X1_test.loc[pred_prob2 > 0.5,'prediction'] = 1

Optimization terminated successfully.
         Current function value: 0.188839
         Iterations 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [21]:
print(logit_model2.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:               120889
Model:                          Logit   Df Residuals:                   120881
Method:                           MLE   Df Model:                            7
Date:                Tue, 21 Jan 2020   Pseudo R-squ.:                  0.3913
Time:                        10:21:03   Log-Likelihood:                -22829.
converged:                       True   LL-Null:                       -37505.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
source          -0.0031      0.015     -0.205      0.838      -0.033       0.027
browser          0.0036      0.009      0.385      0.700      -0.015       0.022
sex              0.0447      0.028      1.62

In [22]:
print(pd.crosstab(y_test,X1_test['prediction'],rownames =['actual'],colnames=['predicted']))

predicted      0     1
actual                
0          27226   132
1           1320  1545


Based on Confusion matrix, there's no improvment for all index. 

## Model 3 - reduced model - 5 variables 

In [23]:
stimator = LogisticRegression()
selector = RFE(estimator,6,step =1)
selector = selector.fit(X,y)
print(X.columns)
selector.ranking_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Index(['purchase_value', 'source', 'browser', 'sex', 'age', 'country',
       'continent', 'duration', 'device_count', 'intercept'],
      dtype='object')


array([4, 1, 1, 1, 3, 5, 2, 1, 1, 1])

So if we only 5 variables, we only keep source, browser, sex, duration, device_count

In [24]:
X2 = X[['source','browser', 'sex', 'duration', 'device_count', 'intercept']]

In [25]:
X2_train, X2_test, y_train, y_test = train_test_split(X2,y,test_size = 0.20 ,random_state = 190)

In [26]:
model3 = sm.Logit(y_train, X2_train).fit()
pred_prob3 = model3.predict(X2_test)


Optimization terminated successfully.
         Current function value: 0.188431
         Iterations 7


In [27]:
print(model3.summary())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:               120889
Model:                          Logit   Df Residuals:                   120883
Method:                           MLE   Df Model:                            5
Date:                Tue, 21 Jan 2020   Pseudo R-squ.:                  0.3946
Time:                        10:21:16   Log-Likelihood:                -22779.
converged:                       True   LL-Null:                       -37625.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
source          -0.0118      0.015     -0.777      0.437      -0.041       0.018
browser          0.0050      0.009      0.538      0.591      -0.013       0.023
sex              0.0284      0.028      1.03

In [28]:
X2_test.loc[:,'prediction'] = 0
X2_test.loc[pred_prob3 > 0.5, 'prediction'] = 1 
print(pd.crosstab(y_test,X2_test['prediction'],rownames =['actual'],colnames=['predicted']))

predicted      0     1
actual                
0          27275   136
1           1344  1468


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Accuracy: 

In [29]:
print(metrics.accuracy_score(y_test, X2_test['prediction']))

0.9510306720047645


Precision:

In [30]:
print(metrics.precision_score(y_test, X2_test['prediction']))

0.9152119700748129


Recall: 

In [31]:
print(metrics.recall_score(y_test, X2_test['prediction']))

0.5220483641536273


In [32]:
print(metrics.f1_score(y_test, X2_test['prediction']))

0.6648550724637681


In summary, reduced model didn't perform better than full model. 

### Summary

Based on the model results, we find all the models have a similar performance because of the extremely imbalanced dataset. In most of the observations, they usually shows non-fraudulent. So the prediction is biased and the class is always predicted as non-fruadlent. In this case, we need re-sampling 

## Re-Sampling (under-sampling Majority)

make_imbalanced(): Turns a dataset into an imbalanced dataset at specific ratio.

In [33]:
"""
ref: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.datasets.make_imbalance.html#imblearn.datasets.make_imbalance
"""

'\nref: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.datasets.make_imbalance.html#imblearn.datasets.make_imbalance\n'

In [34]:
df['class'].value_counts()

0    136961
1     14151
Name: class, dtype: int64

In [79]:
14151/(14151+136961)

0.09364577267192546

In [35]:
from imblearn.datasets import make_imbalance
X_res, y_res = make_imbalance(X, y,
                      sampling_strategy={0: 14151, 1: 14151},
                      random_state=40)


+ number of rows :
- Sample data: 28302
- test data: 28302 * 0.2 = 5660.4
- So we import k-fold cross validation for the small dataset. 

## Model 4 - k fold with resampling data

In [36]:

from sklearn.linear_model import LogisticRegressionCV

In [37]:
model4 = LogisticRegressionCV(cv =3, random_state = 109).fit(X_res,y_res)
predicted = model4.predict(X_res)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Confusion Matrix:

In [38]:
#ref: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix
print(metrics.confusion_matrix(y_res,predicted))

[[13470   681]
 [ 4781  9370]]


Accuracy rate:

In [39]:
print(metrics.accuracy_score(y_res, predicted))

0.8070101052929122


Precision rate:

In [40]:
print(metrics.precision_score(y_res, predicted))

0.9322455477066959


Recall rate:

In [41]:
print(metrics.recall_score(y_res, predicted))

0.6621440180905943


F1_score: 

In [42]:
print(metrics.f1_score(y_res, predicted))

0.7743161722171722


### Summary

After resampling, the average recall rate is higher than before. Accuracy rate is 80.7%, and F1-score is 77.4% 

## Model 5 -  cross-validator StratifiedKFold

In [43]:
model5 = LogisticRegressionCV().fit(X,y)
predicted = model5.predict(X)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [44]:
print(metrics.confusion_matrix(y,predicted))

[[136256    705]
 [  6723   7428]]


In [45]:
print(metrics.accuracy_score(y, predicted))

0.9508444067976071


In [46]:
print(metrics.precision_score(y, predicted))

0.9133161195130948


In [47]:
print(metrics.recall_score(y, predicted))

0.5249099003603985


In [48]:
print(metrics.f1_score(y, predicted))

0.6666666666666666


 We don't think stratified K fold is better than model 4

## Model 6 - resample, k-fold, reduced model

In [49]:
X_red_res = X_res[['source','browser', 'sex', 'duration', 'device_count', 'intercept']]
model6 = LogisticRegressionCV(cv =3, random_state = 109).fit(X_red_res,y_res)
predicted = model6.predict(X_red_res)

In [50]:
X_red_res_train1, X_red_res_test1, y_res_train1, y_red_red_test1 = train_test_split(X_red_res,y_res, test_size = 0.20, random_state = 109)
model6_logit = sm.Logit(y_res_train1, X_red_res_train1).fit()
print(model6_logit.summary())

Optimization terminated successfully.
         Current function value: 0.442541
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                22641
Model:                          Logit   Df Residuals:                    22635
Method:                           MLE   Df Model:                            5
Date:                Tue, 21 Jan 2020   Pseudo R-squ.:                  0.3615
Time:                        10:22:07   Log-Likelihood:                -10020.
converged:                       True   LL-Null:                       -15694.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
source          -0.0155      0.019     -0.803      0.422      -0.053       0.022
browser          0.002

In [51]:
print(metrics.confusion_matrix(y_res,predicted))

[[13328   823]
 [ 4408  9743]]


In [52]:
print(metrics.accuracy_score(y_res, predicted))

0.8151720726450428


In [53]:
print(metrics.precision_score(y_res, predicted))

0.9221086503880371


In [54]:
print(metrics.recall_score(y_res, predicted))

0.6885025793230161


In [55]:
print(metrics.f1_score(y_res, predicted))

0.7883642836913864


## Model 7 - reuse model  6 and add (age*age)

In [56]:
X_itr = np.c_[X_red_res, X_res['age'] * X_res['age'] ]

In [57]:
model7 = LogisticRegressionCV(cv =3, random_state = 109).fit(X_itr,y_res)
predicted = model7.predict(X_itr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [58]:
print(metrics.confusion_matrix(y_res,predicted))

[[13505   646]
 [ 4904  9247]]


In [59]:
print(metrics.accuracy_score(y_res, predicted))

0.8039007843968624


In [60]:
print(metrics.precision_score(y_res, predicted))

0.9347013039522895


In [61]:
print(metrics.recall_score(y_res, predicted))

0.6534520528584552


In [62]:
print(metrics.f1_score(y_res, predicted))

0.7691731824987523


## Summary

Among all the logit model, Model 6 performs the best. 

# Decision Tree 

In [63]:
# import the regressor 
from sklearn import tree
# import export_graphviz 
from sklearn.tree import export_graphviz  
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_regression
from sklearn.utils import check_array

## Model 8 - decision tree with full model 

In [64]:
# create a regressor object 
clf = tree.DecisionTreeClassifier(class_weight=None, criterion='gini',  max_features=None, max_leaf_nodes=20, min_samples_leaf=10, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=100, splitter='best')
# you don't want to grow the tree too much, so you need to set max_depth 


In [65]:
X_dct = X.iloc[:,:-1]
X_dct_train, X_dct_test, y_train,y_test = train_test_split(X_dct,y,test_size = 0.2, random_state = 109)

In [66]:
tree = clf.fit(X_dct_train,y_train)
y_prep = tree.predict(X_dct_test)



In [67]:
print(metrics.confusion_matrix(y_test,y_prep))

[[27348    10]
 [ 1283  1582]]


In [68]:
print(metrics.accuracy_score(y_test,y_prep))

0.9572180127717301


In [69]:
print(metrics.precision_score(y_test,y_prep))

0.9937185929648241


In [70]:
print(metrics.recall_score(y_test,y_prep))

0.5521815008726003


In [71]:
print(metrics.f1_score(y_test,y_prep))

0.7098945479021763


## model 9 _ decision tree with resampling dataset 

In [72]:
X_dct_res = X_res.iloc[:,:-1]

In [73]:
X_dct_res_train, X_dct_res_test,  y_res_train,y_res_test = train_test_split(X_dct_res,y_res,test_size = 0.2, random_state = 109)

In [74]:
model9 = clf.fit(X_dct_res_train,y_res_train)
y_prep = model9.predict(X_dct_res_test)



In [75]:
print(metrics.confusion_matrix(y_res_test,y_prep))
print(metrics.accuracy_score(y_res_test,y_prep))
print(metrics.precision_score(y_res_test,y_prep))
print(metrics.recall_score(y_res_test,y_prep))
print(metrics.f1_score(y_res_test,y_prep))

[[2650  178]
 [ 793 2040]]
0.8284755343578873
0.9197475202885482
0.7200847158489234
0.8077608394377351


## model 10 _ decision tree with resampling dataset, 5 variables. 

In [76]:
X_dct_red_res = X_red_res.iloc[:,:-1]
X_dct_red_res_train, X_dct_red_res_test,  y_res_train,y_res_test = train_test_split(X_dct_red_res,y_res,test_size = 0.2, random_state = 109)
model10= clf.fit(X_dct_red_res_train,y_res_train)
predicted = model10.predict(X_dct_red_res_test)



In [77]:
print(metrics.confusion_matrix(y_res_test,y_prep))
print(metrics.accuracy_score(y_res_test,y_prep))
print(metrics.precision_score(y_res_test,y_prep))
print(metrics.recall_score(y_res_test,y_prep))
print(metrics.f1_score(y_res_test,y_prep))

[[2650  178]
 [ 793 2040]]
0.8284755343578873
0.9197475202885482
0.7200847158489234
0.8077608394377351


Because Model 10 and Model 9 produce the same result, we can summarize that other reduced variables are not significant. 

In [78]:
from sklearn.tree import export_graphviz  
export_graphviz(model10, out_file ='decision_tree.dot',feature_names=X_dct_red_res.columns) 

# Summary

Comparing model6 and model10, we find that decision tree model performs better with the same features, as it provides both higher accuracy rate and F1-score.(Accuracy rate  = 82.8%, F1-score = 80.8%). 

# Conclusion 

Based on the decision tree graph from model 10, we found that if a user make purchases at least twice right after he/she sign up in the website, this activity is always fruadulent. In our decision tree model, we found when same device used for at least 2 times and duration between sign up and purchase is less than 0.022 days, 6067 activities are 100% fruadulent with zero gini index. The decision tree also shows that device count and duration between sign up and purchase activities is the two most significant variables to determine an fraudulent activity. 