In [None]:
#talk about hw, https://www.quora.com/Why-is-AUC-Area-under-ROC-insensitive-to-class-distribution-changes 
#https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/
#ensemble,
#https://www.quora.com/What-will-happen-to-AUC-if-I-switch-the-positive-and-negative-classes-in-the-test-data
#https://medium.com/coinmonks/practical-tips-for-class-imbalance-in-binary-classification-6ee29bcdb8a7
# https://towardsdatascience.com/fine-tuning-a-classifier-in-scikit-learn-66e048c21e65
#http://rikunert.com/SMOTE_explained

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score,  confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix


# pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Import and store dataset
fraud_data = pd.read_csv('imbalancedFraudDF.csv')
ipToCountry = pd.read_csv('IpAddress_to_Country.csv')

In [2]:
fraud_data['class'].value_counts()

0    136961
1      1415
Name: class, dtype: int64

In [3]:
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
3,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0
4,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0


In [4]:
fraud_data.dtypes

user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
dtype: object

In [3]:
#join country info based on ip_address
start = time.time()

countries = []
for i in range(len(fraud_data)):
    ip_address = fraud_data.loc[i, 'ip_address']
    tmp = ipToCountry[(ipToCountry['lower_bound_ip_address'] <= ip_address) &
                    (ipToCountry['upper_bound_ip_address'] >= ip_address)]
    if len(tmp) == 1:
        countries.append(tmp['country'].values[0])
    else:
        countries.append('NA')
        
fraud_data['country'] = countries
print "Lookup took", time.time() - start, "seconds."



Lookup took 209.881716967 seconds.


# Q: How to check if column user_id is unique(no dup)?


In [5]:
print fraud_data.user_id.nunique()#138289
print fraud_data.count()#138376
#most of the user_id has only the first 1 transaction, difficult to do time based aggregates, 
#e.g. amount in past 1 day for the user

138289
user_id           138376
signup_time       138376
purchase_time     138376
purchase_value    138376
device_id         138376
source            138376
browser           138376
sex               138376
age               138376
ip_address        138376
class             138376
country           118096
dtype: int64


In [4]:
fraud_data = pd.read_csv('imbalancedFraudDFwithCountry.csv')

## Feature Engineering

In [6]:
#time related features: can be done before split, as they has no interaction between other rows
fraud_data['interval_after_signup'] = (pd.to_datetime(fraud_data['purchase_time']) - pd.to_datetime(
        fraud_data['signup_time'])).dt.total_seconds()

fraud_data['signup_days_of_year'] = pd.DatetimeIndex(fraud_data['signup_time']).dayofyear

fraud_data['signup_seconds_of_day'] = pd.DatetimeIndex(fraud_data['signup_time']).second + 60 * pd.DatetimeIndex(
    fraud_data['signup_time']).minute + 3600 * pd.DatetimeIndex(fraud_data['signup_time']).hour

fraud_data['purchase_days_of_year'] = pd.DatetimeIndex(fraud_data['purchase_time']).dayofyear
fraud_data['purchase_seconds_of_day'] = pd.DatetimeIndex(fraud_data['purchase_time']).second + 60 * pd.DatetimeIndex(
    fraud_data['purchase_time']).minute + 3600 * pd.DatetimeIndex(fraud_data['purchase_time']).hour



In [7]:
print fraud_data.source.value_counts()

SEO       55747
Ads       54911
Direct    27718
Name: source, dtype: int64


# Q: what now? train/test split? convert categorical features e.g. device_id to numericals? or smote?


In [8]:
# convert categorical features to numericals
fraud_data = pd.get_dummies(fraud_data, columns=['source', 'browser'])#need to drop ['source', 'browser']? no, auto dropped by get_dummies 
fraud_data['sex'] = (fraud_data.sex == 'M').astype(int)

fraudDF = fraud_data.drop(['user_id','signup_time','purchase_time'], axis=1)

# non-missing counts in each features
#fraudDF.apply(lambda x: x.count(), axis=0)

fraudDF.head()




Unnamed: 0,purchase_value,device_id,sex,age,ip_address,class,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari
0,34,QVPSPJUOCKZAR,1,39,732758400.0,0,Japan,4506682.0,55,82549,108,10031,0,0,1,1,0,0,0,0
1,16,EOGFQPIZPYXFZ,0,53,350311400.0,0,United States,17944.0,158,74390,159,5934,1,0,0,1,0,0,0,0
2,44,ATGTXKYKUDUQN,1,41,3840542000.0,0,,492085.0,118,76405,124,50090,0,0,1,0,0,0,0,1
3,39,NAUITBZFJKHWW,1,45,415583100.0,0,United States,4361461.0,202,25792,252,67253,1,0,0,0,0,0,0,1
4,42,ALEYXFXINSXLZ,1,18,2809315000.0,0,Canada,4240931.0,141,21783,190,29114,1,0,0,1,0,0,0,0


In [9]:


y = fraudDF['class']
X = fraudDF.drop(['class'], axis=1)

#split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)


('X_train.shape:', (110700, 19))
('y_train.shape:', (110700,))


In [10]:
X_train.head()

Unnamed: 0,purchase_value,device_id,sex,age,ip_address,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari
29343,12,OULPAZAFRFPXP,1,42,3690922000.0,Korea Republic of,3499664.0,183,67384,224,24648,1,0,0,1,0,0,0,0
12190,10,AIIWMFEYQQIEB,1,29,1686759000.0,United States,6766039.0,5,78146,84,18585,1,0,0,0,0,0,1,0
19388,34,VUVETBUPCIWJE,1,53,4138429000.0,,5870515.0,197,81354,265,76669,0,1,0,1,0,0,0,0
89104,48,QCFULAJOYKFUU,1,29,96173370.0,France,2145618.0,160,30920,185,16538,1,0,0,1,0,0,0,0
82082,44,IHRWLMIJMEEEU,1,24,1936025000.0,China,7079059.0,111,71897,193,66156,1,0,0,0,1,0,0,0


In [11]:
#converting needs to be done after split, and needs to be standalized/normalized, otherwise, 
#train count range for x: [1000, 5000] count, while test range [100, 500], different distribution :(

# the more a device is shared, the more suspicious
X_train['n_dev_shared'] = X_train.device_id.map(X_train.device_id.value_counts(dropna=False))#without dropna=False will produce nan in this col

# the more a ip is shared, the more suspicious
X_train['n_ip_shared'] = X_train.ip_address.map(X_train.ip_address.value_counts(dropna=False))

# the less visit from a country, the more suspicious
X_train['n_country_shared'] = X_train.country.map(X_train.country.value_counts(dropna=False))#lots of NAs in country column

X_train = X_train.drop(['device_id','ip_address','country'], axis=1)


# the more a device is shared, the more suspicious
X_test['n_dev_shared'] = X_test.device_id.map(X_test.device_id.value_counts(dropna=False))

# the more a ip is shared, the more suspicious
X_test['n_ip_shared'] = X_test.ip_address.map(X_test.ip_address.value_counts(dropna=False))

# the less visit from a country, the more suspicious
X_test['n_country_shared'] = X_test.country.map(X_test.country.value_counts(dropna=False))

X_test = X_test.drop(['device_id','ip_address','country'], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-d

# Q: What is the difference between normalization and standardization?


In [12]:


# normalize to [0,1], standardize to normal, mu=0, can < 0, so we do normalize here

min_max_scaler = preprocessing.MinMaxScaler()
#fit and transform the training data and use them for the model training
X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']] = min_max_scaler.fit_transform(X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])


#after the model training on the transformed training data define the testing data df_test

#before the prediction of the test data, apply the same scaler obtained from above on X_test
X_test[['n_dev_shared', 'n_ip_shared', 'n_country_shared']] = min_max_scaler.fit_transform(X_test[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])


  return self.partial_fit(X, y)


In [13]:
X_train.n_dev_shared.value_counts(dropna=False)

0.0    105373
0.2      4756
0.4       375
0.6       132
0.8        40
1.0        24
Name: n_dev_shared, dtype: int64

In [14]:
X_test.n_dev_shared.value_counts(dropna=False)

0.000000    27310
0.333333      350
0.666667       12
1.000000        4
Name: n_dev_shared, dtype: int64

In [29]:


classifier_RF = RandomForestClassifier(random_state=0)

classifier_RF.fit(X_train, y_train)

# predict class labels for the test set
predicted = classifier_RF.predict(X_test)

# generate class probabilities
probs = classifier_RF.predict_proba(X_test)


# generate evaluation metrics

print("\t%s: %r" % ("accuracy_score is: ", accuracy_score(y_test, predicted)))
# print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score((y_test == '1').astype(int), probs[:, 1])))#string to int
# print("\t%s: %r" % ("f1_score is: ", f1_score((y_test == '1').astype(int), (predicted == '1').astype(int) )))#string to int
print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, probs[:, 1])))
print("\t%s: %r" % ("f1_score is: ", f1_score(y_test, predicted )))#string to int

print ("confusion_matrix is: ")
cm = confusion_matrix(y_test, predicted)
print cm
print 'recall =',float(cm[1,1])/(cm[1,0]+cm[1,1])
print 'precision =', float(cm[1,1])/(cm[1,1] + cm[0,1])#1.0

	accuracy_score is: : 0.9957002457002457
	roc_auc_score is: : 0.8034143644483027
	f1_score is: : 0.7384615384615384
confusion_matrix is: 
[[27389     0]
 [  119   168]]
recall = 0.585365853659
precision = 1.0


In [50]:
classifier_RF5 = RandomForestClassifier(random_state=0)
classifier_RF5

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [30]:
classifier_RF

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=5, verbose=0, warm_start=False)

In [None]:
RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

# SMOTE

In [16]:

#https://beckernick.github.io/oversampling-modeling/

#Wrong way to sampling: smote before split train/test, then test data does not reflect true distribution in reality, 
#and “bleed” information from the test set into the training of the model. overfit! think about the case of simple 
#oversampling (where I just duplicate observations). If I upsample a dataset before splitting it into a train and 
#validation set, I could end up with the same observation in both datasets

#https://imbalanced-learn.org/en/stable/install.html

# Install
# imbalanced-learn is currently available on the PyPi’s reporitories and you can install it via pip:

# pip install -U imbalanced-learn

#oversampling on only the training data, the right way!
sm = SMOTE(random_state=12)
x_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)


unique, counts = np.unique(y_train_sm, return_counts=True)

print np.asarray((unique, counts)).T

[[     0 109572]
 [     1 109572]]


In [17]:
#RF on smoted training data
classifier_RF_sm = RandomForestClassifier(random_state=0)

classifier_RF_sm.fit(x_train_sm, y_train_sm)

# predict class labels for the test set
predicted_sm = classifier_RF_sm.predict(X_test)

# generate class probabilities
probs_sm = classifier_RF_sm.predict_proba(X_test)


# generate evaluation metrics

print("\t%s: %r" % ("accuracy_score_sm is: ", accuracy_score(y_test, predicted_sm)))
# print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score((y_test == '1').astype(int), probs[:, 1])))#string to int
# print("\t%s: %r" % ("f1_score is: ", f1_score((y_test == '1').astype(int), (predicted == '1').astype(int) )))#string to int
print("\t%s: %r" % ("roc_auc_score_sm is: ", roc_auc_score(y_test, probs_sm[:, 1])))
print("\t%s: %r" % ("f1_score_sm is: ", f1_score(y_test, predicted_sm )))#string to int

print ("confusion_matrix_sm is: ")
cm_sm = confusion_matrix(y_test, predicted_sm)
print cm_sm
print 'recall or sens_sm =',float(cm_sm[1,1])/(cm_sm[1,0]+cm_sm[1,1])
print 'precision_sm =', float(cm_sm[1,1])/(cm_sm[1,1] + cm_sm[0,1])



	accuracy_score_sm is: : 0.995085995085995
	roc_auc_score_sm is: : 0.8022659215028591
	f1_score_sm is: : 0.6909090909090909
confusion_matrix_sm is: 
[[27388     1]
 [  135   152]]
recall or sens_sm = 0.529616724739
precision_sm = 0.993464052288


In [19]:

#2nd column is predicted 1s
unique, counts = np.unique(predicted_sm, return_counts=True)

print np.asarray((unique, counts)).T

[[    0 27523]
 [    1   153]]


In [20]:
#2nd row is true 1s y_test
unique, counts = np.unique(y_test, return_counts=True)

print np.asarray((unique, counts)).T

[[    0 27389]
 [    1   287]]


In [21]:
# from sklearn.grid_search import GridSearchCV
#https://stackoverflow.com/questions/40615021/cannot-import-sklearn-model-selection-in-scikit-learn
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import GridSearchCV
def print_grid_search_metrics(gs):
    print "Best score: %0.3f" % gs.best_score_
    print "Best parameters set:"
    best_parameters = gs.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

0.20.2


In [74]:
parameters = {        
'max_depth': [None, 5, 15],
'n_estimators' :  [10,150],
'class_weight' : [{0: 1, 1: w} for w in [0.2, 1, 50, 100]]
}


scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score, pos_label=1)
}
clf = RandomForestClassifier(random_state=0)



In [70]:
def grid_search_wrapper(refit_score='f1_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization(refit on the best model according to refit_score)
    prints classifier performance metrics
    """
#     skf = StratifiedKFold(n_splits=10)
#     grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
#                            cv=skf, return_train_score=True, n_jobs=-1)
    grid_search = GridSearchCV(clf, parameters, scoring=scorers, refit=refit_score,
                           cv=3, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # make the predictions
    y_pred = grid_search.predict(X_test)
    y_prob = grid_search.predict_proba(X_test)[:, 1]
    
    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    cm = confusion_matrix(y_test, y_pred)
    cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['0', '1'])
    print(cmDF)
    
    print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, y_prob)))
    print("\t%s: %r" % ("f1_score is: ", f1_score(y_test, y_pred)))#string to int

    print 'recall = ', float(cm[1,1]) / (cm[1,0] + cm[1,1])
    print 'precision = ', float(cm[1,1]) / (cm[1, 1] + cm[0,1])

    return grid_search


In [80]:
grid_search_rf_f1 = grid_search_wrapper(refit_score='f1_score')


Best params for f1_score
{'n_estimators': 150, 'max_depth': None, 'class_weight': {0: 1, 1: 0.2}}

Confusion matrix of Random Forest optimized for f1_score on the test data:
   pred_0  pred_1
0   27389       0
1     117     170
	roc_auc_score is: : 0.8081450843143492
	f1_score is: : 0.7439824945295405
recall =  0.592334494774
precision =  1.0


In [81]:

best_rf_model_f1 = grid_search_rf_f1.best_estimator_
best_rf_model_f1

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 0.2},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [82]:
pd.DataFrame(best_rf_model_f1.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

Unnamed: 0,importance
interval_after_signup,0.405883
purchase_days_of_year,0.146444
signup_seconds_of_day,0.072164
purchase_seconds_of_day,0.070753
signup_days_of_year,0.061146
n_ip_shared,0.060882
n_dev_shared,0.040845
purchase_value,0.038808
age,0.033563
n_country_shared,0.02652


In [63]:
results_f1 = pd.DataFrame(grid_search_rf_f1.cv_results_)
# results_sortrecall = results_f1.sort_values(by='mean_test_recall_score', ascending=False)
results_sortf1 = results_f1.sort_values(by='mean_test_f1_score', ascending=False)
results_sortf1.head(100)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_f1_score,mean_test_precision_score,mean_test_recall_score,mean_train_f1_score,mean_train_precision_score,mean_train_recall_score,param_class_weight,param_max_depth,param_n_estimators,params,rank_test_f1_score,rank_test_precision_score,rank_test_recall_score,split0_test_f1_score,split0_test_precision_score,split0_test_recall_score,split0_train_f1_score,split0_train_precision_score,split0_train_recall_score,split1_test_f1_score,split1_test_precision_score,split1_test_recall_score,split1_train_f1_score,split1_train_precision_score,split1_train_recall_score,split2_test_f1_score,split2_test_precision_score,split2_test_recall_score,split2_train_f1_score,split2_train_precision_score,split2_train_recall_score,std_fit_time,std_score_time,std_test_f1_score,std_test_precision_score,std_test_recall_score,std_train_f1_score,std_train_precision_score,std_train_recall_score
1,31.606647,2.847271,0.731693,1.0,0.577128,1.0,1.0,1.0,"{0: 1, 1: 0.2}",,150,"{u'n_estimators': 150, u'max_depth': None, u'c...",1,1,3,0.721088,1.0,0.56383,1.0,1.0,1.0,0.752902,1.0,0.603723,1.0,1.0,1.0,0.721088,1.0,0.56383,1.0,1.0,1.0,0.277702,0.059518,0.014997,0.0,0.018806,0.0,0.0,0.0
5,28.102901,2.477965,0.73031,1.0,0.575355,0.761106,1.0,0.614362,"{0: 1, 1: 0.2}",15.0,150,"{u'n_estimators': 150, u'max_depth': 15, u'cla...",2,1,6,0.721088,1.0,0.56383,0.76112,1.0,0.614362,0.748752,1.0,0.598404,0.755997,1.0,0.607713,0.721088,1.0,0.56383,0.766202,1.0,0.621011,0.790251,0.134821,0.013041,0.0,0.016299,0.004166,0.0,0.005429
0,1.163817,0.156864,0.73011,0.993948,0.577128,0.934585,1.0,0.877216,"{0: 1, 1: 0.2}",,10,"{u'n_estimators': 10, u'max_depth': None, u'cl...",3,12,3,0.719864,0.995305,0.56383,0.938603,1.0,0.884309,0.746269,0.991189,0.598404,0.932576,1.0,0.87367,0.724196,0.995349,0.569149,0.932576,1.0,0.87367,0.110805,0.004152,0.011562,0.001951,0.015201,0.002841,0.0,0.005015
7,30.6636,3.149158,0.729902,0.998435,0.575355,1.0,1.0,1.0,"{0: 1, 1: 1}",,150,"{u'n_estimators': 150, u'max_depth': None, u'c...",4,7,6,0.719864,0.995305,0.56383,1.0,1.0,1.0,0.748752,1.0,0.598404,1.0,1.0,1.0,0.721088,1.0,0.56383,1.0,1.0,1.0,0.594302,0.457916,0.013339,0.002213,0.016299,0.0,0.0,0.0
4,2.016783,0.220595,0.729004,0.992479,0.576241,0.778741,1.0,0.637855,"{0: 1, 1: 0.2}",15.0,10,"{u'n_estimators': 10, u'max_depth': 15, u'clas...",5,14,5,0.724196,0.995349,0.569149,0.797762,1.0,0.663564,0.742952,0.986784,0.595745,0.769231,1.0,0.625,0.719864,0.995305,0.56383,0.769231,1.0,0.625,0.065776,0.007681,0.01002,0.004027,0.013961,0.01345,0.0,0.018179
11,30.103354,2.665267,0.728447,0.998435,0.573582,0.768501,1.0,0.624113,"{0: 1, 1: 1}",15.0,150,"{u'n_estimators': 150, u'max_depth': 15, u'cla...",6,7,8,0.719864,0.995305,0.56383,0.771242,1.0,0.62766,0.748752,1.0,0.598404,0.757025,1.0,0.609043,0.716724,1.0,0.558511,0.777236,1.0,0.635638,0.308259,0.083996,0.014415,0.002213,0.017686,0.008476,0.0,0.011143
6,2.31062,0.252063,0.727821,0.998498,0.572695,0.934323,1.0,0.876773,"{0: 1, 1: 1}",,10,"{u'n_estimators': 10, u'max_depth': None, u'cl...",7,6,9,0.725424,1.0,0.569149,0.940099,1.0,0.886968,0.73913,0.995495,0.587766,0.933333,1.0,0.875,0.71891,1.0,0.56117,0.929537,1.0,0.868351,0.022338,0.019209,0.008427,0.002123,0.011143,0.004368,0.0,0.007703
10,1.929117,0.211867,0.724455,0.996932,0.569149,0.780487,1.0,0.640071,"{0: 1, 1: 1}",15.0,10,"{u'n_estimators': 10, u'max_depth': 15, u'clas...",8,10,11,0.715503,0.995261,0.558511,0.784155,1.0,0.644947,0.743333,0.995536,0.593085,0.769231,1.0,0.625,0.71453,1.0,0.555851,0.788074,1.0,0.650266,0.014316,0.011516,0.013355,0.002172,0.01696,0.008118,0.0,0.010876
13,29.623303,2.710693,0.724198,0.99842,0.568262,1.0,1.0,1.0,"{0: 1, 1: 100}",,150,"{u'n_estimators': 150, u'max_depth': None, u'c...",9,9,13,0.715503,0.995261,0.558511,1.0,1.0,1.0,0.740369,1.0,0.587766,1.0,1.0,1.0,0.716724,1.0,0.558511,1.0,1.0,1.0,0.187939,0.020311,0.011445,0.002234,0.013791,0.0,0.0,0.0
12,1.663757,0.218581,0.716382,0.996732,0.559397,0.935091,1.0,0.878103,"{0: 1, 1: 100}",,10,"{u'n_estimators': 10, u'max_depth': None, u'cl...",10,11,14,0.696552,0.990196,0.537234,0.937853,1.0,0.882979,0.742475,1.0,0.590426,0.934844,1.0,0.87766,0.71012,1.0,0.550532,0.932576,1.0,0.87367,0.02152,0.005309,0.019264,0.004622,0.022602,0.002161,0.0,0.003813


In [64]:
results_sortf1.shape

(18, 41)

In [75]:
grid_search_rf_recall = grid_search_wrapper(refit_score='recall_score')
results_recall = pd.DataFrame(grid_search_rf_recall.cv_results_)
results_sortrecall = results_recall.sort_values(by='mean_test_recall_score', ascending=False)
# results_sortf1 = results_f1.sort_values(by='mean_test_f1_score', ascending=False)
results_sortrecall.head(100)
#recall is worse than default rf?? no this is on test, but train recall is better

Best params for recall_score
{'n_estimators': 150, 'max_depth': 5, 'class_weight': {0: 1, 1: 100}}

Confusion matrix of Random Forest optimized for recall_score on the test data:
   pred_0  pred_1
0   27137     252
1     123     164
	roc_auc_score is: : 0.8152250394783226
	f1_score is: : 0.4665718349928876
recall =  0.571428571429
precision =  0.394230769231


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_f1_score,mean_test_precision_score,mean_test_recall_score,mean_train_f1_score,mean_train_precision_score,mean_train_recall_score,param_class_weight,param_max_depth,param_n_estimators,params,rank_test_f1_score,rank_test_precision_score,rank_test_recall_score,split0_test_f1_score,split0_test_precision_score,split0_test_recall_score,split0_train_f1_score,split0_train_precision_score,split0_train_recall_score,split1_test_f1_score,split1_test_precision_score,split1_test_recall_score,split1_train_f1_score,split1_train_precision_score,split1_train_recall_score,split2_test_f1_score,split2_test_precision_score,split2_test_recall_score,split2_train_f1_score,split2_train_precision_score,split2_train_recall_score,std_fit_time,std_score_time,std_test_f1_score,std_test_precision_score,std_test_recall_score,std_train_f1_score,std_train_precision_score,std_train_recall_score
15,11.667655,1.425179,0.265384,0.166135,0.662234,0.271299,0.169551,0.679078,"{0: 1, 1: 100}",5.0,150,"{u'n_estimators': 150, u'max_depth': 5, u'clas...",18,18,1,0.251777,0.155583,0.659574,0.266181,0.165428,0.680851,0.260646,0.161475,0.675532,0.264997,0.164828,0.675532,0.283729,0.181347,0.651596,0.282717,0.178397,0.680851,0.58732,0.059158,0.013468,0.011022,0.009951,0.008088,0.00626,0.002507
14,0.90345,0.156921,0.295,0.193372,0.649823,0.299348,0.194968,0.663121,"{0: 1, 1: 100}",5.0,10,"{u'n_estimators': 10, u'max_depth': 5, u'class...",17,17,2,0.248283,0.152226,0.672872,0.255362,0.157152,0.680851,0.285387,0.181885,0.662234,0.292108,0.187595,0.659574,0.351331,0.246006,0.614362,0.350575,0.240157,0.648936,0.015403,0.014813,0.042615,0.039138,0.025448,0.039206,0.034286,0.013268
0,1.359088,0.198078,0.73011,0.993948,0.577128,0.934585,1.0,0.877216,"{0: 1, 1: 0.2}",,10,"{u'n_estimators': 10, u'max_depth': None, u'cl...",3,12,3,0.719864,0.995305,0.56383,0.938603,1.0,0.884309,0.746269,0.991189,0.598404,0.932576,1.0,0.87367,0.724196,0.995349,0.569149,0.932576,1.0,0.87367,0.083948,0.011483,0.011562,0.001951,0.015201,0.002841,0.0,0.005015
1,30.390795,2.997346,0.731693,1.0,0.577128,1.0,1.0,1.0,"{0: 1, 1: 0.2}",,150,"{u'n_estimators': 150, u'max_depth': None, u'c...",1,1,3,0.721088,1.0,0.56383,1.0,1.0,1.0,0.752902,1.0,0.603723,1.0,1.0,1.0,0.721088,1.0,0.56383,1.0,1.0,1.0,0.831439,0.088937,0.014997,0.0,0.018806,0.0,0.0,0.0
4,1.984478,0.211391,0.729004,0.992479,0.576241,0.778741,1.0,0.637855,"{0: 1, 1: 0.2}",15.0,10,"{u'n_estimators': 10, u'max_depth': 15, u'clas...",5,14,5,0.724196,0.995349,0.569149,0.797762,1.0,0.663564,0.742952,0.986784,0.595745,0.769231,1.0,0.625,0.719864,0.995305,0.56383,0.769231,1.0,0.625,0.061699,0.002563,0.01002,0.004027,0.013961,0.01345,0.0,0.018179
5,26.715077,2.510168,0.73031,1.0,0.575355,0.761106,1.0,0.614362,"{0: 1, 1: 0.2}",15.0,150,"{u'n_estimators': 150, u'max_depth': 15, u'cla...",2,1,6,0.721088,1.0,0.56383,0.76112,1.0,0.614362,0.748752,1.0,0.598404,0.755997,1.0,0.607713,0.721088,1.0,0.56383,0.766202,1.0,0.621011,0.484985,0.099954,0.013041,0.0,0.016299,0.004166,0.0,0.005429
7,30.151817,3.006246,0.729902,0.998435,0.575355,1.0,1.0,1.0,"{0: 1, 1: 1}",,150,"{u'n_estimators': 150, u'max_depth': None, u'c...",4,7,6,0.719864,0.995305,0.56383,1.0,1.0,1.0,0.748752,1.0,0.598404,1.0,1.0,1.0,0.721088,1.0,0.56383,1.0,1.0,1.0,0.728879,0.096646,0.013339,0.002213,0.016299,0.0,0.0,0.0
11,28.038546,2.512766,0.728447,0.998435,0.573582,0.768501,1.0,0.624113,"{0: 1, 1: 1}",15.0,150,"{u'n_estimators': 150, u'max_depth': 15, u'cla...",6,7,8,0.719864,0.995305,0.56383,0.771242,1.0,0.62766,0.748752,1.0,0.598404,0.757025,1.0,0.609043,0.716724,1.0,0.558511,0.777236,1.0,0.635638,0.269197,0.037151,0.014415,0.002213,0.017686,0.008476,0.0,0.011143
6,2.045982,0.225805,0.727821,0.998498,0.572695,0.934323,1.0,0.876773,"{0: 1, 1: 1}",,10,"{u'n_estimators': 10, u'max_depth': None, u'cl...",7,6,9,0.725424,1.0,0.569149,0.940099,1.0,0.886968,0.73913,0.995495,0.587766,0.933333,1.0,0.875,0.71891,1.0,0.56117,0.929537,1.0,0.868351,0.005071,0.005666,0.008427,0.002123,0.011143,0.004368,0.0,0.007703
17,15.845948,1.494833,0.71388,0.953491,0.570922,0.911244,0.98355,0.848848,"{0: 1, 1: 100}",15.0,150,"{u'n_estimators': 150, u'max_depth': 15, u'cla...",11,15,10,0.700997,0.933628,0.56117,0.899857,0.973684,0.836436,0.727569,0.940928,0.593085,0.913105,0.983129,0.852394,0.713073,0.985915,0.558511,0.920771,0.993837,0.857713,0.720479,0.005477,0.010863,0.023121,0.015709,0.008639,0.008233,0.009041


# Optimized by f1

In [40]:


Grid_RF_f1 = GridSearchCV(RandomForestClassifier(random_state=0),parameters,n_jobs=-1,
                       verbose=2, scoring='f1', cv=3)
Grid_RF_f1.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  2.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [10, 150], 'max_depth': [None, 5, 15], 'class_weight': [{0: 1, 1: 0.2}, {0: 1, 1: 1}, {0: 1, 1: 100}]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=2)

In [41]:
print_grid_search_metrics(Grid_RF_f1)

Best score: 0.732
Best parameters set:
	class_weight: {0: 1, 1: 0.2}
	max_depth: None
	n_estimators: 150


In [52]:
X_train.shape

(110700, 19)

In [51]:


# predict class labels for the train set
predictedBest_f1_train = best_RF_model_f1.predict(X_train)

# generate class probabilities
probsBest_f1_train = best_RF_model_f1.predict_proba(X_train)


# generate evaluation metrics

print("\t%s: %r" % ("best accuracy_score_f1_train is: ", accuracy_score(y_train, predictedBest_f1_train)))
print("\t%s: %r" % ("best roc_auc_score_f1_train is: ", roc_auc_score(y_train, probsBest_f1_train[:, 1])))
print("\t%s: %r" % ("best f1_score_f1_train is: ", f1_score(y_train, predictedBest_f1_train )))


print ("best confusion_matrix_train by f1 is: ")
bestcm_f1_train = confusion_matrix(y_train, predictedBest_f1_train)
print bestcm_f1_train
print 'best recall_train by f1 =',float(bestcm_f1_train[1,1])/(bestcm_f1_train[1,0]+bestcm_f1_train[1,1])
print 'best precision_train by f1 =', float(bestcm_f1_train[1,1])/(bestcm_f1_train[1,1] + bestcm_f1_train[0,1])

	best accuracy_score_f1_train is: : 0.9987172538392051
	best roc_auc_score_f1_train is: : 0.9999898662765997
	best f1_score_f1_train is: : 0.9328287606433302
best confusion_matrix_train by f1 is: 
[[109572      0]
 [   142    986]]
best recall_train by f1 = 0.874113475177
best precision_train by f1 = 1.0


In [33]:
best_RF_model_f1 = Grid_RF_f1.best_estimator_
best_RF_model_f1

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 0.2},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [None]:
pd.DataFrame(best_RF_model_f1.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

In [42]:


# predict class labels for the test set
predictedBest_f1 = best_RF_model_f1.predict(X_test)

# generate class probabilities
probsBest_f1 = best_RF_model_f1.predict_proba(X_test)


# generate evaluation metrics

print("\t%s: %r" % ("best accuracy_score_f1 is: ", accuracy_score(y_test, predictedBest_f1)))
print("\t%s: %r" % ("best roc_auc_score_f1 is: ", roc_auc_score(y_test, probsBest_f1[:, 1])))
print("\t%s: %r" % ("best f1_score_f1 is: ", f1_score(y_test, predictedBest_f1 )))


print ("best confusion_matrix by f1 is: ")
bestcm_f1 = confusion_matrix(y_test, predictedBest_f1)
print bestcm_f1
print 'best recall by f1 =',float(bestcm_f1[1,1])/(bestcm_f1[1,0]+bestcm_f1[1,1])
print 'best precision by f1 =', float(bestcm_f1[1,1])/(bestcm_f1[1,1] + bestcm_f1[0,1])

	best accuracy_score_f1 is: : 0.9957002457002457
	best roc_auc_score_f1 is: : 0.7967131314830098
	best f1_score_f1 is: : 0.7384615384615384
best confusion_matrix by f1 is: 
[[27389     0]
 [  119   168]]
best recall by f1 = 0.585365853659
best precision by f1 = 1.0


# Optimized by recall in GridSearchCV

In [None]:
#Assumption, manager want to optimize recall(capture fraud), regardless of precision

In [43]:

Grid_RFz_recall = GridSearchCV(RandomForestClassifier(random_state=0),parameters,n_jobs=-1,
                       verbose=2, scoring='recall', cv=3)
Grid_RFz_recall.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  2.0min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [10, 150], 'max_depth': [None, 5, 15], 'class_weight': [{0: 1, 1: 0.2}, {0: 1, 1: 1}, {0: 1, 1: 100}]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=2)

In [44]:
print_grid_search_metrics(Grid_RFz_recall)


Best score: 0.662
Best parameters set:
	class_weight: {0: 1, 1: 100}
	max_depth: 5
	n_estimators: 150


In [46]:
best_RF_model_recall = Grid_RFz_recall.best_estimator_
best_RF_model_recall

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [47]:
# predict class labels for the test set
predictedBest_recall = best_RF_model_recall.predict(X_test)

# generate class probabilities
probsBest_recall = best_RF_model_recall.predict_proba(X_test)


# generate evaluation metrics

print("\t%s: %r" % ("best accuracy_score_recall is: ", accuracy_score(y_test, predictedBest_recall)))
print("\t%s: %r" % ("best roc_auc_score_recall is: ", roc_auc_score(y_test, probsBest_recall[:, 1])))
print("\t%s: %r" % ("best f1_score_recall is: ", f1_score(y_test, predictedBest_recall )))




print ("best confusion_matrix by recall is: ")
bestcm_recall = confusion_matrix(y_test, predictedBest_recall)
print bestcm_recall
print 'best recall optimized by recall =',float(bestcm_recall[1,1])/(bestcm_recall[1,0]+bestcm_recall[1,1])
print 'best precision optimized by recall =', float(bestcm_recall[1,1])/(bestcm_recall[1,1] + bestcm_recall[0,1])

	best accuracy_score_recall is: : 0.9864503540974129
	best roc_auc_score_recall is: : 0.8152250394783226
	best f1_score_recall is: : 0.4665718349928876
best confusion_matrix by recall is: 
[[27137   252]
 [  123   164]]
best recall optimized by recall = 0.571428571429
best precision optimized by recall = 0.394230769231


In [72]:
pd.DataFrame(best_RF_model_recall.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

Unnamed: 0,importance
n_dev_shared,0.310133
interval_after_signup,0.249985
purchase_days_of_year,0.195365
signup_days_of_year,0.106627
n_ip_shared,0.103732
purchase_seconds_of_day,0.007357
signup_seconds_of_day,0.007107
age,0.005625
purchase_value,0.005614
n_country_shared,0.003221


In [96]:
#for task 3, based on the above var importance
trainDF = pd.concat([X_train, y_train], axis=1)
pd.crosstab(trainDF["n_dev_shared"],trainDF["class"])

class,0,1
n_dev_shared,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,104960,413
0.2,4397,359
0.4,167,208
0.6,35,97
0.8,9,31
1.0,4,20


In [77]:
fraudDF.groupby("class")[['interval_after_signup']].mean()


Unnamed: 0_level_0,interval_after_signup
class,Unnamed: 1_level_1
0,5191179.0
1,2354059.0


In [None]:
fraudDF.groupby("class")[['interval_after_signup']].median()
#half of fraud happened 1 s after signed up

In [85]:
fraud_data[fraud_data['class'] == 1].head(100)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,sex,age,ip_address,class,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari
136961,86574,2015-01-11 14:39:04,2015-01-11 14:39:05,61,ZQVCVEXPSAZCA,1,23,3.941890e+08,1,United States,1.0,11,52744,11,52745,0,0,1,0,0,0,0,1
136962,74145,2015-03-27 07:55:56,2015-07-20 06:59:34,55,DFQZESNZDEIPP,0,18,1.930175e+09,1,Korea Republic of,9932618.0,86,28556,201,25174,1,0,0,0,1,0,0,0
136963,282058,2015-01-05 08:18:00,2015-01-05 08:18:01,36,EGLGSEGYPMMAM,1,37,3.503224e+09,1,United States,1.0,5,29880,5,29881,0,0,1,0,1,0,0,0
136964,255332,2015-01-01 01:46:58,2015-01-01 01:46:59,19,SQWWBFDRYPGYA,0,36,2.105295e+08,1,United States,1.0,1,6418,1,6419,0,0,1,0,0,1,0,0
136965,399497,2015-01-01 14:29:27,2015-01-22 15:14:12,52,JWAVUHXQXTCHW,1,22,1.052881e+09,1,Nigeria,1817085.0,1,52167,22,54852,0,1,0,1,0,0,0,0
136966,337297,2015-02-10 13:01:00,2015-04-14 05:08:57,50,BCKHXUUTHTMMW,0,31,8.000143e+08,1,Canada,5414877.0,41,46860,104,18537,0,0,1,0,1,0,0,0
136967,46163,2015-03-05 09:23:48,2015-06-03 01:57:00,12,YFFPJJLHEHZML,1,45,1.957369e+09,1,China,7749192.0,64,33828,154,7020,0,1,0,0,0,0,0,1
136968,17097,2015-01-08 10:34:16,2015-01-08 10:34:17,12,IGAXAVAZFJYOB,1,33,2.447058e+09,1,Netherlands,1.0,8,38056,8,38057,1,0,0,0,0,0,0,1
136969,318808,2015-01-02 06:01:09,2015-01-02 06:01:10,22,MLKPWLXKZYANO,0,33,2.543532e+09,1,United States,1.0,2,21669,2,21670,1,0,0,1,0,0,0,0
136970,303431,2015-01-11 19:16:05,2015-01-11 19:16:06,61,SIKJNYXDSEUEG,1,30,3.410478e+09,1,Taiwan; Republic of China (ROC),1.0,11,69365,11,69366,1,0,0,1,0,0,0,0


In [83]:

fraudDF.groupby("class")[['purchase_days_of_year']].mean()

Unnamed: 0_level_0,purchase_days_of_year
class,Unnamed: 1_level_1
0,175.08996
1,85.007067


In [84]:
fraudDF.groupby("class")[['purchase_days_of_year']].median()

Unnamed: 0_level_0,purchase_days_of_year
class,Unnamed: 1_level_1
0,175
1,12


In [102]:
#for task 4, how to use the prediction:
t = (10 * probsBest_recall[:, 1]).astype(int)
unique, counts = np.unique(t, return_counts=True)

print np.asarray((unique, counts)).T
#green: 1 - 3 pass
#grey: 4-7 need manual investigation
#red: 8,9 decline

[[    1     2]
 [    2 26975]
 [    3   202]
 [    4    81]
 [    5   248]
 [    6     8]
 [    7     6]
 [    8    39]
 [    9   115]]


In [None]:
#HW: try xgboost
Q3 var imp freq table
Q4 buckets red yellow green