In [1]:
import pandas as pd

In [2]:
# Load the sample charge data from CSV and preview the first rows.
csv_path = 'qconlondon2016_sample_data.csv'
dt = pd.read_csv(csv_path)
dt.head(n=5)

Unnamed: 0,fraudulent,charge_time,amount,card_country,card_use_24h
0,False,2015-12-31T23:59:59Z,20484,US,0
1,False,2015-12-31T23:59:59Z,1211,US,0
2,False,2015-12-31T23:59:59Z,8396,US,1
3,False,2015-12-31T23:59:59Z,2359,US,0
4,False,2015-12-31T23:59:59Z,1480,US,3


In [3]:
# Class balance of the target: how many charges carry each fraud label.
dt['fraudulent'].value_counts()

False    45174
True     44219
Name: fraudulent, dtype: int64

In [4]:
# Number of charges per card-issuing country.
dt['card_country'].value_counts()

US    84494
GB     2754
AU     2145
Name: card_country, dtype: int64

In [5]:
# Encode card_country as one indicator (dummy) column per country.
# Note: prefix 'cc_' combined with pandas' default '_' separator produces
# column names with a double underscore, e.g. 'cc__US'.
encoded_countries = pd.get_dummies(dt['card_country'], prefix='cc_')
encoded_countries.head()

Unnamed: 0,cc__AU,cc__GB,cc__US
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


In [6]:
# Attach the country indicator columns to the main frame.
# Any indicator columns left over from a previous run of this cell are dropped
# first, so the cell can be re-executed safely: a plain dt.join(...) on a frame
# that already has these columns raises a column-overlap ValueError.
dt = dt.drop(labels=encoded_countries.columns, axis=1, errors='ignore').join(encoded_countries)
dt.head()

Unnamed: 0,fraudulent,charge_time,amount,card_country,card_use_24h,cc__AU,cc__GB,cc__US
0,False,2015-12-31T23:59:59Z,20484,US,0,0,0,1
1,False,2015-12-31T23:59:59Z,1211,US,0,0,0,1
2,False,2015-12-31T23:59:59Z,8396,US,1,0,0,1
3,False,2015-12-31T23:59:59Z,2359,US,0,0,0,1
4,False,2015-12-31T23:59:59Z,1480,US,3,0,0,1


In [7]:
# Target vector: the fraud label of each charge.
y = dt['fraudulent']
# Feature matrix: charge amount, recent card usage and two country indicators.
# NOTE(review): 'cc__US' is left out, presumably so it acts as the reference
# category for the country dummies -- confirm this is intentional.
feature_columns = ['amount', 'card_use_24h', 'cc__AU', 'cc__GB']
X = dt[feature_columns]

In [8]:
# Split the data into training and test sets.
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation. A fixed random_state makes the
# shuffle (and therefore every metric computed below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)



In [9]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

In [10]:
# Build a logistic regression with default hyperparameters and fit it on the
# training split (fit() trains the estimator in place).
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
# Learned weights, one per feature column of X_train.
lr_model.coef_

array([[4.93180746e-06, 2.01707943e-02, 1.98314827e-03, 1.16333341e-03]])

In [11]:
# Intercept (bias) term of the fitted logistic regression.
lr_model.intercept_

array([-0.0058])

In [12]:
# Computing the ROC curve
# predict_proba returns one probability column per class for every test row.
y_test_predict_lr = lr_model.predict_proba(X_test)
# Display the class order so the probability columns can be matched to labels.
lr_model.classes_

array([False,  True])

In [13]:
# Inspect the predicted class probabilities (one row per test sample).
y_test_predict_lr

array([[0.47319066, 0.52680934],
       [0.48845928, 0.51154072],
       [0.48452412, 0.51547588],
       ...,
       [0.49441262, 0.50558738],
       [0.48141824, 0.51858176],
       [0.40983494, 0.59016506]])

In [14]:
# Keep only the second probability column (index 1), i.e. the probability of
# the class listed at lr_model.classes_[1]. NumPy column slicing replaces the
# per-row Python loop and yields an ndarray that the metric functions accept.
y_test_scores_lr = y_test_predict_lr[:, 1]

In [15]:
from sklearn.metrics import roc_curve, roc_auc_score
# False-positive rates, true-positive rates and the score thresholds that
# produce them, computed on the held-out test set.
fpr, tpr, thresholds = roc_curve(y_test, y_test_scores_lr)
# Show the entry at the middle index of each returned array as a sample point.
tuple(values[len(values) // 2] for values in (fpr, tpr, thresholds))

(0.36669106394304346, 0.6823992813212633, 0.5120631279275438)

In [16]:
# The closer the AUC is to 1, the better the model.
roc_auc_score(y_true=y_test, y_score=y_test_scores_lr)

0.7105776757339008

In [17]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Shallow tree (depth <= 3, at least 20 samples required to split a node).
# random_state is fixed so that ties between equally good splits are resolved
# the same way on every run, keeping the reported AUC reproducible.
dt_model = DecisionTreeClassifier(
    max_depth=3, min_samples_split=20, random_state=42).fit(X_train, y_train)
# Probability of the class at index 1 of dt_model.classes_, taken as a NumPy
# column slice instead of a per-row Python loop.
y_test_scores_dt = dt_model.predict_proba(X_test)[:, 1]
# Area under the ROC curve for the tree on the held-out test set.
roc_auc_score(y_test, y_test_scores_dt)

0.6974579572529757