In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import voting_classifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report

In [3]:
from sklearn.utils import resample

In [4]:
train = pd.read_csv('training.csv')

In [5]:
df = train.copy()

In [6]:
df.FraudResult.value_counts()

0    95469
1      193
Name: FraudResult, dtype: int64

In [7]:
minority_class = df[df.FraudResult == 1]
majority_class = df[df.FraudResult == 0]

In [8]:
minority_upsampled = resample(minority_class, replace=True, random_state = 42, n_samples =len(majority_class))

In [9]:
df1 = pd.concat([minority_upsampled, majority_class])

In [10]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190938 entries, 66179 to 95661
Data columns (total 16 columns):
TransactionId           190938 non-null object
BatchId                 190938 non-null object
AccountId               190938 non-null object
SubscriptionId          190938 non-null object
CustomerId              190938 non-null object
CurrencyCode            190938 non-null object
CountryCode             190938 non-null int64
ProviderId              190938 non-null object
ProductId               190938 non-null object
ProductCategory         190938 non-null object
ChannelId               190938 non-null object
Amount                  190938 non-null float64
Value                   190938 non-null int64
TransactionStartTime    190938 non-null object
PricingStrategy         190938 non-null int64
FraudResult             190938 non-null int64
dtypes: float64(1), int64(4), object(11)
memory usage: 24.8+ MB


In [11]:
df1.isnull().sum().sort_values() # no missing values

TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
dtype: int64

In [12]:
# checks for missing values greater than 70% of the total data 
threshold = 0.7
df2 = df1[df1.columns[df1.isnull().mean() < threshold]] 

In [13]:
len(df2)

190938

In [14]:
df2.PricingStrategy.value_counts()

2    148413
4     19049
0     18236
1      5240
Name: PricingStrategy, dtype: int64

In [15]:
df2.Value.head()

66179     500000
93004    9800000
60834     500000
12800     540000
68261     400000
Name: Value, dtype: int64

In [16]:
df2['year'] = pd.DatetimeIndex(df2.TransactionStartTime).year

In [17]:
df2['month'] = pd.DatetimeIndex(df2.TransactionStartTime).month_name
df2['Day'] =pd.DatetimeIndex(df2.TransactionStartTime).weekday_name
df2['week_of_year'] = pd.DatetimeIndex(df2.TransactionStartTime).weekofyear

In [18]:
df2.Day.value_counts() # shopping activities is more predominant on fridays

Friday       42595
Tuesday      30509
Thursday     27825
Monday       26134
Wednesday    24551
Saturday     22324
Sunday       17000
Name: Day, dtype: int64

In [19]:
conditions = [df2.Day.str.contains('Friday'), df2.Day.str.contains('Saturday'), df2.Day.str.contains('Sunday'),
             df2.Day.str.contains('Monday'), df2.Day.str.contains('Tuesday'), df2.Day.str.contains('Wednesday'),
             df2.Day.str.contains('Thursday')]

choices = ['Weekend','Weekend','Weekend', 'Weekday','Weekday','Weekday','Weekday']
df2['time_of_week'] = np.select(conditions, choices, )

In [20]:
df2.Day.head()

66179      Sunday
93004      Monday
60834     Tuesday
12800    Saturday
68261     Tuesday
Name: Day, dtype: object

In [21]:
df2.time_of_week.head()

66179    Weekend
93004    Weekday
60834    Weekday
12800    Weekend
68261    Weekday
Name: time_of_week, dtype: object

In [22]:
df2.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult', 'year',
       'month', 'Day', 'week_of_year', 'time_of_week'],
      dtype='object')

In [None]:
df2.PricingStrategy.dtypes

In [24]:
df2['value_log'] = (df2.Value).transform(np.log)

In [25]:
df2.Amount.head()

66179     500000.0
93004    9800000.0
60834    -500000.0
12800     540000.0
68261     400000.0
Name: Amount, dtype: float64

In [26]:
df2.value_log.head()

66179    13.122363
93004    16.097893
60834    13.122363
12800    13.199324
68261    12.899220
Name: value_log, dtype: float64

In [27]:
df2['amount_log'] = (df2['Amount'] - df2['Amount'].min() + 1).transform(np.log)

In [28]:
df2.amount_log.head()

66179    14.220976
93004    16.195057
60834    13.122365
12800    14.247294
68261    14.151984
Name: amount_log, dtype: float64

In [29]:
for col_name in df2.columns:
    if df2[col_name].dtype == 'object':
        unique_len = len(df2[col_name].unique())
        print('the col name is {} and the number of unique categories are {}'.format(col_name, unique_len))

the col name is TransactionId and the number of unique categories are 95662
the col name is BatchId and the number of unique categories are 94809
the col name is AccountId and the number of unique categories are 3633
the col name is SubscriptionId and the number of unique categories are 3627
the col name is CustomerId and the number of unique categories are 3742
the col name is CurrencyCode and the number of unique categories are 1
the col name is ProviderId and the number of unique categories are 6
the col name is ProductId and the number of unique categories are 23
the col name is ProductCategory and the number of unique categories are 9
the col name is ChannelId and the number of unique categories are 4
the col name is TransactionStartTime and the number of unique categories are 94556


TypeError: unhashable type: 'DatetimeIndex'

In [31]:
df2.CustomerId.value_counts().sort_values(ascending = False)

CustomerId_909     15446
CustomerId_4878    10865
CustomerId_1988     6896
CustomerId_4453     6038
CustomerId_7343     4091
CustomerId_2303     3955
CustomerId_5054     3647
CustomerId_5155     3594
CustomerId_2266     3527
CustomerId_3075     3503
CustomerId_1535     2969
CustomerId_3768     2940
CustomerId_2353     2564
CustomerId_3634     2085
CustomerId_856      1945
CustomerId_647      1869
CustomerId_865      1590
CustomerId_1302     1531
CustomerId_4275     1498
CustomerId_1175     1236
CustomerId_2216     1222
CustomerId_2528     1122
CustomerId_2445     1054
CustomerId_1858     1052
CustomerId_806      1027
CustomerId_7339     1015
CustomerId_4925     1012
CustomerId_1567     1008
CustomerId_4454     1000
CustomerId_7414      999
                   ...  
CustomerId_2717        1
CustomerId_2095        1
CustomerId_1353        1
CustomerId_6           1
CustomerId_3846        1
CustomerId_4415        1
CustomerId_3566        1
CustomerId_7341        1
CustomerId_1754        1


In [32]:
df2.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult', 'year',
       'month', 'Day', 'week_of_year', 'time_of_week', 'value_log',
       'amount_log'],
      dtype='object')

In [33]:
df_use = df2[['ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'PricingStrategy', 'FraudResult', 'year',
       'month', 'Day', 'week_of_year','time_of_week',
       'value_log', 'amount_log']]

In [34]:
df_use.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190938 entries, 66179 to 95661
Data columns (total 15 columns):
ProviderId         190938 non-null object
ProductId          190938 non-null object
ProductCategory    190938 non-null object
ChannelId          190938 non-null object
Amount             190938 non-null float64
Value              190938 non-null int64
PricingStrategy    190938 non-null int64
FraudResult        190938 non-null int64
year               190938 non-null int64
month              190938 non-null object
Day                190938 non-null object
week_of_year       190938 non-null int64
time_of_week       190938 non-null object
value_log          190938 non-null float64
amount_log         190938 non-null float64
dtypes: float64(3), int64(5), object(7)
memory usage: 23.3+ MB


In [35]:
lb = LabelEncoder()

In [36]:
df_use.ChannelId = lb.fit_transform(df_use.ChannelId)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [37]:
df_use['ProviderId'] = lb.fit_transform(df_use.ProviderId)
df_use.ProductId = lb.fit_transform(df_use.ProductId)
df_use.ProductCategory = lb.fit_transform(df_use.ProductCategory)
df_use.time_of_week = lb.fit_transform(df_use.time_of_week)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df_use['month'] = pd.DatetimeIndex(df2.TransactionStartTime).month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
df2 = df2.drop('month', axis = 'columns')

In [40]:
df_use.Day = lb.fit_transform(df_use.Day)

In [41]:
train, test = train_test_split(df_use, test_size = 0.4, random_state = 13)

In [42]:
df = train.copy()

In [43]:
df_label = df.FraudResult

In [44]:
df = df.drop('FraudResult', axis ='columns')

In [45]:
df.Day.head()

68542    5
86208    5
38262    3
42243    4
65326    2
Name: Day, dtype: int32

In [46]:
log_reg = LogisticRegression()

In [47]:
log_reg.fit(df, df_label)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [48]:
log_reg.score(df, df_label)

0.9830921247883242

In [49]:
test_df = test.copy()

In [50]:
test_label = test_df.FraudResult

In [51]:
test_df = test_df.drop('FraudResult', axis ='columns')

In [52]:
pred = log_reg.predict(test_df)

In [53]:
f1_score(test_label, pred)

0.9823595579964269

In [54]:
print(classification_report(test_label, pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     38209
           1       0.99      0.97      0.98     38167

   micro avg       0.98      0.98      0.98     76376
   macro avg       0.98      0.98      0.98     76376
weighted avg       0.98      0.98      0.98     76376



In [55]:
print(confusion_matrix(test_label, pred))

[[37927   282]
 [ 1051 37116]]


In [56]:
dsc = DecisionTreeClassifier()

In [57]:
dsc.fit(df, df_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [58]:
dsc.score(df, df_label)

0.9999039821232171

In [59]:
dsc_pred = dsc.predict(test_df)

In [61]:
print(f1_score(test_label, dsc_pred))

0.9996595075955998


In [62]:
print(confusion_matrix(test_label, dsc_pred))

[[38183    26]
 [    0 38167]]


In [63]:
print(classification_report(test_label, dsc_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     38209
           1       1.00      1.00      1.00     38167

   micro avg       1.00      1.00      1.00     76376
   macro avg       1.00      1.00      1.00     76376
weighted avg       1.00      1.00      1.00     76376

