In [518]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk


In [519]:
df = pd.read_csv("creditcard.csv",  true_values=["Y"], false_values=["N"])
print(f"Number of rows {len(df.index)}")
print(f"The columns of the database {df.columns}")
df.value_counts("isFradulent")

Number of rows 3075
The columns of the database Index(['Merchant_id', 'Transaction date', 'Average Amount/transaction/day',
       'Transaction_amount', 'Is declined', 'Total Number of declines/day',
       'isForeignTransaction', 'isHighRiskCountry', 'Daily_chargeback_avg_amt',
       '6_month_avg_chbk_amt', '6-month_chbk_freq', 'isFradulent'],
      dtype='object')


isFradulent
False    2627
True      448
dtype: int64

In [414]:
xfields = [
    'Average Amount/transaction/day',
       'Transaction_amount', 'Is declined', 'Total Number of declines/day',
       'isForeignTransaction', 'isHighRiskCountry', 'Daily_chargeback_avg_amt',
       '6_month_avg_chbk_amt', '6-month_chbk_freq']

df_shuffled = df.sample(frac=1) # shuffle the rows
x = df_shuffled[xfields].to_numpy(dtype=np.float64)
y = df_shuffled["isFradulent"].to_numpy(dtype=np.float64)

# the training data is the first 2000 rows, after shuffled
training_data_x = x[:2000]
training_data_y = y[:2000]

# the test data is the remaining
test_data_x = x[2000:]
test_data_y = y[2000:]
df_shuffled.groupby('isFradulent')['6_month_avg_chbk_amt'].mean()


isFradulent
False     15.824134
True     181.917187
Name: 6_month_avg_chbk_amt, dtype: float64

In [5]:
print("Run this to help you with what number goes with what field:")
for i, x in enumerate(xfields):
    print(f"{i} = {x}")

Run this to help you with what number goes with what field:
0 = Average Amount/transaction/day
1 = Transaction_amount
2 = Is declined
3 = Total Number of declines/day
4 = isForeignTransaction
5 = isHighRiskCountry
6 = Daily_chargeback_avg_amt
7 = 6_month_avg_chbk_amt
8 = 6-month_chbk_freq


In [11]:
# Created an accuracy metric to test hand-engineered classifcation algorithms
def accuracy(y, yhat):
    correct = 0
    for i in range(len(y)):
        if y[i] == yhat[i]:
            correct += 1
    percentage = correct / float(len(y))
    return percentage


In [12]:
# test accuracy function 
acc = accuracy([1, 0, 1], [1, 1, 0])
print(f"Accuracy is {acc}") # should print 0.33...

Accuracy is 0.3333333333333333


In [21]:
def classify_majority(x, theta):
    # whatever the value of x, we return the theta
    return theta

# implement a train majority function that will look for the most common classification 
def train_majority(training_x, training_y):
    # this function will have to determine which is more likely to 
    # be the value of y, one (true) or zero (false)
    count1 = 0
    count0 = 0
    for i in training_y:
        if i == 1:
            count1 += 1
        if i == 0:
            count0 += 1
    truePercentage = count1 / len(training_y)
    falsePercentage = count0 / len(training_y)
    if(truePercentage > 0.50):
        moreLikely = 1
    elif(falsePercentage > 0.50):
        moreLikely = 0
    return moreLikely

In [35]:
# assign theta value to the most common classification 
theta = train_majority(training_data_x, training_data_y)

# Make an array of prediction values based on the theta value
test_data_yhat = [theta for i in range(len(training_data_y))]

# Test accuracy
print("Accuracy: ",accuracy(training_data_y, test_data_yhat))


Accuracy:  0.852


This would beat a classifier that returned random values because a random classifier would at best have 50% chance of having a correct classification. This majority classifier will at worse have a 51% accuracy.

In [178]:
print("Run this to help you with what number goes with what field:")
for i, x in enumerate(xfields):
    print(f"{i} = {x}")

Run this to help you with what number goes with what field:
0 = Average Amount/transaction/day
1 = Transaction_amount
2 = Is declined
3 = Total Number of declines/day
4 = isForeignTransaction
5 = isHighRiskCountry
6 = Daily_chargeback_avg_amt
7 = 6_month_avg_chbk_amt
8 = 6-month_chbk_freq


In [514]:
# implement hand-engineered classification algorithm 
def classify_handwritten(x, theta):
    """Fraudulent transaction classifier. In this test example, we classify every foreign transaction as fraudulent. Transactions larger than theta[0] are also fraudulent"""

    if x[1] > theta:
        if x[5] == 1:
            return 1
        elif x[4] == 1:
            return 1
        elif x[3] > 3:
            return 1
        elif x[2] == 1:
            return 1
        elif x[6] >= 400:
            return 1
    return 0

In [552]:
  
# test hand-engineered classifcation algo with different thetas to find the most accurate one 
theta = [1000 ,2000 ,5000 ,10000 ,23000, 23500 ,25000, 30000]
bestVal = 0
bestTheta = 0
print("Accuracy Values: ")
for i in range(len(theta)):
    for j in range(len(training_data_x)):
        test_data_yhat[j] = classify_handwritten(training_data_x[j], theta[i])
    accuracyVal = accuracy(training_data_y, test_data_yhat)
    print(accuracyVal)
    if accuracyVal > bestVal:
        bestTheta = theta[i]
        bestVal = accuracyVal
print("Best Theta Value :", bestTheta, "Accuracy: ", bestVal)


Accuracy Values: 
0.8335
0.844
0.875
0.902
0.914
0.9135
0.911
0.901
Best Theta Value : 23000 Accuracy:  0.914


In [516]:
# print best theta value and accuracy
print(bestVal)
print (bestTheta)

0.914
23000


Based on my experiments, my hand-engineered classifier was able ot perform better than the majority classifier. My theta values were based upon the transaction amount that would possibly be a fraudulant transaction. The algorithm would then check if the country is a high-risk one and consider different features such as the a foreign transaction or multiple declines. The best accuracy value I received was a 0.914. Originally it was a little difficult to beat the majority classifier as I was overfitted the data, but after began to consider less features to allow for more generalization the accuracy became better.

In [553]:
# implement logistic regression using the sklearn library

def logRegression(max_iter, solver, X_train, y_train, X_test, y_test):
    from sklearn.linear_model import LogisticRegression
    
    bestScore = 0
    bestSolver = ''
    bestIter = 0
    
    logDef = LogisticRegression()
    logDef.fit(X_train, y_train)
    logDef.predict(X_test)
    scoreWithoutParamters = logDef.score(X_test, y_test)
    print("Score with no parameters changed: ", scoreWithoutParamters)
    
    
    
    for i in solver:
        for j in max_iter:
            log = LogisticRegression(solver=i, max_iter=j)
            log.fit(X_train, y_train)
            log.predict(X_test)
            score = log.score(X_test, y_test)
            if score > bestScore:
                bestScore = score
                bestSolver = i
                bestIter = j
    print("Score with best parameters")
    print(bestScore, bestSolver, bestIter)
            
            
    
    

In [555]:
# test the sklearn logistic regression 
import time
start = time.time()
solver = ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga']
max_iter = [1000, 5000, 7000, 10000]

print(logRegression(max_iter, solver, training_data_x, training_data_y, test_data_x, test_data_y))
end = time.time()
print("Time: ", end - start)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score with no parameters changed:  0.986046511627907




Score with best parameters
0.9953488372093023 lbfgs 1000
None
Time:  9.076490640640259


Based on the experiments, the Logistic Regression classifier performed better than my hand-engineered classifer. My hand-engineered classifier had an accuracy of 0.914 where as the sklearn Logisitic Regression had an accuracy of 0.995. In my experiments with the sklearn logistic regression, I experimented with the different solvers that library offered and performed various amounts of the iterations on each solver. The solver that performed the best was the "lbfgs" solver that received the accuracy score of 0.995. I also experimented with the logistic regression classifier with no paramters and it performed slightly worse with a score of 0.986. The sklearn logistic classifier was much easier at allowing the tweaking of my parameters however the training time was slightly slower compared to my hand-engineered one.

In [582]:
# Implement the sklearn random forest algorithm 
def randomForest(X_train, y_train, X_test, y_test, n_estimators, max_depth):
    from sklearn.ensemble import RandomForestClassifier
    bestScore = 0
    bestEstimator = 0
    bestDepth = 0
    
    randForest = RandomForestClassifier()
    randForest.fit(X_train, y_train)
    preds = randForest.predict(X_test)
    print("Accuracy with no parameters: " ,randForest.score(X_test, y_test))
    
    for i in n_estimators:
        for j in max_depth:
            randForestParam = RandomForestClassifier(n_estimators=i, max_depth=j)
            randForestParam.fit(X_train, y_train)
            pred = randForestParam.predict(X_test)
            score = randForestParam.score(X_test, y_test)
            if score > bestScore:
                bestScore = score
                bestEstimator = i
                bestDepth = j
    
    print("Best Parameters: n_estimators: ", bestEstimator, "depth: ", bestDepth)
    print("Best Score: ", bestScore)       
    
    

In [583]:
# test the Random Forest algorithm with different parameters
n_estimators = [100, 500, 700, 1000]
max_depth = [None, 1, 5, 10, 15]


randomForest(training_data_x, training_data_y, test_data_x, test_data_y, n_estimators, max_depth)



Accuracy with no parameters:  0.987906976744186
Best Parameters: n_estimators:  100 depth:  10
Best Score:  0.9906976744186047


The experiments I performed for the random forest classifier consisted of experiments the n_estimators and max_depth parameters. I first ran one test with no parameters passed and received an accuracy of 0.987. I then created an array of different values for the two parameters I was working with and ran the classifier through each parameter value. The best accuracy I received was 0.991 with 100 estimators and a depth of 10. Tweaking the parameters made the accuracy slightly better with a 1% increase.

In [584]:
# Implement the sklearn adaboost  
def adaboost(X_train, y_train, X_test, y_test, n_estimators, learning_rate, algo):
    from sklearn.ensemble import AdaBoostClassifier
    
    bestScore = 0
    bestAlgo = ''
    bestRate = 0
    bestEstimator = 0
    
    ada = AdaBoostClassifier()
    ada.fit(X_train, y_train)
    pred = ada.predict(X_test)
    score = ada.score(X_test, y_test)
    print("Score with no parameters: ", score)
    
    for i in algo:
        for j in learning_rate:
            for k in n_estimators:
                adaParam = AdaBoostClassifier(n_estimators=k, learning_rate=j, algorithm=i)
                adaParam.fit(X_train, y_train)
                predParam = adaParam.predict(X_test)
                scoreParam = adaParam.score(X_test, y_test)
                if scoreParam > bestScore:
                    bestScore = scoreParam
                    bestAlgo = i
                    bestRate = j
                    bestEstimator = k
    print("Best Parameters: Algorithm: ", bestAlgo, "Learning Rate: ", bestRate, "n_estimators: ", bestEstimator)
    print("Best Score: ", bestScore)

In [585]:
# test adaboost with different parameters 
n_estimators = [50, 100, 150, 200]
learning_rate = [1.0, 1.5, 2.0, 2.5]
algo = ['SAMME', 'SAMME.R']

adaboost(training_data_x, training_data_y, test_data_x, test_data_y, n_estimators, learning_rate, algo)

Score with no parameters:  0.9944186046511628


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return super().fit(X, y, sample_weight)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return super().fit(X, y, sample_weight)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return super().fit(X, y, sample_weight)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return super().fit(X, y, sample_weight)


Best Parameters: Algorithm:  SAMME.R Learning Rate:  1.0 n_estimators:  100
Best Score:  0.9953488372093023


During my experiments, I tested the Algorithm, Learning_rate, and n_estimators parameters. I first tested adaboost with no parameters and got an accuracy of 0.994. After I created an array of the three parameters I was testing and filled them with various values. I then tested all combinations of these parameter values and got a best accuracy of 0.995 which is a slight improvement from the default test.