# 1.) Import the Credit Card Fraud Data From CCLE

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings 
warnings.filterwarnings(action='ignore')

In [3]:
df = pd.read_csv("fraudTest.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


# 2.) Select four columns to use as features (one just be trans_date_trans)

In [5]:
df_select = df[["trans_date_trans_time", "category", "amt", "city_pop", "is_fraud"]]

In [6]:
df_select.columns

Index(['trans_date_trans_time', 'category', 'amt', 'city_pop', 'is_fraud'], dtype='object')

# 3.) Create a your own variable out of trans_date. Create dummies for factor vars

In [7]:
type(df_select["trans_date_trans_time"][0])

str

In [13]:
df_select["trans_date_trans_time"] = pd.to_datetime(df_select["trans_date_trans_time"])

In [14]:
df_select["time_var"] = [i.second for i in df_select["trans_date_trans_time"]]

In [15]:
X = pd.get_dummies(df_select, ["category"]).drop(["trans_date_trans_time", "is_fraud"], axis = 1)
y = df["is_fraud"]

In [16]:
X.head()

Unnamed: 0,amt,city_pop,time_var,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,2.86,333497,25,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,29.84,302,33,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,41.28,34496,53,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,60.05,54767,15,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,3.19,1126,17,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# 5.) Train a Logistic regression.

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_normalized = scaler.fit_transform(resample_X)

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
log_reg = LogisticRegression().fit(X_normalized, resample_y)

# 6.) The company you are working for wants to target at a False Positive rate of 5% what threshold should you use? (Use oversampled data)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Get predicted probabilities for test set
y_pred_proba = log_reg.predict_proba(X_test)

# Set threshold for predicted probabilities to classify observations as positive or negative
threshold = 0.05
y_pred = (y_pred_proba[:,1] >= threshold).astype(int)

# Create confusion matrix
confusion_matrix(y_test, y_pred)

array([[110557,    126],
       [   461,      0]], dtype=int64)

# 7.) If the company makes .02*amt on True transactions and loses -amt on False (Use original data)

In [23]:
df_temp = df_select.copy()

In [24]:
df_temp["pred"] = log_reg.predict(resample_X)

In [25]:
df_temp = df_temp[["pred", "is_fraud", "amt"]]

In [71]:
df_temp
# Which combinations of "pred" and "if_fraud" do we profit .02*amt, which cases do 
# we lose -amt??

Unnamed: 0,pred,is_fraud,amt,revenue,losses
0,0,0,2.86,True,False
1,0,0,29.84,True,False
2,0,0,41.28,True,False
3,0,0,60.05,True,False
4,0,0,3.19,True,False
...,...,...,...,...,...
555714,0,0,43.77,True,False
555715,0,0,111.84,True,False
555716,0,0,86.88,True,False
555717,0,0,7.99,True,False


In [66]:
# The answer should be one single number
df_temp['revenue'] = (df['is_fraud'] ==0)
df_temp['losses'] = (df['is_fraud'] ==1)
df_temp

rev = df_temp['amt'].loc[df_temp['revenue'] == 1]
loss = df_temp['amt'].loc[df_temp['losses'] == 1]

print(len(rev))
print(len(loss))

553574
2145


In [70]:
print('Net loss from trasactions', + (rev.sum() * .02) - loss.sum())

Net loss from trasactions -384733.11140000017


# 8.) Using Logistic Regression Lasso to inform you. Would you use the selected features in a trusted prediction model?

In [73]:
from sklearn.linear_model import LassoCV

lasso_cv_model = LassoCV(eps=0.1, n_alphas=100, cv=5)

In [74]:
lasso_cv_model.fit(X_train, y_train)

LassoCV(cv=5, eps=0.1)

In [75]:
lasso_cv_model.alpha_

9.385012730022508

In [77]:
test_predictions = lasso_cv_model.predict(X_test)

In [24]:
# If most or all your variables go to 0 => Your data is garbage
# The regularization will tell us if our model has significance
# This of using coefficient strength similar to r^2

In [78]:
lasso_cv_model.coef_

array([ 0.00000000e+00, -9.33420783e-10, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00])

#### Since most of our coefficients go to 0, we can assume the data is not reliable.