# 1.) Import the Credit Card Fraud Data From CCLE

In [1]:
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np

In [2]:
drive.mount('/content/gdrive/', force_remount = True)

Mounted at /content/gdrive/


In [3]:
df = pd.read_csv("/content/gdrive/MyDrive/fraudTest.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


# 2.) Select four columns to use as features (one just be trans_date_trans)

In [5]:
df_select = df[['trans_date_trans_time', 'category', 'amt', 'city_pop', 'is_fraud']]

In [6]:
df_select.columns

Index(['trans_date_trans_time', 'category', 'amt', 'city_pop', 'is_fraud'], dtype='object')

# 3.) Create a unique variable out of trans_date.

In [7]:
df_select['trans_date_trans_time'] = pd.to_datetime(df_select['trans_date_trans_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select['trans_date_trans_time'] = pd.to_datetime(df_select['trans_date_trans_time'])


In [8]:
df_select['time_var'] = [i.second for i in df_select['trans_date_trans_time']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select['time_var'] = [i.second for i in df_select['trans_date_trans_time']]


In [9]:
X = pd.get_dummies(df_select, ['category']).drop(['trans_date_trans_time', 'is_fraud'], axis = 1)
y = df['is_fraud']


In [27]:
X1 = scaler.fit_transform(resample_X)

# 4.) Oversample the data (this will be your training data).

In [10]:
resample_X = X
resample_y = y

# 5.) Train a Logistic regression.

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_normalized = scaler.fit_transform(resample_X)

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
log_reg = LogisticRegression().fit(X_normalized, resample_y)

# 6.) The company you are working for wants to target at a False Positive rate of 5% what threshold should you use? (Use oversampled data)

In [15]:
from sklearn.metrics import roc_curve, confusion_matrix

In [16]:
y_prob = log_reg.predict_proba(X_normalized)[:, 1]

fpr, tpr, thresholds = roc_curve(resample_y, y_prob)

desired_fpr = 0.05
idx = np.argmin(np.abs(fpr - desired_fpr))
threshold = thresholds[idx]

In [17]:
threshold

0.009835524304623314

In [18]:
y_pred = (y_prob > threshold).astype(int)
conf_matrix = confusion_matrix(resample_y, y_pred)
conf_matrix

array([[529101,  24473],
       [   687,   1458]])

# 7.) If the company makes .02*amt on True transactions and loses -amt on False (Use original data)

In [19]:
df_temp = df_select.copy()
df_temp['pred'] = log_reg.predict(resample_X)



In [20]:
df_temp = df_temp[['pred', 'is_fraud', 'amt']]
df_temp

Unnamed: 0,pred,is_fraud,amt
0,0,0,2.86
1,0,0,29.84
2,0,0,41.28
3,0,0,60.05
4,0,0,3.19
...,...,...,...
555714,0,0,43.77
555715,0,0,111.84
555716,0,0,86.88
555717,0,0,7.99


In [24]:
#profit - true negative 
df1 = df_temp[(df_temp['pred'] == 0) & (df_temp['is_fraud'] == 0)]
profit = sum(0.02*df1['amt'])

In [25]:
#loss - false positive
df2 = df_temp[(df_temp['pred'] == 0) & (df_temp['is_fraud'] == 1)]
loss = sum(-1*df2['amt'])

In [26]:
total = profit + loss
total

-35132.440800004755

In [None]:
FP =  #is_fraud = 0, pred = 1

# 8.) Using Logistic Regression Lasso to inform you. Would you use the selected features in a trusted prediction model?

In [28]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression('l1', solver = 'liblinear')
model.fit(X1, resample_y)

LogisticRegression(penalty='l1', solver='liblinear')

In [30]:
model.coef_

array([[ 0.32966553, -0.12165013,  0.00675377, -0.06866481, -0.07648184,
         0.10641193,  0.02366268,  0.42938129, -0.06880098, -0.11334495,
        -0.09720814,  0.32442282,  0.        , -0.01000496,  0.40847039,
         0.10284164, -0.91621942]])

Yes, because most of the coefficient are different from zero. Only one variable was shrinked to 0.