In [1]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### Logistic Regression
* How can we make the model prioritize classifying fraud correctly over classifying no fraud correctly?
* Stats
    * Precision: TP / (TP+FP), of all instances classified as True, what percentage are really True?
    * Recall: TP / (TP+FN), of all instances which are actually True, what percentage did we classify as true? 
    * F1-score: 2*(precision*recall)/(precision+recall)
    * support: number of occurrences per class
* Averages
    * micro average - sum of (TP) / (TP + FN) across all classes
    * macro average - average of each class' recall
    * weighted average 

### LOGISTIC REGRESSION RESULTS
* Precision: TP / (TP+FP)
    * high precision: few false positives
    * high precision for class 1: few providers we identify as fraudulent are not fraudulent
    * if we have high precision, few providers are being falsely accused of fraud
* Recall: TP / (TP+FN)
    * high recall: few false negatives
    * high recall for class 1: few providers we identify as not fraudulent are fraudulent
    * if we have high recall, few providers are getting away with fraud
* F1-score: 2*(precision*recall)/(precision+recall)
* Support: number of occurrences per class
* Averages
    * micro average - sum of (TP) / (TP + FN) across all classes
    * macro average - average of each class' recall
    * weighted average 

#### We will choose "Accuracy Score" and "Recall" of Class 1 as key important metrics
Finding a high percentage of fradulent cases is most important. False accuastions of fraud are less concerning than missed accusations.

###### Key models  


type    | class weight | Class 1 Recall | Accuracy Score 
------- | ------------ | -------------- | ---------------
BEST: lasso logistic tuned | balanced | .56   | .897
ridge logistic tuned | balanced | .55 | .897
general logistic regression | 1:5.8 |.35   | .878
BASELINE: general logistic regression | balanced |  .33   | .876

In [17]:
def logit_general(df, target):
    # baseline logistic regression. 
    # penalty = 'l2', ridge. , solver = 'liblinear'
    X = df.drop(target, axis=1)
    y = df[target]

    # training and testing sets
    X_train, X_test, y_train, y_test = \
    train_test_split(X, y, random_state = 42, stratify=y)

    # Instantiate model
    lgr = LogisticRegression(class_weight='balanced')

    # Train the model on training data
    lgr.fit(X_train, y_train)

    # Scores for training and testing
    y_predict_train = lgr.predict(X_train)
    print("Train accuracy score:", round(accuracy_score(y_predict_train, y_train), 3))

    y_predict_test = lgr.predict(X_test)
    print("Test accuracy score", round(accuracy_score(y_predict_test, y_test), 3))

    # Classification reports
    print("\n Training Classification Report:")
    print(classification_report(y_train, y_predict_train))

    print("\n Test Classification Report:")
    print(classification_report(y_test, y_predict_test))
    
    # Model with higher weight towards class 1.
    for i in np.linspace(5, 6, 6):
        weight_dict = {0: 1, 1:i}
        lgr = LogisticRegression(class_weight= weight_dict)
        lgr.fit(X_train, y_train)
        y_predict_train = lgr.predict(X_train)
        y_predict_test = lgr.predict(X_test)
        print("Weighted Test accuracy score, w=", i, ":", accuracy_score(y_predict_test, y_test))
        print("\n Weighted Test Classification Report, w=", i, ":")
        print(classification_report(y_test, y_predict_test))
    
def lassoreg_cv(df, target):
    # Logistic regression with lasso 'l1' penalty. Hyperparameter tuning
    
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42, stratify=y)
    
    param_grid = {'solver': ['liblinear', 'saga'],
              'C': [int(x) for x in np.logspace(0, 1, num = 10)]} 
    
    # Instantiate model and grid search
    lgr = LogisticRegression(penalty='l1', class_weight = "balanced")
    gm_cv = RandomizedSearchCV(lgr, param_grid, n_iter = 15, cv = 3)
    gm_cv.fit(X_train, y_train)

    # Scores for training and testing
    y_predict_train = gm_cv.predict(X_train)
    print("Train accuracy score:", round(accuracy_score(y_predict_train, y_train), 3))

    y_predict_test = gm_cv.predict(X_test)
    print("Test accuracy score", round(accuracy_score(y_predict_test, y_test), 3))

    # Classification reports
    print("\n Training Classification Report:")
    print(classification_report(y_train, y_predict_train))

    print("\n Test Classification Report:")
    print(classification_report(y_test, y_predict_test))
    
    # Best estimator
    print(gm_cv.best_estimator_)
    
    
def ridgereg_cv(df, target):
    # Logistic regression with ridge 'l2' penalty. Hyperparameter tuning.
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42, stratify=y)
    
    param_grid = {'solver': ['lbfgs', 'sag', 'saga'],
                'C': [int(x) for x in np.logspace(0, 1, num = 10)]}
    
    # Instantiate model and grid search
    lgr = LogisticRegression(penalty='l2', class_weight="balanced")
    gm_cv = RandomizedSearchCV(lgr, param_grid, n_iter = 15, cv = 3)
    gm_cv.fit(X_train, y_train)

    # Scores for training and testing
    y_predict_train = gm_cv.predict(X_train)
    print("Train accuracy score:", round(accuracy_score(y_predict_train, y_train), 3))

    y_predict_test = gm_cv.predict(X_test)
    print("Test accuracy score", round(accuracy_score(y_predict_test, y_test), 3))

    # Classification reports
    print("\n Training Classification Report:")
    print(classification_report(y_train, y_predict_train))

    print("\n Test Classification Report:")
    print(classification_report(y_test, y_predict_test))
    
    # Best Model
    print(gm_cv.best_estimator_)

def ridge_cv_weights(df, target):
    # checking different weights with best cv ridge model
    X = df.drop(target, axis=1)
    y = df[target]

    # training and testing sets
    X_train, X_test, y_train, y_test = \
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42, stratify=y)

    # Model with higher weight towards class 1.
    for i in np.linspace(1, 10, 6):
        weight_dict = {0: 1, 1:i}
        lgr = LogisticRegression(C=7, class_weight=i, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='saga', tol=0.0001, verbose=0, warm_start=False)
        lgr.fit(X_train, y_train)
        y_predict_train = lgr.predict(X_train)
        y_predict_test = lgr.predict(X_test)
        print("Weighted Test accuracy score, w=", i, ":", round(accuracy_score(y_predict_test, y_test), 3))
        print("\n Weighted Test Classification Report, w=", i, ":")
        print(classification_report(y_test, y_predict_test))

In [3]:
train_final_data = pd.read_csv('/Users/Julia/Documents/bootcamp/fraud_capstone/data_out/train_final_data.csv', low_memory=False)

In [4]:
train_final_data.head()

Unnamed: 0,ChronicCond_Alzheimer,ChronicCond_Cancer,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_Heartfailure,ChronicCond_IschemicHeart,ChronicCond_KidneyDisease,ChronicCond_ObstrPulmonary,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,County_0,County_1,County_10,County_100,County_11,County_110,County_111,County_113,County_117,County_120,County_130,County_131,County_14,County_140,County_141,County_150,County_160,County_161,County_170,County_180,County_190,County_191,County_194,County_20,County_200,County_210,County_211,County_212,County_213,County_220,County_221,County_222,County_223,County_224,County_230,County_240,County_241,County_25,County_250,...,proc_9764.0,proc_9784.0,proc_9787.0,proc_9789.0,proc_9805.0,proc_9815.0,proc_9851.0,proc_9903.0,proc_9904.0,proc_9905.0,proc_9906.0,proc_9907.0,proc_9910.0,proc_9914.0,proc_9915.0,proc_9916.0,proc_9917.0,proc_9918.0,proc_9919.0,proc_9920.0,proc_9921.0,proc_9922.0,proc_9923.0,proc_9925.0,proc_9926.0,proc_9928.0,proc_9929.0,proc_9938.0,proc_9939.0,proc_9952.0,proc_9955.0,proc_9959.0,proc_9960.0,proc_9961.0,proc_9962.0,proc_9969.0,proc_9971.0,proc_9972.0,proc_9973.0,proc_9974.0,proc_9975.0,proc_9978.0,proc_9979.0,proc_9982.0,proc_9984.0,proc_9986.0,proc_9992.0,proc_9995.0,proc_9998.0,proc_9999.0
0,0.365759,0.233463,0.451362,0.754864,0.564202,0.762646,0.474708,0.400778,0.272374,0.330739,0.105058,0.011673,0.0,0.011673,0.011673,0.0,0.0,0.0,0.0,0.0,0.0,0.015564,0.0,0.0,0.003891,0.0,0.07393,0.0,0.0,0.0,0.003891,0.011673,0.0,0.0,0.003891,0.0,0.0,0.0,0.0,0.0,0.011673,0.0,0.0,0.0,0.0,0.007782,0.011673,0.0,0.0,0.054475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.426901,0.175439,0.444444,0.730994,0.649123,0.807018,0.473684,0.380117,0.280702,0.345029,0.076023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023392,0.0,0.0,0.005848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070175,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.429515,0.229075,0.451542,0.685022,0.596916,0.799559,0.398678,0.34141,0.370044,0.290749,0.063877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156388,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.496454,0.191489,0.446809,0.77305,0.624113,0.794326,0.460993,0.304965,0.326241,0.326241,0.099291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.322917,0.15625,0.385417,0.645833,0.645833,0.6875,0.395833,0.302083,0.291667,0.270833,0.104167,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.0,0.0,0.0,0.135417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
train_final_data[['PotentialFraud', 'Provider']].groupby('PotentialFraud').count()

Unnamed: 0_level_0,Provider
PotentialFraud,Unnamed: 1_level_1
0,4904
1,506


In [6]:
logit_general(train_final_data, 'PotentialFraud')



Train accuracy score: 1.0
Test accuracy score 0.876

 Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3678
           1       1.00      1.00      1.00       379

    accuracy                           1.00      4057
   macro avg       1.00      1.00      1.00      4057
weighted avg       1.00      1.00      1.00      4057


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1226
           1       0.34      0.33      0.33       127

    accuracy                           0.88      1353
   macro avg       0.63      0.63      0.63      1353
weighted avg       0.87      0.88      0.88      1353





Weight Test accuracy score, w= 5.0 : 0.8787878787878788

 Weighted Test Classification Report, w= 5.0 :
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1226
           1       0.35      0.34      0.34       127

    accuracy                           0.88      1353
   macro avg       0.64      0.64      0.64      1353
weighted avg       0.88      0.88      0.88      1353





Weight Test accuracy score, w= 5.2 : 0.8780487804878049

 Weighted Test Classification Report, w= 5.2 :
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1226
           1       0.35      0.34      0.34       127

    accuracy                           0.88      1353
   macro avg       0.64      0.64      0.64      1353
weighted avg       0.88      0.88      0.88      1353





Weight Test accuracy score, w= 5.4 : 0.8780487804878049

 Weighted Test Classification Report, w= 5.4 :
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1226
           1       0.34      0.33      0.34       127

    accuracy                           0.88      1353
   macro avg       0.64      0.63      0.64      1353
weighted avg       0.88      0.88      0.88      1353





Weight Test accuracy score, w= 5.6 : 0.8780487804878049

 Weighted Test Classification Report, w= 5.6 :
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1226
           1       0.34      0.33      0.34       127

    accuracy                           0.88      1353
   macro avg       0.64      0.63      0.64      1353
weighted avg       0.88      0.88      0.88      1353





Weight Test accuracy score, w= 5.8 : 0.8780487804878049

 Weighted Test Classification Report, w= 5.8 :
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1226
           1       0.35      0.35      0.35       127

    accuracy                           0.88      1353
   macro avg       0.64      0.64      0.64      1353
weighted avg       0.88      0.88      0.88      1353

Weight Test accuracy score, w= 6.0 : 0.8780487804878049

 Weighted Test Classification Report, w= 6.0 :
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1226
           1       0.35      0.34      0.34       127

    accuracy                           0.88      1353
   macro avg       0.64      0.64      0.64      1353
weighted avg       0.88      0.88      0.88      1353





In [19]:
lassoreg_cv(train_final_data, 'PotentialFraud')



Train accuracy score: 0.953
Test accuracy score 0.897

 Training Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      3433
           1       0.68      0.94      0.79       354

    accuracy                           0.95      3787
   macro avg       0.84      0.95      0.88      3787
weighted avg       0.96      0.95      0.96      3787


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1471
           1       0.46      0.56      0.50       152

    accuracy                           0.90      1623
   macro avg       0.71      0.75      0.72      1623
weighted avg       0.91      0.90      0.90      1623

LogisticRegression(C=2, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
            

In [22]:
#top performing model, with class weight = 1 
ridgereg_cv(train_final_data, 'PotentialFraud')





Train accuracy score: 0.9527330340639029
Test accuracy score 0.8971041281577325

 Training Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      3433
           1       0.68      0.94      0.79       354

   micro avg       0.95      0.95      0.95      3787
   macro avg       0.84      0.94      0.88      3787
weighted avg       0.96      0.95      0.96      3787


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1471
           1       0.46      0.55      0.50       152

   micro avg       0.90      0.90      0.90      1623
   macro avg       0.71      0.74      0.72      1623
weighted avg       0.91      0.90      0.90      1623

LogisticRegression(C=7, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,


In [18]:
ridge_cv_weights(train_final_data, "PotentialFraud")



Weight Test accuracy score, w= 1.0 : 0.9164818920916482

 Weighted Test Classification Report, w= 1.0 :
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1226
           1       0.60      0.34      0.43       127

    accuracy                           0.92      1353
   macro avg       0.77      0.66      0.69      1353
weighted avg       0.90      0.92      0.91      1353





Weight Test accuracy score, w= 2.8 : 0.9157427937915743

 Weighted Test Classification Report, w= 2.8 :
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1226
           1       0.59      0.33      0.42       127

    accuracy                           0.92      1353
   macro avg       0.76      0.65      0.69      1353
weighted avg       0.90      0.92      0.90      1353





Weight Test accuracy score, w= 4.6 : 0.9157427937915743

 Weighted Test Classification Report, w= 4.6 :
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1226
           1       0.59      0.33      0.42       127

    accuracy                           0.92      1353
   macro avg       0.76      0.65      0.69      1353
weighted avg       0.90      0.92      0.90      1353





Weight Test accuracy score, w= 6.4 : 0.9164818920916482

 Weighted Test Classification Report, w= 6.4 :
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1226
           1       0.60      0.34      0.43       127

    accuracy                           0.92      1353
   macro avg       0.77      0.66      0.69      1353
weighted avg       0.90      0.92      0.91      1353





Weight Test accuracy score, w= 8.2 : 0.9157427937915743

 Weighted Test Classification Report, w= 8.2 :
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1226
           1       0.59      0.33      0.42       127

    accuracy                           0.92      1353
   macro avg       0.76      0.65      0.69      1353
weighted avg       0.90      0.92      0.90      1353





Weight Test accuracy score, w= 10.0 : 0.9164818920916482

 Weighted Test Classification Report, w= 10.0 :
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1226
           1       0.60      0.34      0.43       127

    accuracy                           0.92      1353
   macro avg       0.77      0.66      0.69      1353
weighted avg       0.90      0.92      0.91      1353

