In [7]:
!pip install imblearn

Collecting imblearn
  Downloading https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl
Collecting scikit-learn>=0.21 (from imbalanced-learn->imblearn)
[?25l  Downloading https://files.pythonhosted.org/packages/aa/7d/6c71c35c201f6d5cec318c7ed7841317adbf291513742865ed8904ae4ea9/scikit_learn-0.21.2-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (10.5MB)
[K    100% |████████████████████████████████| 10.5MB 2.2MB/s ta 0:00:01    83% |██████████████████████████▋     | 8.7MB 2.0MB/s eta 0:00:01
[?25hCollecting joblib>=0.11 (from imbalanced-learn->imblearn)
[?25l  Downloading https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl (278kB)
[K    100% |████████████████████████████████| 286kB 2.9MB/s ta 0:00:01
[?25hInstalling collected packages: imblearn, jo

In [2]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
# Widen pandas' text output and raise the column cap so wide frames are truncated less.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Render DataFrames with their HTML representation inside the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Seaborn look: white background with grid lines, scaled for the "poster" context.
sns.set_style("whitegrid")
sns.set_context("poster")

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss

In [8]:
# FUNCTION:

def ridgereg_cv(df, target, method):
    """Tune, fit and report an L2-penalised (ridge) logistic regression.

    The frame is split into train/test partitions, the training partition is
    rebalanced with the resampler selected by ``method`` (the test partition
    is left untouched), and a randomized search over solver and ``C`` is run
    with 3-fold cross-validation. Accuracy scores, classification reports and
    the best estimator are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``target`` column.
    target : str
        Name of the label column in ``df``.
    method : str
        Resampling strategy, one of ``'ros'`` (RandomOverSampler),
        ``'ADASYN'``, ``'SMOTE'``, ``'rus'`` (RandomUnderSampler),
        ``'cc'`` (ClusterCentroids) or ``'NearMiss'``.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. (Previously this
        surfaced later as an unbound-variable error on ``X_train``.)
    """
    # Factories (not instances) so only the requested sampler is constructed.
    sampler_factories = {
        'ros': lambda: RandomOverSampler(random_state=0),
        'ADASYN': lambda: ADASYN(random_state=0),
        'SMOTE': lambda: SMOTE(random_state=0),
        'rus': lambda: RandomUnderSampler(random_state=0),
        'cc': lambda: ClusterCentroids(random_state=0),
        # NearMiss takes no seed: its `random_state` argument was removed
        # from imbalanced-learn, so passing it fails on current releases.
        'NearMiss': lambda: NearMiss(),
    }
    if method not in sampler_factories:
        raise ValueError(
            "Unknown resampling method {!r}; expected one of: {}".format(
                method, ", ".join(sampler_factories)))

    X = df.drop(target, axis=1)
    y = df[target]

    # Hold out the test split first so resampling only touches training rows.
    X_sample, X_test, y_sample, y_test = train_test_split(X, y, random_state=42)

    X_train, y_train = sampler_factories[method]().fit_resample(X_sample, y_sample)

    # NOTE(review): int() truncation collapses the log-spaced grid to
    # [1, 1, 1, 2, 2, 3, 4, 5, 7, 10]; the duplicates are kept as-is so the
    # search space is unchanged, but they waste search iterations.
    param_grid = {'solver': ['lbfgs', 'sag', 'saga'],
                  'C': [int(x) for x in np.logspace(0, 1, num=10)]}

    # Seed both the estimator (sag/saga shuffle the data) and the search so
    # that re-running the cell reproduces the same model.
    lgr = LogisticRegression(penalty='l2', random_state=0)
    gm_cv = RandomizedSearchCV(lgr, param_grid, n_iter=15, cv=3, random_state=42)
    gm_cv.fit(X_train, y_train)

    # Scores for training and testing (sklearn metrics take y_true first).
    y_predict_train = gm_cv.predict(X_train)
    print("Train accuracy score:", accuracy_score(y_train, y_predict_train))

    y_predict_test = gm_cv.predict(X_test)
    print("Test accuracy score", accuracy_score(y_test, y_predict_test))

    # Classification reports
    print("\n Training Classification Report:")
    print(classification_report(y_train, y_predict_train))

    print("\n Test Classification Report:")
    print(classification_report(y_test, y_predict_test))

    # Best model found by the search
    print(gm_cv.best_estimator_)
    
def lassoreg_cv(df, target, method):
    """Tune, fit and report an L1-penalised (lasso) logistic regression.

    The frame is split into train/test partitions, the training partition is
    rebalanced with the resampler selected by ``method`` (the test partition
    is left untouched), and a randomized search over solver and ``C`` is run
    with 3-fold cross-validation using ``class_weight="balanced"``. Rounded
    accuracy scores, classification reports and the best estimator are
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``target`` column.
    target : str
        Name of the label column in ``df``.
    method : str
        Resampling strategy, one of ``'ros'`` (RandomOverSampler),
        ``'ADASYN'``, ``'SMOTE'``, ``'rus'`` (RandomUnderSampler),
        ``'cc'`` (ClusterCentroids) or ``'NearMiss'``.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. (Previously this
        surfaced later as an unbound-variable error on ``X_train``.)
    """
    # Factories (not instances) so only the requested sampler is constructed.
    sampler_factories = {
        'ros': lambda: RandomOverSampler(random_state=0),
        'ADASYN': lambda: ADASYN(random_state=0),
        'SMOTE': lambda: SMOTE(random_state=0),
        'rus': lambda: RandomUnderSampler(random_state=0),
        'cc': lambda: ClusterCentroids(random_state=0),
        # NearMiss takes no seed: its `random_state` argument was removed
        # from imbalanced-learn, so passing it fails on current releases.
        'NearMiss': lambda: NearMiss(),
    }
    if method not in sampler_factories:
        raise ValueError(
            "Unknown resampling method {!r}; expected one of: {}".format(
                method, ", ".join(sampler_factories)))

    X = df.drop(target, axis=1)
    y = df[target]

    # Hold out the test split first so resampling only touches training rows.
    X_sample, X_test, y_sample, y_test = train_test_split(X, y, random_state=42)

    X_train, y_train = sampler_factories[method]().fit_resample(X_sample, y_sample)

    # Only liblinear and saga are searched here for the L1 penalty.
    # NOTE(review): int() truncation collapses the log-spaced grid to
    # [1, 1, 1, 2, 2, 3, 4, 5, 7, 10]; the duplicates are kept as-is so the
    # search space is unchanged, but they waste search iterations.
    param_grid = {'solver': ['liblinear', 'saga'],
                  'C': [int(x) for x in np.logspace(0, 1, num=10)]}

    # Seed both the estimator (liblinear/saga shuffle the data) and the
    # search so that re-running the cell reproduces the same model.
    lgr = LogisticRegression(penalty='l1', class_weight="balanced", random_state=0)
    gm_cv = RandomizedSearchCV(lgr, param_grid, n_iter=15, cv=3, random_state=42)
    gm_cv.fit(X_train, y_train)

    # Scores for training and testing (sklearn metrics take y_true first).
    y_predict_train = gm_cv.predict(X_train)
    print("Train accuracy score:", round(accuracy_score(y_train, y_predict_train), 3))

    y_predict_test = gm_cv.predict(X_test)
    print("Test accuracy score", round(accuracy_score(y_test, y_predict_test), 3))

    # Classification reports
    print("\n Training Classification Report:")
    print(classification_report(y_train, y_predict_train))

    print("\n Test Classification Report:")
    print(classification_report(y_test, y_predict_test))

    # Best estimator found by the search
    print(gm_cv.best_estimator_)

In [4]:
# Load the prepared training table used by every modelling cell below.
# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere as written.
train_final_data = pd.read_csv(
    '/Users/Julia/Documents/bootcamp/fraud_capstone/data_out/train_final_data.csv',
    low_memory=False,
)

In [5]:
# Preview the first five rows of the training table.
train_final_data.head(n=5)

Unnamed: 0,ChronicCond_Alzheimer,ChronicCond_Cancer,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_Heartfailure,ChronicCond_IschemicHeart,ChronicCond_KidneyDisease,ChronicCond_ObstrPulmonary,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,County_0,County_1,County_10,County_100,County_11,County_110,County_111,County_113,County_117,County_120,County_130,County_131,County_14,County_140,County_141,County_150,County_160,County_161,County_170,County_180,County_190,County_191,County_194,County_20,County_200,County_210,County_211,County_212,County_213,County_220,County_221,County_222,County_223,County_224,County_230,County_240,County_241,County_25,County_250,...,proc_9764.0,proc_9784.0,proc_9787.0,proc_9789.0,proc_9805.0,proc_9815.0,proc_9851.0,proc_9903.0,proc_9904.0,proc_9905.0,proc_9906.0,proc_9907.0,proc_9910.0,proc_9914.0,proc_9915.0,proc_9916.0,proc_9917.0,proc_9918.0,proc_9919.0,proc_9920.0,proc_9921.0,proc_9922.0,proc_9923.0,proc_9925.0,proc_9926.0,proc_9928.0,proc_9929.0,proc_9938.0,proc_9939.0,proc_9952.0,proc_9955.0,proc_9959.0,proc_9960.0,proc_9961.0,proc_9962.0,proc_9969.0,proc_9971.0,proc_9972.0,proc_9973.0,proc_9974.0,proc_9975.0,proc_9978.0,proc_9979.0,proc_9982.0,proc_9984.0,proc_9986.0,proc_9992.0,proc_9995.0,proc_9998.0,proc_9999.0
0,0.365759,0.233463,0.451362,0.754864,0.564202,0.762646,0.474708,0.400778,0.272374,0.330739,0.105058,0.011673,0.0,0.011673,0.011673,0.0,0.0,0.0,0.0,0.0,0.0,0.015564,0.0,0.0,0.003891,0.0,0.07393,0.0,0.0,0.0,0.003891,0.011673,0.0,0.0,0.003891,0.0,0.0,0.0,0.0,0.0,0.011673,0.0,0.0,0.0,0.0,0.007782,0.011673,0.0,0.0,0.054475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.426901,0.175439,0.444444,0.730994,0.649123,0.807018,0.473684,0.380117,0.280702,0.345029,0.076023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023392,0.0,0.0,0.005848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070175,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.429515,0.229075,0.451542,0.685022,0.596916,0.799559,0.398678,0.34141,0.370044,0.290749,0.063877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156388,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.496454,0.191489,0.446809,0.77305,0.624113,0.794326,0.460993,0.304965,0.326241,0.326241,0.099291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.322917,0.15625,0.385417,0.645833,0.645833,0.6875,0.395833,0.302083,0.291667,0.270833,0.104167,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.0,0.0,0.0,0.135417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Logistic regression with varied resampling methods
##### Key models  


type    | class weight | Class 1 Recall | Accuracy Score 
------- | ------------ | -------------- | ---------------
\* BEST: lasso logistic tuned | balanced | .56   | .897      
ridge, random over sampling | balanced | .914 | .42
ridge, ADASYN | balanced | .914 | .42
ridge, SMOTE | balanced | .914 | .42
BASELINE: general logistic regression | balanced |  .33   | .876 

"*" = best prior model 

In [6]:
# Class balance: number of non-null Provider entries per PotentialFraud label.
train_final_data.groupby('PotentialFraud')[['Provider']].count()

Unnamed: 0_level_0,Provider
PotentialFraud,Unnamed: 1_level_1
0,4904
1,506


### Oversampling Techniques

In [9]:
# Lasso logistic regression with random over-sampling of the training split.
lassoreg_cv(df=train_final_data, target='PotentialFraud', method='ros')



Train accuracy score: 1.0
Test accuracy score 0.871

 Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3688
           1       1.00      1.00      1.00      3688

    accuracy                           1.00      7376
   macro avg       1.00      1.00      1.00      7376
weighted avg       1.00      1.00      1.00      7376


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      1216
           1       0.33      0.26      0.29       137

    accuracy                           0.87      1353
   macro avg       0.62      0.60      0.61      1353
weighted avg       0.86      0.87      0.86      1353

LogisticRegression(C=10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
             

In [10]:
# Lasso logistic regression with ADASYN over-sampling of the training split.
lassoreg_cv(df=train_final_data, target='PotentialFraud', method='ADASYN')



Train accuracy score: 1.0
Test accuracy score 0.856

 Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3688
           1       1.00      1.00      1.00      3799

    accuracy                           1.00      7487
   macro avg       1.00      1.00      1.00      7487
weighted avg       1.00      1.00      1.00      7487


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1216
           1       0.29      0.30      0.30       137

    accuracy                           0.86      1353
   macro avg       0.61      0.61      0.61      1353
weighted avg       0.86      0.86      0.86      1353

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
              

In [11]:
# Lasso logistic regression with SMOTE over-sampling of the training split.
lassoreg_cv(df=train_final_data, target='PotentialFraud', method='SMOTE')



Train accuracy score: 1.0
Test accuracy score 0.857

 Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3688
           1       1.00      1.00      1.00      3688

    accuracy                           1.00      7376
   macro avg       1.00      1.00      1.00      7376
weighted avg       1.00      1.00      1.00      7376


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1216
           1       0.29      0.28      0.29       137

    accuracy                           0.86      1353
   macro avg       0.60      0.60      0.60      1353
weighted avg       0.86      0.86      0.86      1353

LogisticRegression(C=2, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
              

### Undersampling Techniques

In [12]:
# Lasso logistic regression with random under-sampling of the training split.
lassoreg_cv(df=train_final_data, target='PotentialFraud', method='rus')



Train accuracy score: 0.942
Test accuracy score 0.885

 Training Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       369
           1       0.99      0.89      0.94       369

    accuracy                           0.94       738
   macro avg       0.95      0.94      0.94       738
weighted avg       0.95      0.94      0.94       738


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      1216
           1       0.46      0.75      0.57       137

    accuracy                           0.88      1353
   macro avg       0.71      0.83      0.75      1353
weighted avg       0.92      0.88      0.90      1353

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
            

In [13]:
# Lasso logistic regression with ClusterCentroids under-sampling of the training split.
lassoreg_cv(df=train_final_data, target='PotentialFraud', method='cc')



Train accuracy score: 0.925
Test accuracy score 0.805

 Training Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       369
           1       0.95      0.90      0.92       369

    accuracy                           0.93       738
   macro avg       0.93      0.93      0.93       738
weighted avg       0.93      0.93      0.93       738


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.83      0.88      1216
           1       0.28      0.59      0.38       137

    accuracy                           0.80      1353
   macro avg       0.61      0.71      0.63      1353
weighted avg       0.88      0.80      0.83      1353

LogisticRegression(C=5, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
            

In [14]:
# Lasso logistic regression with NearMiss under-sampling of the training split.
lassoreg_cv(df=train_final_data, target='PotentialFraud', method='NearMiss')



Train accuracy score: 0.977
Test accuracy score 0.665

 Training Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       369
           1       1.00      0.95      0.98       369

    accuracy                           0.98       738
   macro avg       0.98      0.98      0.98       738
weighted avg       0.98      0.98      0.98       738


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.63      0.77      1216
           1       0.22      0.93      0.36       137

    accuracy                           0.67      1353
   macro avg       0.61      0.78      0.57      1353
weighted avg       0.91      0.67      0.73      1353

LogisticRegression(C=4, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
            

### Applications with Ridge (L2-Penalized) Logistic Regression

In [None]:
# Ridge (L2) logistic regression with ADASYN over-sampling.
# Flagged by the author as a top-performing model.
ridgereg_cv(df=train_final_data, target='PotentialFraud', method='ADASYN')

In [None]:
# Ridge (L2) logistic regression with ClusterCentroids under-sampling.
# Flagged by the author as a top-performing model.
ridgereg_cv(df=train_final_data, target='PotentialFraud', method='cc')

In [None]:
# Ridge (L2) logistic regression with NearMiss under-sampling.
# Flagged by the author as a top-performing model.
ridgereg_cv(df=train_final_data, target='PotentialFraud', method='NearMiss')