In [75]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

import sklearn.model_selection
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [76]:
train_final_data = pd.read_csv('/Users/Julia/Documents/bootcamp/fraud_capstone/data_out/train_final_data.csv', low_memory=False)

### Logistic Regression
* How can we make the model prioritize classifying fraud correctly over classifying no fraud correctly?
* Stats
    * Precision
    * Recall 
    * F1-score
    * support
* Averages
    * micro average
    * macro average
    * weighted average 

In [77]:
# our target, 1, is only in 1/11 of the labels
# issues: 
    # labeling providers as fraudulent when they shouldn't be 
    # labeling providers as not fraudulent when they are fraudulent
train_final_data[['PotentialFraud', 'Provider']].groupby('PotentialFraud').count()

Unnamed: 0_level_0,Provider
PotentialFraud,Unnamed: 1_level_1
0,4904
1,506


In [81]:
def logit_general(df, target):
    # baseline logistic regression. 
    # penalty = 'l2', ridge. , solver = 'liblinear'
    
    # Labels are the dependent variable, transform to array
    labels = np.array(df[target])

    # Feature list of names, and transform to array
    features = df.drop(target, axis = 1)
    feature_list = list(features.columns)
    features = np.array(features)

    # training and testing sets
    train_features, test_features, train_labels, test_labels = \
    train_test_split(features, labels, random_state = 42)

    # Instantiate model
    lgr = LogisticRegression()

    # Train the model on training data
    lgr.fit(train_features, train_labels)

    # Scores for training and testing
    y_predict_train = clf.predict(X_train)
    print("Train accuracy score:",accuracy_score(y_predict_train, y_train))

    y_predict_test = clf.predict(X_test)
    print("Test accuracy score",accuracy_score(y_predict_test, y_test))

    # Classification reports
    print("\n Training Classification Report:")
    print(classification_report(y_train, y_predict_train))

    print("\n Test Classification Report:")
    print(classification_report(y_test, y_predict_test))
    
def lassoreg_cv(df, target):
    # Logistic regression with lasso 'l1' penalty. Hyperparameter tuning
    
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
    
    param_grid = {'solver': ['liblinear', 'saga'],
              'C': [int(x) for x in np.logspace(0, 1, num = 10)]} 
    
    # Instantiate model and grid search
    lgr = LogisticRegression(penalty='l1')
    gm_cv = RandomizedSearchCV(lgr, param_grid, n_iter = 15, cv = 3)
    gm_cv.fit(X_train, y_train)

    # Scores for training and testing
    y_predict_train = gm_cv.predict(X_train)
    print("Train accuracy score:", accuracy_score(y_predict_train, y_train))

    y_predict_test = gm_cv.predict(X_test)
    print("Test accuracy score",accuracy_score(y_predict_test, y_test))

    # Classification reports
    print("\n Training Classification Report:")
    print(classification_report(y_train, y_predict_train))

    print("\n Test Classification Report:")
    print(classification_report(y_test, y_predict_test))
    
    # Best estimator
    print(gm_cv.best_estimator_)
    
    
def ridgereg_cv(df, target):
    # Logistic regression with ridge 'l2' penalty. Hyperparameter tuning.
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
    
    param_grid = {'solver': ['lbfgs', 'sag', 'saga'],
                'C': [int(x) for x in np.logspace(0, 1, num = 10)]}
    
    # Instantiate model and grid search
    lgr = LogisticRegression(penalty='l2')
    gm_cv = RandomizedSearchCV(lgr, param_grid, n_iter = 15, cv = 3)
    gm_cv.fit(X_train, y_train)

    # Scores for training and testing
    y_predict_train = gm_cv.predict(X_train)
    print("Train accuracy score:", accuracy_score(y_predict_train, y_train))

    y_predict_test = gm_cv.predict(X_test)
    print("Test accuracy score",accuracy_score(y_predict_test, y_test))

    # Classification reports
    print("\n Training Classification Report:")
    print(classification_report(y_train, y_predict_train))

    print("\n Test Classification Report:")
    print(classification_report(y_test, y_predict_test))
    
    # Best Model
    print(gm_cv.best_estimator_)


In [82]:
logit_general(train_final_data, 'PotentialFraud')



Train accuracy score: 0.9995070248952428
Test accuracy score 0.8883961566888396

 Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3657
           1       1.00      0.99      1.00       400

   micro avg       1.00      1.00      1.00      4057
   macro avg       1.00      1.00      1.00      4057
weighted avg       1.00      1.00      1.00      4057


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      1247
           1       0.31      0.36      0.33       106

   micro avg       0.89      0.89      0.89      1353
   macro avg       0.63      0.65      0.64      1353
weighted avg       0.90      0.89      0.89      1353





In [83]:
lassoreg_cv(train_final_data, 'PotentialFraud')



Train accuracy score: 0.9831000792183786
Test accuracy score 0.9088108441158349

 Training Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3451
           1       1.00      0.81      0.90       336

   micro avg       0.98      0.98      0.98      3787
   macro avg       0.99      0.91      0.94      3787
weighted avg       0.98      0.98      0.98      3787


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1453
           1       0.59      0.41      0.48       170

   micro avg       0.91      0.91      0.91      1623
   macro avg       0.76      0.69      0.72      1623
weighted avg       0.90      0.91      0.90      1623

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver

In [84]:
ridgereg_cv(train_final_data, 'PotentialFraud')





Train accuracy score: 0.976234486400845
Test accuracy score 0.9137399876771412

 Training Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3451
           1       0.98      0.75      0.85       336

   micro avg       0.98      0.98      0.98      3787
   macro avg       0.98      0.87      0.92      3787
weighted avg       0.98      0.98      0.97      3787


 Test Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1453
           1       0.63      0.42      0.50       170

   micro avg       0.91      0.91      0.91      1623
   macro avg       0.78      0.69      0.73      1623
weighted avg       0.90      0.91      0.91      1623

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver=