### Importing Library

In [1]:
import pandas as pd, sys
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns
import math
import sklearn
import numpy as np 
#import mlflow

from neustar import onetru
#from neustar import ipaas

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

[92mIntialized[92m


In [2]:
df = pd.read_csv("gs://nb-datasets/creditcard.csv")

In [3]:
df.shape #Dataset details

(284807, 31)

### Separation of input variables from target variable

In [4]:
feature_names = df.iloc[:, 1:30].columns
target = df.iloc[:1, 30:].columns

data_features = df[feature_names]
data_target = df[target]

In [5]:
from sklearn.model_selection import train_test_split
np.random.seed(123)
X_train, X_test, y_train, y_test = train_test_split(data_features, data_target, 
                                                    train_size = 0.70, test_size = 0.30, random_state = 1)

In [6]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, 

### Building the Regression Logistic model

In [7]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

### Training the model

In [8]:
lr.fit(X_train, y_train) #lx:Logistic_Regression Algo.

LogisticRegression()

### Model Saving

In [9]:
#Saving the model

import pickle
filename = 'lr_model.sav'
pickle.dump(lr, open(filename, 'wb'))

### Metric: Measurement

### Confusion Matrix - Model performance measures

In [10]:
def PrintStats(cmat, y_test, pred):
    tpos = cmat[0][0]
    fneg = cmat[1][1]
    fpos = cmat[0][1]
    tneg = cmat[1][0]

In [11]:
def RunModel(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train.values.ravel())
    pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, pred)
    return matrix, pred

### Classification Report - Model performance measures

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

In [13]:
cmat, pred = RunModel(lr, X_train, y_train, X_test, y_test)

In [14]:
ac=accuracy_score(y_test, pred)

In [15]:
ac

0.9991573329588147

In [16]:
print (classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85308
           1       0.83      0.59      0.69       135

    accuracy                           1.00     85443
   macro avg       0.92      0.79      0.84     85443
weighted avg       1.00      1.00      1.00     85443



In [17]:
(rmse, mae) = eval_metrics(y_test, pred)

In [18]:
### Model Registry

In [19]:
onetru.analytics.register_model('demo-7',lr,'scikit_learn',{},{"rmse":rmse,"mae":mae,"Accuracy_score":ac})

Registering model from sklearn


Successfully registered model 'demo-7'.
2022/11/16 10:38:41 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: demo-7, version 1


ModelInfo(artifact_path='demo-7', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.7.12', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.0.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/5e19ad28bf4944a5805543e3d933b813/demo-7', model_uuid='ef4354df251c40fab7b7b066a0dfe249', run_id='5e19ad28bf4944a5805543e3d933b813', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-11-16 10:38:38.069116', mlflow_version='1.28.1.dev0')


Created version '1' of model 'demo-7'.


'run created successfully'

In [20]:
# The function "len" counts the number of classes = 1 and saves it as an object "fraud_records"
fraud_records = len(df[df.Class == 1])

# Defines the index for fraud and non-fraud in the lines:
fraud_indices = df[df.Class == 1].index
not_fraud_indices = df[df.Class == 0].index

# Randomly collect equal samples of each type:
under_sample_indices = np.random.choice(not_fraud_indices, fraud_records, False)
df_undersampled = df.iloc[np.concatenate([fraud_indices, under_sample_indices]),:]
X_undersampled = df_undersampled.iloc[:,1:30]
Y_undersampled = df_undersampled.Class
X_undersampled_train, X_undersampled_test, Y_undersampled_train, Y_undersampled_test = train_test_split(X_undersampled, Y_undersampled, test_size = 0.30)

### Using the "new" classifier for balanced data

In [21]:
lr_undersampled = LogisticRegression()
cmat, pred = RunModel(lr_undersampled, X_undersampled_train, Y_undersampled_train, X_undersampled_test, Y_undersampled_test)
PrintStats(cmat, Y_undersampled_test, pred)

In [22]:
accuracy_score(Y_undersampled_test, pred)

0.9358108108108109

In [23]:
print (classification_report(Y_undersampled_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       143
           1       0.95      0.92      0.94       153

    accuracy                           0.94       296
   macro avg       0.94      0.94      0.94       296
weighted avg       0.94      0.94      0.94       296



In [24]:
lr_undersampled = LogisticRegression()
cmat, pred = RunModel(lr_undersampled, X_undersampled_train, Y_undersampled_train, X_test, y_test)
PrintStats(cmat, y_test, pred)

In [25]:
ac=accuracy_score(y_test, pred)

In [26]:
print (classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98     85308
           1       0.04      0.91      0.08       135

    accuracy                           0.97     85443
   macro avg       0.52      0.94      0.53     85443
weighted avg       1.00      0.97      0.98     85443



###  Parameter optimization || Retrieve logged metadata

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [1,2,3,4,5,6,7,8,9,10], 
              "penalty": ['l1','l2']} #Parameters
            
grid_search = GridSearchCV(lr, param_grid, scoring="precision") #score
grid_search.fit(y_test, pred)

lr = grid_search.best_estimator_ 
grid_search.best_params_, grid_search.best_score_

({'C': 1, 'penalty': 'l2'}, 0.9126964295629962)

### Prepare Hyperparameters

In [28]:
#new parameters (Better score)
grid = {
    'C': [1e-4, 1e-3, 1e-2],
    'solver': ['lbfgs'],
    'max_iter': [1e4, 1e5],
}

In [29]:
from sklearn import linear_model, model_selection

In [30]:
#Train a model

model = linear_model.LogisticRegression(multi_class='auto')
grid_search = model_selection.GridSearchCV(model, grid,
                                           cv=5, return_train_score=False)
grid_search.fit(y_test, pred)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.0001, 0.001, 0.01],
                         'max_iter': [10000.0, 100000.0], 'solver': ['lbfgs']})

In [31]:
lr = grid_search.best_estimator_ 
grid_search.best_params_, grid_search.best_score_

({'C': 0.0001, 'max_iter': 10000.0, 'solver': 'lbfgs'}, 0.965567688920069)

### Retraining the model (V 2.0)

In [32]:
lr_undersampled = LogisticRegression(C=1, penalty='l2')
cmat, pred = RunModel(lr_undersampled, X_undersampled_train, Y_undersampled_train, X_undersampled_test, Y_undersampled_test)
PrintStats(cmat, Y_undersampled_test, pred)

In [33]:
ac=accuracy_score(Y_undersampled_test, pred)

In [34]:
print (classification_report(Y_undersampled_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       143
           1       0.95      0.92      0.94       153

    accuracy                           0.94       296
   macro avg       0.94      0.94      0.94       296
weighted avg       0.94      0.94      0.94       296



In [35]:
onetru.analytics.register_model('demo-7',lr,'scikit_learn',grid,{"Accuracy_score":ac})

<Experiment: artifact_location='mlflow-artifacts:/7067', experiment_id='7067', lifecycle_stage='active', name='exp_demo-7', tags={'projectId': '91', 'projectName': 'sync-job-test', 'workspaceId': '6'}>
Registering model from sklearn


Registered model 'demo-7' already exists. Creating a new version of this model...
2022/11/16 10:40:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: demo-7, version 2


ModelInfo(artifact_path='demo-7', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.7.12', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.0.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/02f1eb24557d484ba14a8944bc9a045c/demo-7', model_uuid='9d0f243b15c44af4953346936f6d8351', run_id='02f1eb24557d484ba14a8944bc9a045c', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-11-16 10:40:06.193635', mlflow_version='1.28.1.dev0')


Created version '2' of model 'demo-7'.


'run created successfully'

### Application of the Model to the original data test

### Retraining the model (V 2.1)

In [36]:
lr = LogisticRegression(C=1, penalty='l2')
cmat, pred = RunModel(lr, X_undersampled_train, Y_undersampled_train, X_test, y_test)
PrintStats(cmat, y_test, pred)

In [37]:
# skplt.metrics.plot_confusion_matrix(y_test, pred)

In [38]:
ac=accuracy_score(y_test, pred)

In [39]:
print (classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98     85308
           1       0.04      0.91      0.08       135

    accuracy                           0.97     85443
   macro avg       0.52      0.94      0.53     85443
weighted avg       1.00      0.97      0.98     85443



In [40]:
onetru.analytics.register_model('demo-7',lr,'scikit_learn',grid,{"Accuracy_score":ac})

<Experiment: artifact_location='mlflow-artifacts:/7067', experiment_id='7067', lifecycle_stage='active', name='exp_demo-7', tags={'projectId': '91', 'projectName': 'sync-job-test', 'workspaceId': '6'}>
Registering model from sklearn


Registered model 'demo-7' already exists. Creating a new version of this model...
2022/11/16 10:40:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: demo-7, version 3


ModelInfo(artifact_path='demo-7', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.7.12', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.0.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/1fbac613065745d2a8b0ce3728e1be09/demo-7', model_uuid='8744e8f012e34816a216667996539f0e', run_id='1fbac613065745d2a8b0ce3728e1be09', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-11-16 10:40:21.426518', mlflow_version='1.28.1.dev0')


Created version '3' of model 'demo-7'.


'run created successfully'

### Metric : Measurement

In [None]:
from sklearn import metrics                           

In [None]:
clf = LogisticRegression(C=1, penalty='l2')
clf.fit(X_undersampled_train, Y_undersampled_train)
y_pred = clf.predict(X_test)

y_pred_probability = clf.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_probability)
auc = metrics.roc_auc_score(y_test, pred)
plt.plot(fpr,tpr,label="LogisticRegression, auc="+str(auc))
plt.legend(loc=4)
plt.show()