# Pipeline

In [1]:
from sklearn.linear_model       import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection    import train_test_split
from sklearn.model_selection    import cross_validate
from sklearn.model_selection    import cross_val_score
from sklearn.model_selection    import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling     import SMOTENC
from sklearn.pipeline           import Pipeline
from sklearn.preprocessing      import MinMaxScaler
from imblearn.pipeline          import Pipeline
import pandas                   as pd

from sklearn.pipeline           import make_pipeline
from SecurityDataAnalysis       import Crowdfunding
from SecurityDataAnalysis       import Fraud

In [2]:
cf = Crowdfunding()
df_X, y = cf.x_df, cf.y
display(df_X.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._x_df.rename(columns={'보상수': '#ofRewards'}, inplace=True)


Unnamed: 0,Backers,CountryCode,VideoCount,ImageCount,TagCode,Goal,Period,SNS,Fiends,#ofCreation,#ofRewards
0,0,1,0,0,13,4800.0,19,1,26,1,1
1,4,1,0,0,13,10000.0,30,1,2873,11,9
2,0,6,0,0,1,5361.53,30,1,583,1,2
3,1,9,0,3,11,34513.7,30,1,4675,1,4
4,1510,1,0,39,5,5000.0,59,0,0,1,8


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df_X, y, random_state=0, stratify=y)

pipe = Pipeline([("scaler", MinMaxScaler()), ("lr", LogisticRegression())])
pipe.fit(X_train, y_train)
print("Test Score: {:.2f}".format(pipe.score(X_test,y_test)))

Test Score: 0.76


## cross_val_score w/ Pipeline

In [4]:
pipe = Pipeline([("scaler", MinMaxScaler()), ("lr", LogisticRegression())])

print("Default Cross-validation score:", cross_val_score(pipe, X_train, y_train, cv=5))
print("Accuracy score:", cross_val_score(pipe, X_train, y_train, scoring="accuracy", cv=5))
print("Roc_Auc score:", cross_val_score(pipe, X_train, y_train, scoring="roc_auc", cv=5))
print("Precision score:", cross_val_score(pipe, X_train, y_train, scoring="precision", cv=5))
print("Recall score:", cross_val_score(pipe, X_train, y_train, scoring="recall", cv=5))
print("f1 score:", cross_val_score(pipe, X_train, y_train, scoring="f1", cv=5))

Default Cross-validation score: [0.74782609 0.73043478 0.73913043 0.70434783 0.74782609]
Accuracy score: [0.74782609 0.73043478 0.73913043 0.70434783 0.74782609]
Roc_Auc score: [0.75903614 0.78840361 0.82680723 0.69051205 0.81325301]
Precision score: [1.         0.57142857 0.66666667 0.45       0.8       ]
Recall score: [0.09375 0.125   0.125   0.28125 0.125  ]
f1 score: [0.17142857 0.20512821 0.21052632 0.34615385 0.21621622]


## cross_validate w/ Pipeline

In [5]:
pipe = Pipeline([("scaler", MinMaxScaler()), ("lr", LogisticRegression())])
res = cross_validate(pipe, X_train, y_train, scoring=['accuracy', 'roc_auc', 'recall_macro', "recall_weighted", 
                                           "precision_macro", "precision_weighted","precision", "recall", "f1"])
pd.DataFrame(res)

Unnamed: 0,fit_time,score_time,test_accuracy,test_roc_auc,test_recall_macro,test_recall_weighted,test_precision_macro,test_precision_weighted,test_precision,test_recall,test_f1
0,0.005047,0.011034,0.747826,0.759036,0.546875,0.747826,0.870536,0.813121,1.0,0.09375,0.171429
1,0.006238,0.007131,0.730435,0.788404,0.544428,0.730435,0.656085,0.693628,0.571429,0.125,0.205128
2,0.005911,0.00802,0.73913,0.826807,0.550452,0.73913,0.704893,0.721845,0.666667,0.125,0.210526
3,0.005023,0.008008,0.704348,0.690512,0.57436,0.704348,0.603947,0.67222,0.45,0.28125,0.346154
4,0.004983,0.006986,0.747826,0.813253,0.556476,0.747826,0.772727,0.760632,0.8,0.125,0.216216


## GridSearch w/ Pipeline

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_X, y, random_state=0)
pipe = Pipeline([("scaler", MinMaxScaler()), ("lr", LogisticRegression())])

# when using pipeline, model name + two underbars + parameter name should be used instead of just paramter name
param_grid = {'lr__C': [0.01, 1, 10, 100],
              'lr__max_iter': [500, 1000, 2000]}
# param_grid = {'C': [0.01, 1, 10, 100],            << 이러지 말라는 뜻! 오류난다.
#               'max_iter': [500, 1000, 2000]}

grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring="f1", cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print(f"Best Score: {grid_search.best_score_:.3f}")
print(f"Best Test Score: {grid_search.score(X_test, y_test):.3f}")

Best parameters: {'lr__C': 100, 'lr__max_iter': 500}
Best Score: 0.580
Best Test Score: 0.430


## make_pipeline

In [7]:
pipe_with_name = Pipeline([("scaler", MinMaxScaler()), ("lr", LogisticRegression())])
pipe_short = make_pipeline(MinMaxScaler(), LogisticRegression())

print("Pipeline Steps:\n", pipe_short.steps)

Pipeline Steps:
 [('minmaxscaler', MinMaxScaler()), ('logisticregression', LogisticRegression())]


## SMOTE

In [8]:
fraud = Fraud()
fraud.head()

Unnamed: 0,type,amount,oldbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,2,9839.64,170136.0,0.0,0.0,0
1,2,1864.28,21249.0,0.0,0.0,0
2,4,181.0,181.0,0.0,0.0,1
3,1,181.0,181.0,21182.0,0.0,1
4,2,11668.14,41554.0,0.0,0.0,0


In [9]:
fraud.make_small()
fraud.prepare_set()
df_X = fraud.df_X
df_y = fraud.df_y

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, random_state=0, stratify=df_y)
print(y_train.value_counts())
print(6160/4765805)
print(y_test.value_counts())

0    476580
1       616
Name: isFraud, dtype: int64
0.001292541344012187
0    158861
1       205
Name: isFraud, dtype: int64


## SMOTE-NC

In [11]:
sm = SMOTENC(categorical_features=[0], random_state = 0, sampling_strategy=0.3, k_neighbors=3)
X_sm_train, y_sm_train = sm.fit_resample(X_train, y_train)
y_sm_train.value_counts()

0    476580
1    142974
Name: isFraud, dtype: int64

In [12]:
tree = DecisionTreeClassifier(random_state=0, max_depth=1)
tree.fit(X_sm_train, y_sm_train)
print("Train Score:",tree.score(X_sm_train, y_sm_train))
print("Test Score:",tree.score(X_test, y_test))

Train Score: 0.8418297678652709
Test Score: 0.9543585681415262


## SMOTENC with Pipeline

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, random_state=0)

pipe = Pipeline([("smotenc", SMOTENC(categorical_features=[0])), ("tree", DecisionTreeClassifier(max_depth=1))])
pipe.fit(X_train, y_train)

print("Test Score: {:.2f}".format(pipe.score(X_test,y_test)))

Test Score: 0.65


## SMOTENC with Pipeline using imblearn.pipeline

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, random_state=0)

smt = SMOTENC(categorical_features=[0])
lr = LogisticRegression(max_iter=1000)

pipeline = Pipeline(steps=[('smote', smt), ('lr', lr)])

# Preprocessing of training data, fit model 
pipeline.fit(X_train, y_train)


print("Test Score: {:.2f}".format(pipeline.score(X_test,y_test)))

Test Score: 0.43


In [15]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, random_state=0)

pipeline = Pipeline(steps=[('scaler', MinMaxScaler()), 
                           ('smotenc', SMOTENC(categorical_features=[0])), 
                           ('lr', LogisticRegression())])

# Preprocessing of training data, fit model 
pipeline.fit(X_train, y_train)

print("Test Score: {:.2f}".format(pipeline.score(X_test,y_test)))

Test Score: 0.92


In [16]:
pipeline = Pipeline(steps=[('smotenc', SMOTENC(categorical_features=[0])),
                           ('scaler', MinMaxScaler()),
                           ('lr', LogisticRegression())])

# Preprocessing of training data, fit model 
pipeline.fit(X_train, y_train)

print("Test Score: {:.2f}".format(pipeline.score(X_test,y_test)))

Test Score: 0.92


### scaling first or oversampling first?

## SMOTE with Cross_validate Using imblean.pipeline

In [17]:
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()),
                           ('smotenc', SMOTENC(categorical_features=[0])),
                           ('lr', LogisticRegression())])

res = cross_validate(pipeline, df_X, df_y, scoring=['accuracy', 'roc_auc', 'recall_macro', "recall_weighted", 
                                           "precision_macro", "precision_weighted","precision", "recall", "f1"])
pd.DataFrame(res)

Unnamed: 0,fit_time,score_time,test_accuracy,test_roc_auc,test_recall_macro,test_recall_weighted,test_precision_macro,test_precision_weighted,test_precision,test_recall,test_f1
0,7.784979,0.193427,0.918532,0.817239,0.803926,0.918532,0.505199,0.998289,0.010835,0.689024,0.021335
1,7.849821,0.206517,0.921023,0.834764,0.830327,0.921023,0.505839,0.998352,0.012045,0.739394,0.023703
2,7.876716,0.18732,0.920583,0.831739,0.820177,0.920583,0.5056,0.998334,0.011594,0.719512,0.02282
3,7.435799,0.208269,0.919915,0.769256,0.783305,0.919915,0.504928,0.99823,0.010353,0.646341,0.020379
4,7.77302,0.190136,0.919404,0.819129,0.804363,0.919404,0.505258,0.99829,0.010952,0.689024,0.021561


## SMOTE with GridSearch Using imblean.pipeline

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, random_state=0)
pipeline = Pipeline(steps=[('smotenc', SMOTENC(categorical_features=[0])),
                           ('scaler', MinMaxScaler()),
                           ('lr', LogisticRegression())])


# when using pipeline, model name + two underbars + parameter name should be used instead of just paramter name
param_grid = {'lr__C': [0.01, 1, 10, 100],
              'lr__max_iter': [500, 1000, 2000]}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring="f1", cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best Score: {:.3f}".format(grid_search.best_score_))
print("Best Test Score: {:.3f}".format(grid_search.score(X_test, y_test)))
print("Test Set Average Presicision Score: {:.3f}".format(
      average_precision_score(y_test, grid_search.predict_proba(X_test)[:,1])))  
print("Test Set Average Score: {:.3f}".format(
      accuracy_score(y_test, grid_search.predict(X_test))))

Best parameters: {'lr__C': 10, 'lr__max_iter': 2000}
Best Score: 0.023
Best Test Score: 0.021
Test Set Average Presicision Score: 0.025
Test Set Average Score: 0.920


In [19]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, random_state=0)
pipeline = Pipeline(steps=[('scaler', MinMaxScaler()),
                           ('smotenc', SMOTENC(categorical_features=[0])),
                           ('lr', LogisticRegression())])


# when using pipeline, model name + two underbars + parameter name should be used instead of just paramter name
param_grid = {'lr__C': [0.01, 1, 10, 100],
              'lr__max_iter': [500, 1000, 2000]}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring="f1", cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best Score: {:.3f}".format(grid_search.best_score_))
print("Best Test Score: {:.3f}".format(grid_search.score(X_test, y_test)))
print("Test Set Average Presicision Score: {:.3f}".format(
      average_precision_score(y_test, grid_search.predict_proba(X_test)[:,1])))  
print("Test Set Average Score: {:.3f}".format(
      accuracy_score(y_test, grid_search.predict(X_test))))

Best parameters: {'lr__C': 10, 'lr__max_iter': 1000}
Best Score: 0.023
Best Test Score: 0.021
Test Set Average Presicision Score: 0.026
Test Set Average Score: 0.921
