In [1]:
# NOTE(review): no other import is visible in this notebook, so this star
# import appears to be what supplies `pd`, `np`, `plt`, `re` and the
# sampling / model helpers used in later cells — confirm against function.py
# and consider replacing it with explicit imports.
from function import *
# Read the credit-card CSV (path is relative to the notebook) into `df`.
df = pd.read_csv("../data/creditcard.csv")

In [2]:
# Discard every row that contains at least one missing value.
df = df.dropna()

# Fine-tune the over-sampling ratio

In [14]:
# Over-sampling techniques handed to the ratio sweep.
overSam_methods = [
    smote_simple,
    smote_borderline,
    adasyn_method,
]
# Keep the returned plot objects so a later cell can save them to disk.
plots = plot_tune_sampling(df, methods=overSam_methods, numStrategies=10)

In [4]:
# File stems for the saved figures, paired positionally with `plots`.
# NOTE(review): assumes `plots` is ordered method-major (simple SMOTE,
# borderline SMOTE, ADASYN) with rf / xgb / logReg inside each method —
# confirm against plot_tune_sampling.
pic_names = ["smote_simple_rf", "smote_simple_xgb", "smote_simple_logReg",
             "smote_borderline_rf", "smote_borderline_xgb", "smote_borderline_logReg",
             "adasyn_method_rf", "adasyn_method_xgb", "adasyn_method_logReg"]

for plot, pic in zip(plots, pic_names):
    # Save through the figure that owns this plot. A bare `plot.figure`
    # expression has no effect, and `plt.savefig` writes pyplot's *current*
    # figure, which would put the same image under every file name.
    plot.figure.savefig( pic+".png" )

# Optimal combined method (under-sampling followed by over-sampling)

In [15]:
# Under-sampling techniques compared in the sweep.
underMethods = [
    random_under_sampling,
    neighbourhood_clear_rule,
    nearest_neighbours,
    KMeansUnderSample,
]

# Run the sweep and write the metrics to disk; the next cell reloads them
# from this file.
results = df_tune_sampling(df, methods=underMethods, numStrategies=12)
results.to_csv('underSampleMetrics.csv')

In [3]:
# Show the under-sampling metrics, best average precision first.
# NOTE: the neighbourhood-based samplers do not accept a ratio strategy, so the
# `ratio` values listed for them are not meaningful.
results = (
    pd.read_csv('underSampleMetrics.csv')
    .sort_values(by=['avg_prcs'], ascending=False)
    .iloc[:, 1:]                 # drop the first column (the index written by to_csv)
    .reset_index(drop=True)
)
results

Unnamed: 0,avg_prcs,method,model,ratio
0,0.827055,KMeansUnderSample,xgboost_model,0.003492
1,0.823305,neighbourhood_clear_rule,random_forest,0.45645
2,0.791656,KMeansUnderSample,random_forest,0.003492
3,0.756274,random_under_sampling,random_forest,0.003492
4,0.756274,neighbourhood_clear_rule,xgboost_model,0.003492
5,0.6921,random_under_sampling,xgboost_model,0.003492
6,0.519439,nearest_neighbours,random_forest,0.094083
7,0.447652,nearest_neighbours,xgboost_model,0.003492
8,0.393528,KMeansUnderSample,elasticNet,0.818817
9,0.022663,random_under_sampling,elasticNet,0.909408


In [4]:
# Ratio of the top-ranked KMeansUnderSample row in the under-sampling table.
BEST_UNDER_RATIO = 0.003492

# Build the train/test arrays, then under-sample only the training part.
X_train, X_test, y_train, y_test = getdataset(df)
X_train, y_train = KMeansUnderSample(X_train, y_train, strategy=BEST_UNDER_RATIO)

In [9]:
def tune_OverSampling( X_train, y_train, X_test, y_test, methods, numStrategies=6 ):
    '''
        Find, for every over-sampling method, the sampling ratio that yields
        the highest average-precision score.

        Candidate ratios are spaced evenly (``numStrategies`` points) between
        1.1 times the current minority/majority ratio and 1.0 (balanced
        classes). For each candidate the training data is resampled with the
        method, the model is fitted on the resampled data, and its predictions
        for ``X_test`` are scored with ``average_precision_score``.

        Parameters
        ----------
        X_train, y_train : training features and labels. ``y_train`` is wrapped
            in a ``pd.Series`` if it is not one already and must contain the
            labels 0 (majority class) and 1 (minority class).
        X_test, y_test : evaluation features and labels; ``X_test`` must expose
            ``.values``. NOTE(review): the ratio is selected on this split, so
            pass a validation split if the test set must stay untouched.
        methods : a callable, or a list/tuple of callables, each invoked as
            ``method(X, y, strategy=ratio)`` and returning ``(X_res, y_res)``.
        numStrategies : number of candidate ratios to try.

        Returns
        -------
        pd.DataFrame with the columns ``method``, ``model``, ``ratio`` and
        ``avg_prcs``: one row per method/model pair holding the first ratio
        that reached the best score. A pair for which every candidate ratio
        was skipped is left out (with a warning) instead of raising.
    '''
    import warnings

    # Accept a single callable as well as a list/tuple of them
    if not isinstance(methods, (list, tuple)): methods = [ methods ]

    if not isinstance(y_train, pd.Series): # a Series is needed for value_counts
        y_train = pd.Series( y_train )

    # Expansion factors
    class_counts = y_train.value_counts()
    Nmin = class_counts[1] # number of observations in the minority class
    Nmaj = class_counts[0] # number of observations in the majority class
    factor = np.linspace(1.1, Nmaj/Nmin, numStrategies) # factors to expand the minority class
    strategy = (Nmin/Nmaj)*factor

    # Empirically found limit: below this ratio adasyn_method raises ValueError
    adasyn_min_ratio = 0.0047133

    # iterate over methods / models and collect the average precision
    models = [xgboost_model]
    columns = ['method', 'model', 'ratio', 'avg_prcs']
    rows = []
    for method in methods:
        for model in models:
            ratios = []
            avg = []
            for ratio in strategy:
                if (method is adasyn_method) and (ratio < adasyn_min_ratio):
                    continue
                X_res, y_res = method( X_train, y_train, strategy= ratio )
                y_pred = model(X_res, y_res, X_test.values)
                avg.append( average_precision_score(y_test, y_pred) )
                ratios.append( ratio )
            # Take the names from the callables themselves rather than parsing
            # their repr; fall back to the repr for objects without __name__.
            method_name = getattr(method, '__name__', None) or str(method)
            model_name = getattr(model, '__name__', None) or str(model)
            if not avg:
                # Every candidate ratio was skipped, so there is nothing to rank.
                warnings.warn( "tune_OverSampling: no usable ratio for %s / %s; skipped"
                               % (method_name, model_name) )
                continue
            # Index of the first ratio that maximizes the assessment metric
            best = max( range(len(avg)), key= avg.__getitem__ )
            rows.append( {'method':method_name, 'model':model_name,
                          'ratio':ratios[best], 'avg_prcs':avg[best]} )
    return pd.DataFrame( rows, columns= columns )

In [10]:
# Over-sampling techniques to rank on the already under-sampled training data.
overMethods = [smote_simple, smote_borderline, adasyn_method]

results_2 = tune_OverSampling(
    X_train, y_train, X_test, y_test,
    methods=overMethods,
    numStrategies=12,
)
# Written to disk; the next cell reloads the metrics from this file.
results_2.to_csv('overSampleMetrics.csv')

In [11]:
# Show the over-sampling metrics, best average precision first.
results_2 = (
    pd.read_csv('overSampleMetrics.csv')
    .sort_values(by=['avg_prcs'], ascending=False)
    .iloc[:, 1:]                 # drop the first column (the index written by to_csv)
    .reset_index(drop=True)
)
results_2

Unnamed: 0,avg_prcs,method,model,ratio
0,0.797529,smote_borderline,xgboost_model,0.003841
1,0.6921,smote_simple,xgboost_model,0.003841
2,0.50219,adasyn_method,xgboost_model,0.094401


In [12]:
# Ratio of the top-ranked smote_borderline row in the over-sampling table.
BEST_OVER_RATIO = 0.003841

# NOTE(review): this rebinds X_train / y_train, so re-running the cell
# resamples data that has already been resampled — confirm that is intended.
X_train, y_train = smote_borderline(X_train, y_train, strategy=BEST_OVER_RATIO)